|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its content decoded to text.
/// </summary>
/// <param name="url">Address of the page to fetch.</param>
/// <param name="charSet">
/// Name of the page's character encoding. When null or empty, the encoding is
/// detected from the <c>charset=</c> declaration of a <c>&lt;meta&gt;</c> tag in the page.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no character set was supplied or detected, the page is decoded as UTF-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string <c>"error"</c> if the download
/// (or any other step) fails. This method never throws.
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    string strWebData = "error";
    try
    {
        byte[] myDataBuffer;

        // WebClient is IDisposable; dispose it as soon as the bytes are downloaded.
        using (WebClient myWebClient = new WebClient())
        {
            // Some pages cannot be downloaded as-is (they may require cookies, etc.).
            // Handle those case by case, e.g. by adding request headers here:
            //   myWebClient.Headers.Add("Cookie", cookie);
            myWebClient.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Credentials used to authenticate the request. If the server requires a
            // user name and password, assign a NetworkCredential here instead.
            myWebClient.Credentials = CredentialCache.DefaultCredentials;

            myDataBuffer = myWebClient.DownloadData(url);
        }

        // First pass: decode with the system default encoding. This is the fallback
        // result and is also the text the <meta> charset declaration is searched in.
        strWebData = Encoding.Default.GetString(myDataBuffer);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the charset token itself (optionally quoted) inside a <meta ...> tag,
            // so attributes that follow the declaration are not swallowed into the name.
            Match charSetMatch = Regex.Match(
                strWebData,
                "<meta[^>]*charset\\s*=\\s*[\"']?([\\w.:-]+)",
                RegexOptions.IgnoreCase);
            charSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : "";
        }
        else
        {
            // Tolerate a caller-supplied name that still carries double quotes.
            charSet = charSet.Replace("\"", "");
        }

        if (UseUTF8CharSet && string.IsNullOrEmpty(charSet))
        {
            charSet = "utf-8";
        }

        if (!string.IsNullOrEmpty(charSet))
        {
            try
            {
                Encoding pageEncoding = Encoding.GetEncoding(charSet);

                // Re-decode only when the target encoding differs from the first pass.
                if (!pageEncoding.Equals(Encoding.Default))
                {
                    strWebData = pageEncoding.GetString(myDataBuffer);
                }
            }
            catch (ArgumentException)
            {
                // Unknown or unsupported charset name: keep the default-decoded text
                // instead of discarding a successfully downloaded page.
            }
        }
    }
    catch (Exception)
    {
        // Contract with callers: every failure is reported through the "error" sentinel.
        strWebData = "error";
    }

    return strWebData;
}
复制代码
; T: Q! ?6 C1 N9 O* t/ }( r( C+ P- Z6 o; I! h: D, n$ ^1 x) ?
|
|