|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its content decoded as text.
/// </summary>
/// <param name="url">Address of the page to fetch.</param>
/// <param name="charSet">
/// Name of the page's character encoding. Pass null or "" to have the encoding
/// detected from the page's own <c>&lt;meta ... charset=...&gt;</c> declaration.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, the page is decoded as UTF-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" if the download or the
/// decoding failed for any reason (this method never throws).
/// </returns>
/// <remarks>
/// Some pages cannot be fetched this way (for example they require a cookie);
/// in that case add the needed header, e.g. <c>Headers.Add("Cookie", cookie)</c>,
/// in an overload. If the server requires a user name and password, assign a
/// <c>NetworkCredential</c> to <c>Credentials</c> instead of the default credentials.
/// </remarks>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the bytes are downloaded.
        using (WebClient client = new WebClient())
        {
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Credentials used to authenticate the request.
            client.Credentials = CredentialCache.DefaultCredentials;

            pageBytes = client.DownloadData(url);
        }

        // First pass: decode with the system default encoding. The markup that
        // declares the charset is ASCII, so it is readable even if the body is not.
        string pageText = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the charset token itself (optionally quoted), stopping at
            // quotes, whitespace, ';', '/', or the end of the tag so that trailing
            // attributes are not swallowed into the encoding name.
            Match declared = Regex.Match(
                pageText,
                "<meta[^<>]*charset\\s*=\\s*[\"']?([^\"'\\s;/<>]+)",
                RegexOptions.IgnoreCase);
            charSet = declared.Success ? declared.Groups[1].Value : "";
        }

        // Callers may pass a quoted name; strip the quotes before the lookup.
        charSet = charSet.Replace("\"", "").Trim();

        if (charSet.Length == 0 && UseUTF8CharSet)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            // Throws for an unknown encoding name; the catch below reports "error".
            Encoding pageEncoding = Encoding.GetEncoding(charSet);
            if (!pageEncoding.Equals(Encoding.Default))
            {
                // Second pass: re-decode the raw bytes with the page's real encoding.
                pageText = pageEncoding.GetString(pageBytes);
            }
        }

        return pageText;
    }
    catch (Exception)
    {
        // Existing contract: every failure is reported through the "error" sentinel.
        return "error";
    }
}
复制代码
0 }( L. W9 y% G8 q2 V" R; ~( [. Y; O" y9 f: m
|
|