|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,可以用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its content decoded as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the page's character set (for example "gb2312"). Pass null or "" to have it
/// detected from the page's own &lt;meta ... charset=...&gt; declaration.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true, "utf-8" is used if no character set was supplied and none could be detected.
/// </param>
/// <returns>
/// The page text. The literal string "error" is returned when the download fails; callers
/// rely on this sentinel, so no exception is propagated.
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the bytes are in memory.
        using (WebClient webClient = new WebClient())
        {
            // Some pages cannot be fetched without extra request data (cookies, a specific
            // user agent, ...). Add what the target site needs here, for example
            // webClient.Headers.Add("Cookie", cookie), or assign a NetworkCredential to
            // webClient.Credentials when the server asks for a user name and password.
            webClient.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");
            webClient.Credentials = CredentialCache.DefaultCredentials;

            pageBytes = webClient.DownloadData(url);
        }

        // First pass: decode with the system default encoding. This is enough to read the
        // ASCII-only <meta> charset declaration even if the rest of the text is garbled.
        string pageText = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the charset token itself, so that attributes following it inside
            // the same <meta> tag do not end up in the encoding name.
            Match charSetMatch = Regex.Match(
                pageText,
                "<meta[^>]*charset\\s*=\\s*[\"']?([A-Za-z0-9_.:\\-]+)",
                RegexOptions.IgnoreCase);
            charSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : "";
        }

        // Tolerate quoted or padded names such as "\"utf-8\"".
        charSet = charSet.Replace("\"", "").Trim();

        if (UseUTF8CharSet && charSet.Length == 0)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            Encoding pageEncoding = null;
            try
            {
                pageEncoding = Encoding.GetEncoding(charSet);
            }
            catch (ArgumentException)
            {
                // Unknown encoding name: keep the default-decoded text instead of
                // throwing away a successful download.
            }
            catch (NotSupportedException)
            {
                // Encoding not available on this platform: same fallback as above.
            }

            // Second pass: decode again only when the real encoding differs from the default.
            if (pageEncoding != null && !pageEncoding.Equals(Encoding.Default))
            {
                pageText = pageEncoding.GetString(pageBytes);
            }
        }

        return pageText;
    }
    catch (Exception)
    {
        // Deliberate catch-all: the method's contract is to report any failure
        // (bad URL, network error, ...) through the "error" sentinel.
        return "error";
    }
}
复制代码
& @" K; ~5 w& v$ D/ U |
|