|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its content decoded as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the encoding of the target page (for example "utf-8" or "gb2312").
/// When null or empty, the encoding is taken from the page's own
/// <c>&lt;meta ... charset=...&gt;</c> declaration, if one is present.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, the page is decoded as UTF-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" when the download fails
/// (existing callers rely on this sentinel, so no exception escapes this method).
/// If the charset name is not supported on this platform, the text decoded with
/// <see cref="Encoding.Default"/> is returned instead of "error".
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable: release it even when the download throws.
        using (WebClient client = new WebClient())
        {
            // Some pages cannot be fetched without extra request headers (cookies, a
            // browser-like User-Agent, ...). Add them here when needed, e.g.
            // client.Headers.Add("Cookie", cookie).
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Authenticate with the default credentials of the current security context.
            // Assign a NetworkCredential(user, password) instead if the server asks
            // for an explicit user name and password.
            client.Credentials = CredentialCache.DefaultCredentials;

            pageBytes = client.DownloadData(url);
        }

        // First pass with the system default encoding. This text is used to look for
        // the charset declaration and is also the fallback result.
        string html = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            charSet = DetectMetaCharSet(html);
        }
        else
        {
            // Callers sometimes pass the name still wrapped in quotes.
            charSet = charSet.Replace("\"", "").Trim();
        }

        if (UseUTF8CharSet && string.IsNullOrEmpty(charSet))
        {
            charSet = "utf-8";
        }

        // Second pass: decode again only when the requested encoding is known and
        // actually differs from the one already used.
        Encoding pageEncoding = ResolveEncoding(charSet);
        if (pageEncoding != null && !pageEncoding.Equals(Encoding.Default))
        {
            html = pageEncoding.GetString(pageBytes);
        }

        return html;
    }
    catch (Exception)
    {
        // Deliberate best-effort: any failure (bad URL, network error, ...) is
        // reported through the "error" sentinel that callers already check for.
        return "error";
    }
}

/// <summary>
/// Extracts the charset name declared in the first matching <c>&lt;meta&gt;</c> tag of
/// <paramref name="html"/>; returns an empty string when none is found.
/// </summary>
private static string DetectMetaCharSet(string html)
{
    // The match is confined to a single tag ([^>]*?) and captures only the charset
    // token, so quoted, single-quoted and unquoted declarations all work and text
    // following the tag can never leak into the name.
    Match declaration = Regex.Match(
        html,
        "<meta[^>]*?charset\\s*=\\s*[\"']?\\s*([^\\s\"'/>;]+)",
        RegexOptions.IgnoreCase);

    return declaration.Success ? declaration.Groups[1].Value : "";
}

/// <summary>
/// Returns the encoding registered under <paramref name="charSetName"/>, or null when
/// the name is null, empty, or not supported on this platform.
/// </summary>
private static Encoding ResolveEncoding(string charSetName)
{
    if (string.IsNullOrEmpty(charSetName))
    {
        return null;
    }

    try
    {
        return Encoding.GetEncoding(charSetName);
    }
    catch (ArgumentException)
    {
        // Unknown or unsupported charset name: let the caller keep the default decoding.
        return null;
    }
}
复制代码
/ Q' q5 K9 X( N {: u3 {. O8 A |
|