|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes it into a string.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the encoding used to decode the page. When null or empty, the name is
/// taken from the page's own <c>&lt;meta ... charset=...&gt;</c> declaration, if any.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, the page is decoded as UTF-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string <c>"error"</c> when the download or
/// the decoding fails for any reason (including an unknown encoding name).
/// </returns>
/// <remarks>
/// Some pages cannot be fetched this way (for example when they require a cookie).
/// Such cases need extra request headers, e.g. <c>Headers.Add("Cookie", cookie)</c>,
/// or explicit <c>NetworkCredential</c>s instead of the default credentials.
/// </remarks>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the bytes are downloaded.
        using (WebClient client = new WebClient())
        {
            // Present a browser user agent on the request.
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Authenticate with the credentials of the current security context.
            client.Credentials = CredentialCache.DefaultCredentials;

            pageBytes = client.DownloadData(url);
        }

        // First pass: decode with the system default encoding. This is the result when
        // no other charset applies, and it is good enough to locate the ASCII-only
        // charset declaration inside the markup.
        string pageText = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the encoding name itself, quoted or unquoted, so that
            // attributes following the declaration do not leak into the name.
            Match declaration = Regex.Match(
                pageText,
                "<meta[^>]*?charset\\s*=\\s*[\"']?([\\w.:\\-]+)",
                RegexOptions.IgnoreCase);
            charSet = declaration.Success ? declaration.Groups[1].Value : "";
        }
        else
        {
            // Tolerate callers passing a quoted name such as "\"gb2312\"".
            charSet = charSet.Replace("\"", "");
        }

        if (UseUTF8CharSet && string.IsNullOrEmpty(charSet))
        {
            charSet = "utf-8";
        }

        if (!string.IsNullOrEmpty(charSet))
        {
            // Throws for an unknown name; that is reported as "error" below.
            Encoding pageEncoding = Encoding.GetEncoding(charSet);

            // Re-decode only when the target differs from the first-pass encoding.
            if (!pageEncoding.Equals(Encoding.Default))
            {
                pageText = pageEncoding.GetString(pageBytes);
            }
        }

        return pageText;
    }
    catch (Exception)
    {
        // Deliberate best-effort contract: callers receive "error" instead of an exception.
        return "error";
    }
}
复制代码
2 j: ^ G. Z, a5 H0 z
$ H5 N- J( _6 I |
|