|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
代码里缺少一个 getHtml 方法,可以用下面这个实现:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its content decoded as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the character encoding of the target page. When null or empty, the encoding
/// is detected from the page's own &lt;meta ... charset=...&gt; declaration.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, the page is decoded as UTF-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" when the download or the
/// decoding fails for any reason.
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the bytes are downloaded.
        using (WebClient client = new WebClient())
        {
            // Some pages cannot be downloaded as-is (they may need a cookie, a different
            // user agent, ...). Handle those case by case, for example:
            //   client.Headers.Add("Cookie", cookie);
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Network credentials used to authenticate the request. If the server requires
            // a user name and password, assign a NetworkCredential(user, password) instead.
            client.Credentials = CredentialCache.DefaultCredentials;

            // Download the raw bytes so they can be decoded with the right encoding below.
            pageBytes = client.DownloadData(url);
        }

        // First pass: decode with the system default encoding. This is only used to look
        // for the charset declaration and as the fallback result.
        string html = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the encoding token itself (e.g. utf-8, gb2312), staying inside
            // the <meta ...> tag, so trailing attributes are never captured with it.
            Match charSetMatch = Regex.Match(
                html,
                "<meta[^<>]*charset\\s*=\\s*[\"']?\\s*([A-Za-z0-9_.:\\-]+)",
                RegexOptions.IgnoreCase);
            charSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : "";
        }

        // Tolerate quoted or padded charset names such as "\"utf-8\"" or " utf-8 ".
        charSet = charSet.Replace("\"", "").Trim();

        if (UseUTF8CharSet && charSet.Length == 0)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            // Throws for unknown encoding names; that is reported as "error" below.
            Encoding pageEncoding = Encoding.GetEncoding(charSet);

            // Re-decode only when the target encoding differs from the first pass.
            if (!pageEncoding.Equals(Encoding.Default))
            {
                html = pageEncoding.GetString(pageBytes);
            }
        }

        return html;
    }
    catch (Exception)
    {
        // Deliberate best-effort contract: any failure is reported to the caller through
        // the sentinel value "error" instead of an exception.
        return "error";
    }
}
复制代码
$ O( E0 G) s- Y, ^! j4 ~
8 Y+ a" q7 Z" D' h0 |2 v9 q( K |
|