|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns it decoded as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the encoding to decode the page with. When null or empty, the
/// encoding is looked up in the page's own <c>&lt;meta ... charset=...&gt;</c> tag.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or found in the page, "utf-8" is used.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" when the download fails
/// or a caller-supplied <paramref name="charSet"/> is not a known encoding.
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    const string errorResult = "error";

    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the bytes are in memory.
        using (WebClient client = new WebClient())
        {
            // Some pages cannot be fetched without extra request state (cookies, etc.).
            // Handle those case by case, e.g.:
            //     client.Headers.Add("Cookie", cookie);
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Credentials used to authenticate the request. If the server needs an
            // explicit user name and password, assign a NetworkCredential instead.
            client.Credentials = CredentialCache.DefaultCredentials;

            pageBytes = client.DownloadData(url);
        }

        // First pass with the default encoding: good enough to read the ASCII-only
        // <meta> tag, and it is the final result when no other encoding applies.
        string html = Encoding.Default.GetString(pageBytes);

        bool callerSuppliedCharSet = !string.IsNullOrEmpty(charSet);
        if (callerSuppliedCharSet)
        {
            charSet = charSet.Replace("\"", "").Trim();
        }
        else
        {
            // Capture only the charset token itself. The match stops at the first
            // character that cannot be part of an encoding name, so any attributes
            // following charset in the same tag are not swallowed.
            Match charSetMatch = Regex.Match(
                html,
                "<meta[^>]*?charset\\s*=\\s*[\"']?\\s*([A-Za-z0-9_.:\\-]+)",
                RegexOptions.IgnoreCase);
            charSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : "";
        }

        if (UseUTF8CharSet && charSet.Length == 0)
        {
            charSet = "utf-8";
        }

        if (charSet.Length == 0)
        {
            return html;
        }

        Encoding pageEncoding;
        try
        {
            pageEncoding = Encoding.GetEncoding(charSet);
        }
        catch (ArgumentException)
        {
            // Unknown encoding name. A bad caller-supplied charset keeps reporting
            // "error"; a bad charset declared by the page itself should not throw
            // away a successful download, so fall back to the default decoding.
            return callerSuppliedCharSet ? errorResult : html;
        }

        // Re-decode only when the resolved encoding differs from the first pass.
        return pageEncoding.Equals(Encoding.Default) ? html : pageEncoding.GetString(pageBytes);
    }
    catch (Exception)
    {
        // Callers rely on the "error" sentinel for any failure (network, bad URL,
        // decoding), so failures are deliberately reported this way, not rethrown.
        return errorResult;
    }
}
复制代码
|
|