|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
* n' h C/ T* R: `4 j% Z% W, E8 j I
缺少一个gethtml,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes the bytes to text.
/// </summary>
/// <param name="url">Address of the page to fetch.</param>
/// <param name="charSet">
/// Name of the encoding used by the target page. When null or empty, the name is
/// taken from the charset declaration inside the page's own &lt;meta&gt; tag.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, the page is decoded as UTF-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" when the download or the
/// decoding fails for any reason. This method does not throw.
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the download is finished.
        using (WebClient client = new WebClient())
        {
            // Some pages cannot be fetched without extra request headers (for example
            // a cookie: client.Headers.Add("Cookie", cookie)). Add an overload if needed.
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Authenticate with the credentials of the current security context. If the
            // server requires a user name and password, assign a NetworkCredential instead.
            client.Credentials = CredentialCache.DefaultCredentials;

            pageBytes = client.DownloadData(url);
        }

        // First pass: decode with the system default encoding. The <meta> charset
        // declaration is plain ASCII, so it can be located even if the rest is garbled.
        string pageText = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the encoding token itself (optionally quoted), so that
            // attributes following the charset inside the same tag are not swallowed.
            Match charSetMatch = Regex.Match(
                pageText,
                @"<meta[^>]*?charset\s*=\s*[""']?\s*([\w.:-]+)",
                RegexOptions.IgnoreCase);
            charSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : "";
        }

        // A caller-supplied name may still carry quotes or padding.
        charSet = charSet.Replace("\"", "").Trim();

        if (UseUTF8CharSet && charSet.Length == 0)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            // Throws for an unknown encoding name; the catch below maps that to "error".
            Encoding pageEncoding = Encoding.GetEncoding(charSet);

            // Second pass only when the real encoding differs from the one already used.
            if (!pageEncoding.Equals(Encoding.Default))
            {
                pageText = pageEncoding.GetString(pageBytes);
            }
        }

        return pageText;
    }
    catch (Exception)
    {
        // Deliberate best-effort contract: callers check for the "error" sentinel
        // instead of handling exceptions.
        return "error";
    }
}
复制代码
: c5 x! F. v6 l ]( N$ S
$ m: F' F+ Z+ ]) {8 F( T+ @5 s |
|