|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns it decoded as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the encoding used by the target page. When null or empty, the
/// charset is detected from the page's own <c>&lt;meta ... charset=...&gt;</c> tag.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, "utf-8" is used as the fallback.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" if the download or
/// the decoding failed for any reason.
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the bytes are in hand.
        using (WebClient client = new WebClient())
        {
            // Some pages cannot be fetched without extra request setup (cookies, etc.).
            // Handle those case by case, e.g. client.Headers.Add("Cookie", cookie);
            // add overloads of this method if such options are needed.
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Authenticate with the current user's default credentials. If the server
            // requires an explicit user name / password, assign a NetworkCredential here.
            client.Credentials = CredentialCache.DefaultCredentials;

            pageBytes = client.DownloadData(url);
        }

        // First pass: decode with the system default encoding. This is good enough
        // to read the ASCII-only meta tag and is the result when no charset applies.
        string html = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the charset token itself (optionally quoted) so that
            // attributes following it inside the same tag are not swallowed.
            Match charSetMatch = Regex.Match(
                html,
                "<meta[^<]*charset\\s*=\\s*[\"']?([A-Za-z0-9_.:-]+)",
                RegexOptions.IgnoreCase);
            charSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : "";
        }
        else
        {
            // Tolerate callers passing a quoted charset name.
            charSet = charSet.Replace("\"", "");
        }

        if (UseUTF8CharSet && charSet.Length == 0)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            // An unknown charset name makes GetEncoding throw; the catch below
            // turns that into the "error" result.
            Encoding pageEncoding = Encoding.GetEncoding(charSet);

            // Re-decode only when the page encoding differs from the first pass.
            if (!pageEncoding.Equals(Encoding.Default))
            {
                html = pageEncoding.GetString(pageBytes);
            }
        }

        return html;
    }
    catch (Exception)
    {
        // Deliberate best-effort contract: callers check for the "error" sentinel
        // instead of handling exceptions.
        return "error";
    }
}
复制代码
4 x" f: O* u; m5 m3 }( v7 T& l. }3 g0 q( s5 d0 |* \
|
|