|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes it into a string.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the page's character encoding (for example "utf-8" or "gb2312").
/// When null or empty, the name is detected from the page's
/// <c>&lt;meta ... charset=...&gt;</c> declaration.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, the page is decoded as UTF-8;
/// otherwise the text decoded with <see cref="Encoding.Default"/> is returned.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" if the download or decoding fails.
/// If the charset name is not supported on this platform, the text decoded with
/// <see cref="Encoding.Default"/> is returned instead of failing.
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the bytes are downloaded.
        using (WebClient client = new WebClient())
        {
            // Some pages cannot be downloaded without extra request headers
            // (cookies, a different User-agent such as a crawler one, ...).
            // Handle those case by case, e.g. client.Headers.Add("Cookie", cookie);
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Authenticate with the credentials of the current security context.
            // If the server requires an explicit user name and password, assign
            // new NetworkCredential(user, password) here instead.
            client.Credentials = CredentialCache.DefaultCredentials;

            pageBytes = client.DownloadData(url);
        }

        // First pass: decode with the system default encoding. The <meta> charset
        // declaration is plain ASCII, so it is readable regardless of the real encoding.
        string pageText = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the charset token itself (optionally quoted), so that other
            // attributes following it inside the <meta> tag are not swallowed.
            Match charSetMatch = Regex.Match(
                pageText,
                @"<meta[^>]*?charset\s*=\s*[""']?\s*([\w.:\-]+)",
                RegexOptions.IgnoreCase);
            charSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : "";
        }
        else
        {
            // Callers may pass the name still wrapped in quotes; strip them.
            charSet = charSet.Replace("\"", "").Trim();
        }

        if (UseUTF8CharSet && charSet.Length == 0)
        {
            charSet = "utf-8";
        }

        if (charSet.Length == 0)
        {
            return pageText;
        }

        Encoding targetEncoding;
        try
        {
            targetEncoding = Encoding.GetEncoding(charSet);
        }
        catch (ArgumentException)
        {
            // Unknown or unsupported charset name: keep the default-decoded text
            // rather than discarding a page that was downloaded successfully.
            return pageText;
        }

        // Re-decode only when the target differs from the encoding already used.
        return targetEncoding.Equals(Encoding.Default)
            ? pageText
            : targetEncoding.GetString(pageBytes);
    }
    catch (Exception)
    {
        // Callers rely on the "error" sentinel for any download/decoding failure.
        return "error";
    }
}
复制代码
) g- j. |4 I! X0 j, X1 V, ?4 s
! M: l! Q5 M8 E |
|