|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
' w7 _' E3 j3 e: v9 Z) {- public string getHtml(string url, string charSet, bool UseUTF8CharSet)//url是要访问的网站地址,charSet是目标网页的编码,如果传入的是null或者"",那就自动分析网页的编码 4 C8 K, c/ J$ c
- {
& _* [. ^8 d9 @8 b, x - string strWebData = "error";3 g& \) U( K7 N. n, ~4 q2 |0 W
- try0 x+ n- M! X4 B6 P3 U, R' t
- {
7 d/ ^' v. ^1 t* W5 m2 Y - WebClient myWebClient = new WebClient(); //创建WebClient实例myWebClient " E# n7 w5 \9 `4 x/ u( v6 Y& i
- // 需要注意的:
8 H7 N* R* H9 n; C7 M. D4 f - //有的网页可能下不下来,有种种原因比如需要cookie,编码问题等等 3 `% y* o' F, k4 B1 m( F- k' k4 b
- //这是就要具体问题具体分析比如在头部加入cookie
5 B( j! ?3 t) g! x- c, l: M - // webclient.Headers.Add("Cookie", cookie);
; K9 f0 R/ T9 b% s* A6 }2 d - //这样可能需要一些重载方法。根据需要写就可以了
% K& x1 D7 f- R5 |& L8 q$ r - myWebClient.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");, ]4 E6 ?2 B: n0 Y& Y8 B
- //myWebClient.Headers.Add("User-agent", "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)");" d I' C6 j- L' I7 i9 l8 k
- //获取或设置用于对向 Internet 资源的请求进行身份验证的网络凭据。 ; p2 ]) Y/ K- d N
- myWebClient.Credentials = CredentialCache.DefaultCredentials;& o, S& _7 u- m1 N/ ?
- //如果服务器要验证用户名,密码 * T [! l8 [& z& c
- //NetworkCredential mycred = new NetworkCredential(struser, strpassword);
) ]+ N" s$ x( L w4 C) I- j. H - //myWebClient.Credentials = mycred; 3 f+ `: ^4 t2 h5 \6 b2 ]; y
- //从资源下载数据并返回字节数组。(加@是因为网址中间有"/"符号)
5 L) m) K) H, n" @; [ - byte[] myDataBuffer = myWebClient.DownloadData(url);
8 y% a" B! A% ^ [) _+ [ - strWebData = Encoding.Default.GetString(myDataBuffer);* J3 _" d0 e1 u, X' e1 u
- 7 }. H& D0 r) j0 h2 P- Y% A
- //获取网页字符编码描述信息 : T+ M9 |& z7 g3 F, L9 z7 `- \
- Match charSetMatch = Regex.Match(strWebData, "<meta([^<]*)charset=([^<]*)"", RegexOptions.IgnoreCase | RegexOptions.Multiline);
$ j g8 p% v+ F - string webCharSet = charSetMatch.Groups[2].Value; O+ Q( s/ V, j( y. P0 v2 V& L
- if (charSet == null || charSet == "")8 r7 X2 F4 ]2 p' ~2 ?; [6 |
- charSet = webCharSet;- {' J9 Z2 W9 y' p7 U _$ A c
- if (charSet.Length > 0)
% \* Y. G! ~+ T9 C7 V: R. o - {) l4 i+ ]6 P. A6 O; w
- charSet = charSet.Replace(""", "");. X- Z3 W6 C4 x! i" W
- }
V' l5 f* G! N$ R* t - if (UseUTF8CharSet)2 u( v ]% w" ~+ f' _
- {
: |+ `: X" H! k! s* Q- f; \5 u - if (charSet == null || charSet.Length == 0)6 F1 T- h) z/ g! g' W+ m
- {
8 P% F7 Y- y; |9 m3 ~! b; L - charSet = "utf-8";
: t' d2 ?0 Z5 N+ s* |: b - }
" d% Y+ K' ]+ ] - }
" b3 w8 i# z% P - if (charSet != null && charSet != "" && Encoding.GetEncoding(charSet) != Encoding.Default)# G5 ~, P g/ ]# N! f) I
- strWebData = Encoding.GetEncoding(charSet).GetString(myDataBuffer);. I6 z* c. f: Q/ ]/ Q
- , f" u& k# v) `9 a) q% R- M
- }! G8 E& X+ o6 }8 W$ |) `
- catch (Exception)! [+ D6 n: y) Z, ~" G
- {: \2 M. y, X' W3 h/ f
- strWebData = "error";# U9 N, r1 @( s3 ~9 U2 o2 H
- }. A6 V4 |1 e9 O
5 K: x$ Q1 j# _/ m, U k- return strWebData;
1 h7 b W/ S% D& g7 E5 R7 U - }
复制代码
# V n m9 c0 f M
( \$ u5 M6 E4 g; @8 ?0 d |
|