|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes the bytes into a string.
/// </summary>
/// <param name="url">Address of the page to fetch.</param>
/// <param name="charSet">
/// Encoding name of the target page. When null or empty, the encoding is detected
/// from the page's own <c>&lt;meta ... charset=...&gt;</c> declaration.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, the page is decoded as UTF-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string <c>"error"</c> when the download or
/// decoding fails for any reason (callers rely on this sentinel, so no exception escapes).
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    string strWebData = "error";
    try
    {
        byte[] myDataBuffer;

        // WebClient is IDisposable; release it as soon as the bytes are downloaded.
        using (WebClient myWebClient = new WebClient())
        {
            // Note: some pages cannot be downloaded as-is (they may need a cookie, have
            // encoding quirks, etc.). Handle those case by case, e.g. by adding a header:
            //     myWebClient.Headers.Add("Cookie", cookie);
            // which may call for additional overloads of this method.
            myWebClient.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Credentials used to authenticate the request. If the server requires a
            // user name and password, assign a NetworkCredential(user, password) instead.
            myWebClient.Credentials = CredentialCache.DefaultCredentials;

            // Download the resource as a raw byte array.
            myDataBuffer = myWebClient.DownloadData(url);
        }

        // First pass: decode with the system default encoding so the markup can be inspected.
        strWebData = Encoding.Default.GetString(myDataBuffer);

        if (string.IsNullOrEmpty(charSet))
        {
            // Detect the charset declared in a <meta> tag. Only the encoding token itself
            // is captured, so trailing attributes (e.g. charset="utf-8" name="x") cannot
            // leak into the encoding name.
            Match charSetMatch = Regex.Match(
                strWebData,
                "<meta[^<>]*charset\\s*=\\s*[\"']?([\\w.:-]+)",
                RegexOptions.IgnoreCase);
            charSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : "";
        }
        else
        {
            // Tolerate callers passing a quoted encoding name.
            charSet = charSet.Replace("\"", "").Trim();
        }

        if (UseUTF8CharSet && charSet.Length == 0)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            // An unknown encoding name makes GetEncoding throw, which yields "error" below.
            Encoding pageEncoding = Encoding.GetEncoding(charSet);

            // Re-decode only when the target encoding differs from the one already applied.
            if (!pageEncoding.Equals(Encoding.Default))
            {
                strWebData = pageEncoding.GetString(myDataBuffer);
            }
        }
    }
    catch (Exception)
    {
        // Deliberate best-effort contract: any failure is reported as the "error" sentinel.
        strWebData = "error";
    }

    return strWebData;
}
复制代码
|
|