|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its content decoded as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the page's character encoding. When null or empty, the charset declared
/// in the page's own &lt;meta&gt; tag is used instead.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, "utf-8" is used; when false the
/// text stays decoded with <see cref="Encoding.Default"/>.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" when the download or the
/// decoding fails for any reason (callers rely on this sentinel, so no exception escapes).
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    string strWebData = "error";
    try
    {
        byte[] myDataBuffer;

        // WebClient is IDisposable; release it as soon as the bytes are downloaded.
        using (WebClient myWebClient = new WebClient())
        {
            // Some pages cannot be fetched without extra request state (cookies, etc.).
            // Handle those case by case, e.g.:
            //   myWebClient.Headers.Add("Cookie", cookie);
            // which may call for additional overloads of this method.
            myWebClient.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Network credentials used to authenticate the request.
            // If the server requires a user name and password, assign
            // new NetworkCredential(user, password) here instead.
            myWebClient.Credentials = CredentialCache.DefaultCredentials;

            // Download the resource as raw bytes so it can be re-decoded below.
            myDataBuffer = myWebClient.DownloadData(url);
        }

        // First pass: decode with the default encoding. This is good enough to read
        // the ASCII-only <meta ... charset=...> declaration.
        strWebData = Encoding.Default.GetString(myDataBuffer);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the charset token itself so that attributes following it
            // inside the same tag (e.g. <meta charset="utf-8" name="x">) are not
            // swallowed into the encoding name.
            Match charSetMatch = Regex.Match(
                strWebData,
                "<meta[^<>]*?charset\\s*=\\s*[\"']?\\s*([A-Za-z0-9_.:\\-]+)",
                RegexOptions.IgnoreCase);
            charSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : "";
        }

        // Strip quotes and surrounding whitespace from a caller-supplied name.
        charSet = charSet.Replace("\"", "").Replace("'", "").Trim();

        if (charSet.Length == 0 && UseUTF8CharSet)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            // Resolve the encoding once; an unknown name throws and is reported as "error".
            Encoding pageEncoding = Encoding.GetEncoding(charSet);
            if (!pageEncoding.Equals(Encoding.Default))
            {
                strWebData = pageEncoding.GetString(myDataBuffer);
            }
        }
    }
    catch (Exception)
    {
        // Deliberate catch-all: the method's contract is to return "error" on any failure.
        strWebData = "error";
    }

    return strWebData;
}
复制代码
|
|