|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个gethtml,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its content decoded as text.
/// </summary>
/// <remarks>
/// The downloaded bytes are first decoded with <see cref="Encoding.Default"/>. If a charset
/// name is supplied, or can be found in a <c>&lt;meta ... charset=...&gt;</c> declaration of the
/// page, the bytes are decoded again with that encoding.
/// Some pages may still fail to download (for example when they require cookies); in that
/// case add the needed headers, e.g. <c>Headers.Add("Cookie", cookie)</c>, or assign a
/// <see cref="NetworkCredential"/> when the server asks for a user name and password.
/// </remarks>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the encoding of the target page. Pass <c>null</c> or <c>""</c> to have the
/// encoding detected from the page's meta charset declaration.
/// </param>
/// <param name="UseUTF8CharSet">
/// When <c>true</c> and no charset was supplied or detected, the page is decoded as UTF-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string <c>"error"</c> when anything fails
/// (download failure, unknown encoding name, ...).
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        // WebClient is IDisposable; the using block releases it even when the download throws.
        using (WebClient client = new WebClient())
        {
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Network credentials used to authenticate the request to the Internet resource.
            client.Credentials = CredentialCache.DefaultCredentials;

            // Download the resource as a raw byte array so it can be decoded more than once.
            byte[] pageBytes = client.DownloadData(url);

            // Provisional decode: used to search for the charset declaration, and returned
            // as-is when no other encoding applies.
            string html = Encoding.Default.GetString(pageBytes);

            string effectiveCharSet = charSet;
            if (string.IsNullOrEmpty(effectiveCharSet))
            {
                // Capture only the charset token itself (optionally quoted), so attributes
                // following it inside the same <meta> tag are not swallowed into the name.
                Match charSetMatch = Regex.Match(
                    html,
                    "<meta[^>]*?charset\\s*=\\s*[\"']?\\s*([\\w.:\\-]+)",
                    RegexOptions.IgnoreCase);
                effectiveCharSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : "";
            }

            // Strip stray double quotes (and surrounding whitespace) from the charset name.
            effectiveCharSet = effectiveCharSet.Replace("\"", "").Trim();

            if (UseUTF8CharSet && effectiveCharSet.Length == 0)
            {
                effectiveCharSet = "utf-8";
            }

            if (effectiveCharSet.Length == 0)
            {
                return html;
            }

            // Throws for an unknown encoding name; the catch below maps that to "error".
            Encoding targetEncoding = Encoding.GetEncoding(effectiveCharSet);

            // Skip the second decode when the target is the encoding already used above.
            return targetEncoding.Equals(Encoding.Default)
                ? html
                : targetEncoding.GetString(pageBytes);
        }
    }
    catch (Exception)
    {
        // Best-effort contract: callers receive the sentinel "error" instead of an exception.
        return "error";
    }
}
复制代码
, L7 \. f1 f3 y9 O3 I! e
3 D" Q+ w; D& M% f- w M& v |
|