|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes it to text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the page's character encoding (for example <c>"utf-8"</c> or <c>"gb2312"</c>).
/// When <c>null</c> or empty, the encoding is sniffed from the page's
/// <c>&lt;meta ... charset=...&gt;</c> declaration.
/// </param>
/// <param name="UseUTF8CharSet">
/// When <c>true</c> and no encoding was supplied or detected, the page is decoded as UTF-8;
/// otherwise the text decoded with <see cref="Encoding.Default"/> is returned.
/// </param>
/// <returns>
/// The decoded page text, or the literal string <c>"error"</c> if the download or the
/// decoding fails for any reason (bad URL, network failure, unknown encoding name, ...).
/// </returns>
/// <remarks>
/// Some pages cannot be fetched without extra request state (for example a
/// <c>Cookie</c> header or explicit <see cref="NetworkCredential"/>s); add those to the
/// client before the download if a particular site requires them.
/// </remarks>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the bytes are in memory.
        using (WebClient client = new WebClient())
        {
            // Present a desktop-browser user agent; some servers reject unknown clients.
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Credentials used to authenticate the request against the server.
            client.Credentials = CredentialCache.DefaultCredentials;

            pageBytes = client.DownloadData(url);
        }

        // First pass: decode with the default encoding. The <meta> charset declaration is
        // plain ASCII, so it survives this step even if the real encoding is different.
        string html = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the charset token itself. A greedy "everything up to the last
            // quote" capture would also swallow attributes that follow the charset and
            // produce an invalid encoding name.
            Match charSetMatch = Regex.Match(
                html,
                "<meta[^>]*?charset\\s*=\\s*[\"']?\\s*([\\w\\-.:]+)",
                RegexOptions.IgnoreCase);
            charSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : "";
        }

        // Tolerate quoted or padded names such as "\"utf-8\"" passed in by the caller.
        charSet = charSet.Replace("\"", "").Replace("'", "").Trim();

        if (charSet.Length == 0 && UseUTF8CharSet)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            // Throws for unknown encoding names; handled by the catch below.
            Encoding pageEncoding = Encoding.GetEncoding(charSet);

            // Re-decode only when the target encoding differs from the first pass.
            if (!pageEncoding.Equals(Encoding.Default))
            {
                html = pageEncoding.GetString(pageBytes);
            }
        }

        return html;
    }
    catch (Exception)
    {
        // Deliberate best-effort contract: callers test for the "error" sentinel
        // instead of handling exceptions.
        return "error";
    }
}
复制代码
: ^* }; f) B# z |: c4 ^$ h2 h
) N2 }( D7 A# A |
|