|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,可以用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes it into a string.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the page's character encoding. When null or empty, the encoding is
/// detected from the page's <c>&lt;meta ... charset=...&gt;</c> declaration.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, the page is decoded as UTF-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" when the download
/// (or anything else inside the method) throws.
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the bytes are in memory.
        using (WebClient client = new WebClient())
        {
            // Some pages cannot be fetched without extra request state such as a cookie:
            //   client.Headers.Add("Cookie", cookie);
            // Add an overload that supplies it when a site requires it.
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Authenticate with the current user's default credentials. If the server
            // needs an explicit user name / password, assign a NetworkCredential instead.
            client.Credentials = CredentialCache.DefaultCredentials;

            pageBytes = client.DownloadData(url);
        }

        // First pass: decode with the system default encoding. This is only used to
        // locate the charset declaration and as the fallback result.
        string defaultDecoded = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the encoding name itself. Handles both
            //   <meta http-equiv="Content-Type" content="text/html; charset=gb2312">
            // and <meta charset="utf-8"> / <meta charset=utf-8>, and never runs on
            // into attributes that follow the charset value.
            Match charSetMatch = Regex.Match(
                defaultDecoded,
                "<meta[^>]*?charset\\s*=\\s*[\"']?\\s*([A-Za-z0-9_.:\\-]+)",
                RegexOptions.IgnoreCase);
            charSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : "";
        }
        else
        {
            // Callers may pass the name with quotes still attached; strip them.
            charSet = charSet.Replace("\"", "").Trim();
        }

        if (string.IsNullOrEmpty(charSet) && UseUTF8CharSet)
        {
            charSet = "utf-8";
        }

        if (string.IsNullOrEmpty(charSet))
        {
            return defaultDecoded;
        }

        try
        {
            Encoding pageEncoding = Encoding.GetEncoding(charSet);
            return pageEncoding.Equals(Encoding.Default)
                ? defaultDecoded
                : pageEncoding.GetString(pageBytes);
        }
        catch (ArgumentException)
        {
            // Unknown or unsupported encoding name: keep the page that was already
            // downloaded instead of discarding it.
            return defaultDecoded;
        }
    }
    catch (Exception)
    {
        // Existing contract: any failure is reported to the caller as "error".
        return "error";
    }
}
复制代码
|
|