|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
V" D- M7 n! Z- public string getHtml(string url, string charSet, bool UseUTF8CharSet)//url是要访问的网站地址,charSet是目标网页的编码,如果传入的是null或者"",那就自动分析网页的编码
5 I& H* @7 X$ @) Y( U4 b - {$ h: n$ v, k) L/ I) Z3 q
- string strWebData = "error";7 @/ L6 |% g d$ S4 g/ `+ W
- try4 C+ S) G/ n& J
- {+ C! ~* w& h* ]* j. S
- WebClient myWebClient = new WebClient(); //创建WebClient实例myWebClient 6 K# b4 b, F b
- // 需要注意的:
( g6 V' f3 g8 ?7 U+ \4 _ H" s - //有的网页可能下不下来,有种种原因比如需要cookie,编码问题等等 " v n/ r8 _& e' J. [
- //这是就要具体问题具体分析比如在头部加入cookie . F3 H8 ?) i3 ^
- // webclient.Headers.Add("Cookie", cookie);
0 ]' k6 K( ~4 k7 z$ ^# r" ], L# { - //这样可能需要一些重载方法。根据需要写就可以了
7 a2 K) t0 R, W& E, a' r5 a - myWebClient.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");
4 F# [: _$ H$ u9 u! H8 Y - //myWebClient.Headers.Add("User-agent", "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)");" U2 y w" _! [1 Q) W
- //获取或设置用于对向 Internet 资源的请求进行身份验证的网络凭据。 & V! j' z* ]: h
- myWebClient.Credentials = CredentialCache.DefaultCredentials;+ `" [; J0 ]5 ]" n- @
- //如果服务器要验证用户名,密码
' V: l) E0 T: X - //NetworkCredential mycred = new NetworkCredential(struser, strpassword);
7 k9 b2 ~1 S1 P - //myWebClient.Credentials = mycred;
+ [8 P, \+ v& k; o - //从资源下载数据并返回字节数组。(加@是因为网址中间有"/"符号) & Q4 a3 ^$ l5 P
- byte[] myDataBuffer = myWebClient.DownloadData(url);0 c4 p: \9 d0 c G. {
- strWebData = Encoding.Default.GetString(myDataBuffer);
( s" n. q. C5 w - & \2 Z+ I* T( s- d5 K2 V$ g3 y
- //获取网页字符编码描述信息
" k7 f9 S6 w4 q" B2 U* s9 ^! g - Match charSetMatch = Regex.Match(strWebData, "<meta([^<]*)charset=([^<]*)"", RegexOptions.IgnoreCase | RegexOptions.Multiline);+ C; p( N4 c" C, o- s8 t1 A
- string webCharSet = charSetMatch.Groups[2].Value;8 C% q: I; q, f9 n8 U
- if (charSet == null || charSet == "")0 s0 O- U9 G7 M" h0 l3 A9 ~
- charSet = webCharSet;' w5 m! z1 B' w2 W4 ?7 `: c, Q9 R
- if (charSet.Length > 0)4 q+ X& G8 p+ A6 T
- {
, }& m- l# N/ L - charSet = charSet.Replace(""", "");
- [7 U8 b4 Y( b( r9 D o - }/ [3 T8 k' D0 @; i7 r. z! T, U) a
- if (UseUTF8CharSet)
D1 S; ^& D* e/ s3 f5 p0 O - {
/ ^0 @6 T; g' G# v+ c2 s- e - if (charSet == null || charSet.Length == 0)
- t6 ?/ \. l3 y- ? - {# W2 _2 v- I; M$ S& m" ?' C
- charSet = "utf-8";
* f/ b+ u( C' U) P& Z - }
- c& N' H" Z6 L& P- L - }
7 R7 I3 I7 D) |' C7 p J - if (charSet != null && charSet != "" && Encoding.GetEncoding(charSet) != Encoding.Default)
+ z' t+ n) W" m2 K& c. W+ @ - strWebData = Encoding.GetEncoding(charSet).GetString(myDataBuffer);
6 l, o( H6 k) R1 h - . `7 y. N& s9 R, K B
- }
. @; `. ~- o! d6 q5 w" { - catch (Exception)
6 `2 u+ j: I) u4 ]: X - {, [* ]3 `4 M% v0 C
- strWebData = "error";( n1 b: Y6 z7 `$ K* s3 x0 G
- }7 p5 a- o2 q! G
- 2 w5 X3 y- h( p4 D! r
- return strWebData;7 j2 j1 b* T; C
- }
复制代码
|
|