|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns it decoded as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the encoding used by the target page. When null or empty, the charset
/// declared in the page's &lt;meta&gt; tag is used instead.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or found in the page, UTF-8 is used.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" when the download or the
/// decoding fails for any reason (including an unknown charset name).
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; the using block releases it even if the download throws.
        using (WebClient client = new WebClient())
        {
            // Some pages cannot be fetched as-is (they need a cookie, extra headers,
            // explicit credentials, ...). Handle those case by case, e.g. with
            // client.Headers.Add("Cookie", cookie) or a NetworkCredential.
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Authenticate the request with the default credentials of the current context.
            client.Credentials = CredentialCache.DefaultCredentials;

            pageBytes = client.DownloadData(url);
        }

        // First pass: decode with the default encoding. This is enough to read the
        // ASCII-only <meta ... charset=...> declaration.
        string pageText = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the charset token itself, so additional attributes after
            // it in the same tag do not end up in the encoding name.
            Match charSetMatch = Regex.Match(
                pageText,
                "<meta[^<>]*?charset\\s*=\\s*[\"']?\\s*([\\w.:-]+)",
                RegexOptions.IgnoreCase);
            charSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : "";
        }

        // Tolerate caller-supplied names that still carry quotes or padding.
        charSet = charSet.Replace("\"", "").Trim();

        if (charSet.Length == 0 && UseUTF8CharSet)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            // Throws for an unknown name; the catch below maps that to "error".
            Encoding pageEncoding = Encoding.GetEncoding(charSet);

            // Re-decode only when the target encoding differs from the first pass.
            if (!pageEncoding.Equals(Encoding.Default))
            {
                pageText = pageEncoding.GetString(pageBytes);
            }
        }

        return pageText;
    }
    catch (Exception)
    {
        // Deliberate best effort: callers rely on the "error" sentinel rather than exceptions.
        return "error";
    }
}
复制代码
; R3 I! J3 F E) n2 C) j
+ m! l" x5 m) W d( P |
|