|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns it decoded as text.
/// The bytes are first decoded with <see cref="Encoding.Default"/>; if a different
/// charset is supplied or detected, the same bytes are decoded again with it.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the target page's encoding. When null or empty, the charset declared in the
/// page's <c>&lt;meta ... charset=...&gt;</c> tag (if any) is used instead.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, "utf-8" is used.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" if any exception is thrown
/// (download failure, unknown charset name, ...).
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    string strWebData = "error";
    try
    {
        byte[] myDataBuffer;

        // WebClient is IDisposable; release it as soon as the download completes.
        using (WebClient myWebClient = new WebClient())
        {
            // Note: some pages cannot be downloaded as-is (they may need a cookie,
            // have encoding problems, etc.). Handle those case by case, e.g. by
            // adding a header such as:
            //   myWebClient.Headers.Add("Cookie", cookie);
            // which may call for additional overloads of this method.
            myWebClient.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Network credentials used to authenticate the request.
            // If the server requires a user name and password, use instead:
            //   myWebClient.Credentials = new NetworkCredential(struser, strpassword);
            myWebClient.Credentials = CredentialCache.DefaultCredentials;

            // Download the resource as a raw byte array.
            myDataBuffer = myWebClient.DownloadData(url);
        }

        // First pass: decode with the system default encoding so the markup can be
        // searched for a charset declaration.
        strWebData = Encoding.Default.GetString(myDataBuffer);

        // Extract the charset declared in a <meta> tag. Groups[2].Value is an empty
        // string (never null) when the pattern does not match.
        Match charSetMatch = Regex.Match(strWebData, "<meta([^<]*)charset=([^<]*)\"", RegexOptions.IgnoreCase | RegexOptions.Multiline);
        string webCharSet = charSetMatch.Groups[2].Value;

        if (string.IsNullOrEmpty(charSet))
        {
            charSet = webCharSet;
        }

        // The captured value may still contain quote characters (e.g. from
        // <meta charset="utf-8">); strip them and surrounding whitespace before
        // using it as an encoding name.
        charSet = charSet.Replace("\"", "").Trim();

        if (UseUTF8CharSet && charSet.Length == 0)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            // Resolve the encoding once; GetEncoding throws for an unknown name,
            // which is reported as "error" by the catch block below.
            Encoding pageEncoding = Encoding.GetEncoding(charSet);
            if (pageEncoding != Encoding.Default)
            {
                strWebData = pageEncoding.GetString(myDataBuffer);
            }
        }
    }
    catch (Exception)
    {
        // Deliberate best-effort contract: any failure is reported to the caller
        // as the sentinel string "error" rather than as an exception.
        strWebData = "error";
    }

    return strWebData;
}
复制代码
|
|