|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个gethtml,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns it decoded as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the encoding used to decode the page. When null or empty, the name is
/// taken from the charset declared in the page's &lt;meta&gt; tag, if any.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, the page is decoded as UTF-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" when the download or the
/// decoding fails for any reason (including an unrecognized charset name).
/// </returns>
/// <remarks>
/// Some pages cannot be fetched without extra request state such as cookies; in that
/// case add the required header (for example a "Cookie" header) in an overload.
/// If the server requires a user name and password, assign a NetworkCredential to
/// the client's Credentials instead of the default credentials.
/// </remarks>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the bytes are downloaded.
        using (WebClient client = new WebClient())
        {
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Credentials used to authenticate the request against the remote resource.
            client.Credentials = CredentialCache.DefaultCredentials;

            pageBytes = client.DownloadData(url);
        }

        // First pass with the system default encoding; this is enough to read the
        // ASCII-only <meta ... charset=...> declaration.
        string pageText = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the charset token itself so that attributes following it
            // inside the same tag do not leak into the encoding name.
            Match declared = Regex.Match(
                pageText,
                @"<meta[^<]*charset\s*=\s*[""']?([A-Za-z0-9_\-]+)",
                RegexOptions.IgnoreCase);
            charSet = declared.Success ? declared.Groups[1].Value : "";
        }

        // A caller-supplied name may still carry quotes or padding.
        charSet = charSet.Replace("\"", "").Trim();

        if (charSet.Length == 0 && UseUTF8CharSet)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            // Throws ArgumentException for an unknown name; handled by the catch below.
            Encoding pageEncoding = Encoding.GetEncoding(charSet);

            // Re-decode only when the target encoding differs from the first pass.
            if (!pageEncoding.Equals(Encoding.Default))
            {
                pageText = pageEncoding.GetString(pageBytes);
            }
        }

        return pageText;
    }
    catch (Exception)
    {
        // Existing contract: callers receive the sentinel "error" on any failure.
        return "error";
    }
}
复制代码
# m1 H- T. Q% B9 `
0 N+ q! S% F: J2 _$ ? |
|