|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
- public string getHtml(string url, string charSet, bool UseUTF8CharSet)//url是要访问的网站地址,charSet是目标网页的编码,如果传入的是null或者"",那就自动分析网页的编码
3 U7 \; l% h' E, R2 |, ~# I - {% @* k4 r0 J/ H f1 u8 S% @
- string strWebData = "error";
2 X }) q' b- z; V - try4 u2 p% b. p* D L, z
- {8 h! `5 z) x2 ~( L0 p' ?4 O8 e! }
- WebClient myWebClient = new WebClient(); //创建WebClient实例myWebClient 1 C y5 ?& |% i( l! j
- // 需要注意的:
/ ^- a* L- I$ M' y5 X6 Y - //有的网页可能下不下来,有种种原因比如需要cookie,编码问题等等
3 Q, c. b# m y( a9 s0 _) b- C - //这是就要具体问题具体分析比如在头部加入cookie # k2 A1 l- ~. Q8 I' J, K
- // webclient.Headers.Add("Cookie", cookie); " I% \' K0 V8 Q; H+ L
- //这样可能需要一些重载方法。根据需要写就可以了
( Q0 l" P, o* p5 q% U - myWebClient.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");
! `3 E& R1 T c; N9 M# i - //myWebClient.Headers.Add("User-agent", "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)");
9 N+ r" I2 u$ x) }0 t. f# b) b - //获取或设置用于对向 Internet 资源的请求进行身份验证的网络凭据。
+ D* W, Q5 G8 M) g; t v$ f - myWebClient.Credentials = CredentialCache.DefaultCredentials;
3 A4 o f. y Z8 ~6 t) S8 Z( ? x - //如果服务器要验证用户名,密码 ' P# P3 ?4 X5 e- G
- //NetworkCredential mycred = new NetworkCredential(struser, strpassword); 8 `2 n2 a& C; H/ }- k- Q9 Z
- //myWebClient.Credentials = mycred; ; J7 J7 l/ L+ ^8 d/ H
- //从资源下载数据并返回字节数组。(加@是因为网址中间有"/"符号) 4 f4 u6 ]1 z# F) {- g# |3 h
- byte[] myDataBuffer = myWebClient.DownloadData(url);
; J3 @9 P$ T) o* Z& o - strWebData = Encoding.Default.GetString(myDataBuffer);
$ G" K" E8 V- Z a! r, V - & d* z* H3 F8 x4 D- ]( T w+ \6 H
- //获取网页字符编码描述信息
$ n& z+ Z2 k5 f* L - Match charSetMatch = Regex.Match(strWebData, "<meta([^<]*)charset=([^<]*)"", RegexOptions.IgnoreCase | RegexOptions.Multiline);
0 T! _7 Q, ^4 t5 m; \ - string webCharSet = charSetMatch.Groups[2].Value;0 _# x0 J0 c6 H, v9 `5 `+ G
- if (charSet == null || charSet == "")3 K0 M' ~' l8 w6 K
- charSet = webCharSet;! O9 [' S; I9 g, P. U5 T
- if (charSet.Length > 0)
C2 f [ f' p& D - {
" \ ~/ J" s- x* \ - charSet = charSet.Replace(""", "");! T/ b% s. I4 t
- }8 W: T" [3 e3 k& ?7 ?
- if (UseUTF8CharSet)$ V7 k: k" \3 p
- {1 i) N K" a, [7 u, c
- if (charSet == null || charSet.Length == 0)7 I# v* O* r2 \; d) u7 o) E# d) j
- {6 S& I/ C4 b. V$ M* G
- charSet = "utf-8";
' }* D" @. Y' h, F9 | - }
. [4 y4 I% H: w - }
/ I/ g& T' W/ H - if (charSet != null && charSet != "" && Encoding.GetEncoding(charSet) != Encoding.Default)
7 {1 j a4 C6 g z0 p& T! O0 m - strWebData = Encoding.GetEncoding(charSet).GetString(myDataBuffer);
* o" j) W* g _* X+ \. e0 B
+ I: l* Q. D' v4 P' `- }
9 i( ?$ k8 ]# M - catch (Exception)& W, U5 l: S. `/ \
- {6 Z) c$ _; ^# @# m. q3 C; N8 \
- strWebData = "error";
# m6 |5 X% n* K. j) y - }/ c; B$ A! p- L/ T* h5 G8 b& }! \# Y
( g3 \4 k$ ~6 y6 L! }- return strWebData;- j- G6 J: ?/ | B' W3 O. Y
- }
复制代码
3 G! n8 u' o0 s Z2 N |
|