|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
* z" h- r0 C( }1 `7 L5 q1 B! L
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes the bytes into a string.
/// The bytes are first decoded with <c>Encoding.Default</c> so that a charset declaration
/// inside a meta tag can be located, then re-decoded with the effective charset if one is known.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the encoding of the target page. When null or empty, the charset declared in the
/// page's meta tag (if any) is used instead.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, "utf-8" is used as the fallback.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" when the download or the decoding
/// fails for any reason (callers rely on this sentinel instead of an exception).
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the download is done.
        using (WebClient client = new WebClient())
        {
            // Some pages cannot be fetched without extra request state (cookies, a specific
            // user agent, ...). Add what the target site needs here, for example:
            //   client.Headers.Add("Cookie", cookie);
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Credentials used to authenticate the request. If the server requires an explicit
            // user name and password, assign a NetworkCredential instead.
            client.Credentials = CredentialCache.DefaultCredentials;

            pageBytes = client.DownloadData(url);
        }

        // Provisional decoding, only good enough to find the charset declaration.
        // NOTE(review): assumes the meta tag survives decoding with the default code page
        // (true for ASCII-compatible encodings) - confirm for UTF-16 pages.
        string pageText = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the charset token itself. The previous pattern matched greedily up to
            // the last quote in the tag, so any attribute following charset ended up in the name.
            Match declared = Regex.Match(
                pageText,
                "<meta[^<>]*?charset\\s*=\\s*[\"']?([^\"'\\s/<>;]+)",
                RegexOptions.IgnoreCase);
            charSet = declared.Success ? declared.Groups[1].Value : "";
        }

        // A caller-supplied name may still carry quotes or padding.
        charSet = charSet.Replace("\"", "").Trim();

        if (UseUTF8CharSet && charSet.Length == 0)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            // GetEncoding throws for unknown names; that is reported as "error" below.
            pageText = Encoding.GetEncoding(charSet).GetString(pageBytes);
        }

        return pageText;
    }
    catch (Exception)
    {
        // Contract of this helper: every failure is reported as the sentinel string.
        return "error";
    }
}
复制代码
+ }. q; y+ X6 U+ ]
|
|