|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns it decoded as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the page's character encoding. When null or empty, the encoding is
/// detected from the page's own <c>&lt;meta ... charset=...&gt;</c> declaration.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, the bytes are decoded as UTF-8;
/// when false, the text decoded with <see cref="Encoding.Default"/> is kept in that case.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" when anything fails
/// (download failure, unknown charset name, ...).
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the download is done.
        using (WebClient client = new WebClient())
        {
            // Some pages cannot be fetched without extra request headers (for example a
            // cookie: client.Headers.Add("Cookie", cookie)). Add an overload for such
            // cases if you need one.
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Credentials used to authenticate the request. If the server requires a
            // user name and password, assign a NetworkCredential here instead.
            client.Credentials = CredentialCache.DefaultCredentials;

            pageBytes = client.DownloadData(url);
        }

        // First pass: decode with the default encoding, which is enough to read the
        // ASCII-only <meta> charset declaration.
        string html = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            charSet = DetectCharSet(html);
        }

        // Tolerate charset names that arrive wrapped in double quotes or padded with spaces.
        charSet = charSet.Replace("\"", "").Trim();

        if (UseUTF8CharSet && charSet.Length == 0)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            // GetEncoding throws for an unknown name; the catch below turns that into "error".
            Encoding pageEncoding = Encoding.GetEncoding(charSet);
            if (!pageEncoding.Equals(Encoding.Default))
            {
                html = pageEncoding.GetString(pageBytes);
            }
        }

        return html;
    }
    catch (Exception)
    {
        // Deliberate best-effort: callers receive the "error" sentinel instead of an exception.
        return "error";
    }
}

/// <summary>
/// Extracts the charset name declared in the first <c>&lt;meta&gt;</c> tag that carries one.
/// </summary>
/// <param name="html">Page text to search.</param>
/// <returns>The declared charset name, or an empty string when none is found.</returns>
private static string DetectCharSet(string html)
{
    // Capture only the charset token itself so attributes that follow it inside the
    // same tag (e.g. <meta charset="utf-8" name="x">) are not swallowed.
    Match declaration = Regex.Match(
        html,
        "<meta[^<>]*?charset\\s*=\\s*[\"']?\\s*([A-Za-z0-9_.:-]+)",
        RegexOptions.IgnoreCase);

    return declaration.Success ? declaration.Groups[1].Value : "";
}
复制代码
; g8 A5 N0 r, L2 c6 Z! p# r2 G1 D3 z4 A3 W
|
|