|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns it decoded as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the encoding used by the target page (for example "utf-8" or "gb2312").
/// Pass null or "" to detect it from the page's own meta charset declaration.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, the page is decoded as UTF-8;
/// when false the system default encoding is used in that case.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" when the download fails.
/// If the charset name is not recognised, the text decoded with the system default
/// encoding is returned instead of discarding the downloaded page.
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    string strWebData = "error";
    try
    {
        byte[] myDataBuffer;

        // WebClient is IDisposable; release it as soon as the bytes are in memory.
        using (WebClient myWebClient = new WebClient())
        {
            // Some pages cannot be fetched without extra request headers (cookies,
            // for instance). Add them here when a particular site needs them, e.g.
            // myWebClient.Headers.Add("Cookie", cookie).
            myWebClient.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Authenticate with the credentials of the current security context.
            // Assign a NetworkCredential instead if the server wants a user name
            // and password.
            myWebClient.Credentials = CredentialCache.DefaultCredentials;

            myDataBuffer = myWebClient.DownloadData(url);
        }

        // First pass: decode with the system default encoding. This is enough to
        // read the ASCII meta tag and is the final result when no other charset
        // applies.
        strWebData = Encoding.Default.GetString(myDataBuffer);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the charset token itself, so that attributes following
            // the charset inside the same tag are not swallowed into the name.
            Match charSetMatch = Regex.Match(
                strWebData,
                @"<meta[^>]*?charset\s*=\s*[""']?\s*([\w.:\-]+)",
                RegexOptions.IgnoreCase);
            charSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : "";
        }

        // Tolerate quoted or padded names such as "\"utf-8\"".
        charSet = charSet.Replace("\"", "").Trim();

        if (UseUTF8CharSet && charSet.Length == 0)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            Encoding pageEncoding = null;
            try
            {
                pageEncoding = Encoding.GetEncoding(charSet);
            }
            catch (ArgumentException)
            {
                // Unknown or unsupported charset name: keep the default decoding
                // rather than throwing away a page that downloaded successfully.
            }

            // Re-decoding with the default encoding would only repeat the first pass.
            if (pageEncoding != null && !pageEncoding.Equals(Encoding.Default))
            {
                strWebData = pageEncoding.GetString(myDataBuffer);
            }
        }
    }
    catch (Exception)
    {
        // Contract of this helper: never throw, report any failure as "error".
        strWebData = "error";
    }

    return strWebData;
}
复制代码
|
|