|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns it decoded as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the page's character encoding. When null or empty, the encoding is
/// detected from the page's own <c>&lt;meta ... charset=...&gt;</c> declaration.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no encoding was supplied or detected, the page is decoded as UTF-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" when the download fails.
/// If the requested/detected encoding name is not recognised, the text decoded
/// with <see cref="Encoding.Default"/> is returned instead of failing.
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the bytes are downloaded.
        using (WebClient client = new WebClient())
        {
            // Some sites refuse requests without a browser-like User-Agent.
            // Pages that need cookies or explicit credentials require extra
            // headers / a NetworkCredential, which this method does not set.
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");
            client.Credentials = CredentialCache.DefaultCredentials;
            pageBytes = client.DownloadData(url);
        }

        // First pass: decode with the system default encoding. The markup is
        // ASCII-compatible in common encodings, which is enough to read the
        // charset declaration.
        string html = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the encoding token so that attributes following
            // charset (e.g. <meta charset="utf-8" name="x">) are not swallowed.
            Match declared = Regex.Match(
                html,
                "<meta[^>]*charset\\s*=\\s*[\"']?([\\w.:-]+)",
                RegexOptions.IgnoreCase);
            charSet = declared.Success ? declared.Groups[1].Value : "";
        }

        // Callers may pass a quoted name; strip quotes and surrounding whitespace.
        charSet = charSet.Replace("\"", "").Trim();

        if (charSet.Length == 0 && UseUTF8CharSet)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            try
            {
                Encoding pageEncoding = Encoding.GetEncoding(charSet);

                // Second pass only when it would differ from the first one.
                if (!pageEncoding.Equals(Encoding.Default))
                {
                    html = pageEncoding.GetString(pageBytes);
                }
            }
            catch (ArgumentException)
            {
                // Unknown encoding name: keep the default-decoded text rather
                // than discarding a page that was downloaded successfully.
            }
            catch (NotSupportedException)
            {
                // Encoding not available on this platform: same fallback.
            }
        }

        return html;
    }
    catch (Exception)
    {
        // Deliberate best-effort contract: any download failure is reported
        // to the caller through the "error" sentinel value.
        return "error";
    }
}
复制代码
. X: y: b( K8 h. x r# ` |
|