|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its content decoded as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the character encoding of the target page. When null or empty, the encoding
/// is detected from the page's own <c>&lt;meta ... charset=...&gt;</c> declaration.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, the page is decoded as UTF-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string <c>"error"</c> when the download or
/// decoding fails (callers rely on this sentinel, so no exception escapes).
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the download is finished.
        using (WebClient client = new WebClient())
        {
            // Some pages cannot be fetched as-is (they may need cookies, special headers, ...).
            // Handle those case by case, e.g. client.Headers.Add("Cookie", cookie);
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Authenticate with the credentials of the current security context.
            // For servers that need a user name / password, assign a NetworkCredential instead.
            client.Credentials = CredentialCache.DefaultCredentials;

            pageBytes = client.DownloadData(url);
        }

        // First pass: decode with the system default encoding so the markup can be inspected.
        string pageText = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the charset token itself so that attributes following it
            // (e.g. <meta charset="utf-8" name="x">) do not leak into the encoding name.
            Match metaMatch = Regex.Match(
                pageText,
                @"<meta[^>]*?charset\s*=\s*[""']?\s*([\w.:-]+)",
                RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);
            charSet = metaMatch.Success ? metaMatch.Groups[1].Value : "";
        }

        // A caller-supplied name may still carry quotes or surrounding whitespace.
        charSet = charSet.Replace("\"", "").Trim();

        if (charSet.Length == 0 && UseUTF8CharSet)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            Encoding pageEncoding = null;
            try
            {
                pageEncoding = Encoding.GetEncoding(charSet);
            }
            catch (ArgumentException)
            {
                // Unknown charset name: keep the default-decoded text instead of
                // throwing away a page that was downloaded successfully.
            }
            catch (NotSupportedException)
            {
                // Encoding not available on this platform: same fallback as above.
            }

            // Re-decode only when the target encoding differs from the first pass.
            if (pageEncoding != null && !pageEncoding.Equals(Encoding.Default))
            {
                pageText = pageEncoding.GetString(pageBytes);
            }
        }

        return pageText;
    }
    catch (Exception)
    {
        // Deliberate best-effort contract: any failure is reported as "error".
        return "error";
    }
}
复制代码
|
|