|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
代码里缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns it decoded as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the encoding used to decode the page. When null or empty, the charset
/// declared in the page's &lt;meta&gt; tag (if any) is used instead.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, the page is decoded as UTF-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" if the download or the
/// decoding fails for any reason.
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    string strWebData = "error";
    try
    {
        byte[] myDataBuffer;

        // WebClient is IDisposable; release it as soon as the bytes are downloaded.
        using (WebClient myWebClient = new WebClient())
        {
            // Some pages cannot be fetched without extra request headers (for example
            // a cookie: myWebClient.Headers.Add("Cookie", cookie)); add an overload
            // for those cases if they are needed.
            myWebClient.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Authenticate with the default credentials. If the server requires a
            // user name and password, assign a NetworkCredential here instead.
            myWebClient.Credentials = CredentialCache.DefaultCredentials;

            myDataBuffer = myWebClient.DownloadData(url);
        }

        // First pass: decode with the system default encoding. This is good enough
        // to locate the ASCII charset declaration in the markup.
        strWebData = Encoding.Default.GetString(myDataBuffer);

        if (string.IsNullOrEmpty(charSet))
        {
            // No charset supplied by the caller: take the one declared in the <meta> tag.
            Match charSetMatch = Regex.Match(strWebData, "<meta([^<]*)charset=([^<]*)\"", RegexOptions.IgnoreCase | RegexOptions.Multiline);
            charSet = charSetMatch.Groups[2].Value; // empty string when nothing matched
        }

        // The captured value may still carry quote characters (e.g. charset="utf-8")
        // or surrounding whitespace; strip both so the name can be resolved.
        charSet = charSet.Replace("\"", "").Trim();

        if (charSet.Length == 0 && UseUTF8CharSet)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            // Resolve the encoding once; an unknown name throws and is reported as
            // "error" by the catch block below.
            Encoding pageEncoding = Encoding.GetEncoding(charSet);
            if (!pageEncoding.Equals(Encoding.Default))
            {
                // Second pass: re-decode the raw bytes with the page's real encoding.
                strWebData = pageEncoding.GetString(myDataBuffer);
            }
        }
    }
    catch (Exception)
    {
        // Deliberate best-effort contract: every failure (network, unknown charset,
        // null url, ...) is reported to the caller as the sentinel string "error".
        strWebData = "error";
    }

    return strWebData;
}
复制代码
* m7 K+ x% \. d$ x* R
) l, Q% P# g; f6 D r- U* B |
|