|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes it to a string.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the encoding used by the target page. When null or empty, the
/// charset declared in the page's meta tag (if any) is used instead.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, the page is decoded as UTF-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" if the download or
/// the decoding fails for any reason.
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] myDataBuffer;

        // WebClient is IDisposable; release it as soon as the bytes are downloaded.
        using (WebClient myWebClient = new WebClient())
        {
            // Some pages cannot be fetched as-is (they may need a cookie, for example).
            // Handle those case by case, e.g. by adding a "Cookie" header here.
            myWebClient.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Network credentials used to authenticate the request. If the server
            // requires a user name and password, assign a NetworkCredential instead.
            myWebClient.Credentials = CredentialCache.DefaultCredentials;

            // Download the resource as raw bytes so it can be decoded more than once.
            myDataBuffer = myWebClient.DownloadData(url);
        }

        // First pass: decode with the default encoding, which is enough to read
        // the ASCII charset declaration in the markup.
        string strWebData = Encoding.Default.GetString(myDataBuffer);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the charset token itself so that attributes following
            // it inside the same tag do not end up in the encoding name.
            Match charSetMatch = Regex.Match(
                strWebData,
                @"<meta[^<>]*?\bcharset\s*=\s*[""']?\s*([\w.:-]+)",
                RegexOptions.IgnoreCase);
            charSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : "";
        }

        // A caller-supplied name may still carry quotes or padding.
        charSet = charSet.Replace("\"", "").Trim();

        if (UseUTF8CharSet && charSet.Length == 0)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            // Throws for an unknown encoding name; the catch below maps that to "error".
            Encoding pageEncoding = Encoding.GetEncoding(charSet);

            // Second pass: re-decode only when the page encoding differs from the default.
            if (!pageEncoding.Equals(Encoding.Default))
            {
                strWebData = pageEncoding.GetString(myDataBuffer);
            }
        }

        return strWebData;
    }
    catch (Exception)
    {
        // Deliberate best-effort contract: callers receive the "error" sentinel
        // rather than an exception.
        return "error";
    }
}
复制代码
* a$ Y8 y+ P3 _% G# z; j
; Z$ |+ Y* F0 S0 R% B% x/ V1 n6 g# ? |
|