|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,可以用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its content decoded as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the encoding to decode the page with. When null or empty, the name is
/// detected from the first <c>charset=</c> declaration found in a <c>&lt;meta&gt;</c> tag.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, the page is decoded as UTF-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" when the download or the
/// decoding fails for any reason (callers rely on this sentinel; no exception escapes).
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the bytes are downloaded.
        using (WebClient client = new WebClient())
        {
            // Some pages cannot be fetched without extra request headers (for example a
            // cookie: client.Headers.Add("Cookie", cookie)); add an overload if that is needed.
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Credentials used to authenticate the request. If the server requires an explicit
            // user name and password, assign a NetworkCredential here instead.
            client.Credentials = CredentialCache.DefaultCredentials;

            pageBytes = client.DownloadData(url);
        }

        // First pass: decode with the default encoding so the markup can be searched for
        // a charset declaration (the declaration itself is plain ASCII).
        string pageText = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the encoding-name token after "charset=". The capture is limited
            // to name characters so it cannot run past the closing quote into other
            // attributes or into text that follows the tag.
            Match charSetMatch = Regex.Match(
                pageText,
                @"<meta[^>]*?charset\s*=\s*[""']?\s*([A-Za-z0-9_\-]+)",
                RegexOptions.IgnoreCase);
            charSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : "";
        }

        // A caller-supplied name may still carry quotes or padding.
        charSet = charSet.Replace("\"", "").Trim();

        if (charSet.Length == 0 && UseUTF8CharSet)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            // Throws for an unknown encoding name; that is reported as "error" below.
            Encoding pageEncoding = Encoding.GetEncoding(charSet);
            if (!pageEncoding.Equals(Encoding.Default))
            {
                // Second pass: decode the raw bytes again with the page's real encoding.
                pageText = pageEncoding.GetString(pageBytes);
            }
        }

        return pageText;
    }
    catch (Exception)
    {
        // Deliberate best-effort contract: any failure is reported through the sentinel value.
        return "error";
    }
}
复制代码
7 v/ j, }* N* K: g R0 L# B' n* M0 v/ u' h9 u+ a' F* n6 W
|
|