|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
// Matches the charset declared in a <meta> tag, quoted or unquoted, e.g.
//   <meta charset="utf-8">
//   <meta http-equiv="Content-Type" content="text/html; charset=gb2312" />
// Only the encoding name itself is captured, so attributes that follow the
// charset declaration inside the same tag are not swallowed.
private static readonly Regex MetaCharSetRegex = new Regex(
    "<meta[^>]*?charset\\s*=\\s*[\"']?\\s*([\\w.:-]+)",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);

/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns it decoded as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the encoding to decode with. When null or empty, the encoding is
/// taken from the page's own &lt;meta ... charset=...&gt; declaration.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no encoding was supplied or detected, the bytes are decoded as UTF-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" when the download or
/// the decoding fails for any reason (this method does not throw).
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the download is done.
        using (WebClient client = new WebClient())
        {
            // Some sites refuse requests without a browser-like User-Agent.
            // Pages that need cookies or explicit credentials are not handled here;
            // add e.g. client.Headers.Add("Cookie", ...) or a NetworkCredential in
            // an overload if that is required.
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Authenticate with the credentials of the current security context.
            client.Credentials = CredentialCache.DefaultCredentials;

            pageBytes = client.DownloadData(url);
        }

        // First pass: decode with the platform default encoding. This is enough
        // to read the <meta> charset declaration.
        string pageText = Encoding.Default.GetString(pageBytes);

        string effectiveCharSet = string.IsNullOrEmpty(charSet)
            ? DetectCharSet(pageText)
            : charSet;

        // Tolerate stray quotes/whitespace around the encoding name.
        effectiveCharSet = effectiveCharSet.Replace("\"", "").Trim();

        if (effectiveCharSet.Length == 0 && UseUTF8CharSet)
        {
            effectiveCharSet = "utf-8";
        }

        if (effectiveCharSet.Length > 0)
        {
            // Throws for an unknown encoding name; the catch below maps that to "error".
            Encoding targetEncoding = Encoding.GetEncoding(effectiveCharSet);

            // Second pass: re-decode only when the target differs from the default.
            if (!targetEncoding.Equals(Encoding.Default))
            {
                pageText = targetEncoding.GetString(pageBytes);
            }
        }

        return pageText;
    }
    catch (Exception)
    {
        // Contract of this method: every failure is reported through the
        // "error" sentinel instead of an exception.
        return "error";
    }
}

// Returns the encoding name declared in the page's <meta> tag, or "" when none is found.
private static string DetectCharSet(string pageText)
{
    Match declaration = MetaCharSetRegex.Match(pageText);
    return declaration.Success ? declaration.Groups[1].Value : "";
}
复制代码
# G) |/ z7 k E! |% U. O4 ^4 R! Y) E4 a% ^1 u6 h7 f+ \
|
|