|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its content decoded as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the encoding used to decode the page. Pass <c>null</c> or an empty string to use
/// the charset declared in the page's own <c>&lt;meta&gt;</c> tag instead.
/// </param>
/// <param name="UseUTF8CharSet">
/// When <c>true</c> and no charset was supplied or detected, the page is decoded as UTF-8.
/// When <c>false</c> in that situation, the text stays decoded with <see cref="Encoding.Default"/>.
/// </param>
/// <returns>
/// The decoded page text, or the literal string <c>"error"</c> when the download or the
/// decoding fails for any reason (callers rely on this sentinel, so no exception escapes).
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; the using block releases it even if the download throws.
        using (WebClient client = new WebClient())
        {
            // Note: some pages may still fail to download (they need cookies, have encoding
            // quirks, and so on). Handle those case by case, e.g. by adding a header such as
            //     client.Headers.Add("Cookie", cookie);
            // which may call for additional overloads of this method.
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Credentials used to authenticate the request. If the server requires a specific
            // user name and password, assign a NetworkCredential(user, password) instead.
            client.Credentials = CredentialCache.DefaultCredentials;

            // Download the raw bytes so they can be decoded again once the charset is known.
            pageBytes = client.DownloadData(url);
        }

        // First pass: decode with the default encoding, which is enough to read the
        // charset declaration out of the markup.
        string pageText = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the charset token itself, staying inside a single <meta ...> tag.
            // Accepts charset=utf-8, charset="utf-8", charset='utf-8' and
            // content="text/html; charset=utf-8".
            Match metaMatch = Regex.Match(
                pageText,
                @"<meta[^>]*?charset\s*=\s*[""']?\s*([\w.:\-]+)",
                RegexOptions.IgnoreCase);
            charSet = metaMatch.Success ? metaMatch.Groups[1].Value : "";
        }

        // A caller-supplied name may still carry quotes or surrounding whitespace.
        charSet = charSet.Replace("\"", "").Trim();

        if (charSet.Length == 0 && UseUTF8CharSet)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            // Throws for an unknown charset name; the catch below turns that into "error".
            Encoding pageEncoding = Encoding.GetEncoding(charSet);

            // Re-decode only when the target encoding differs from the first pass.
            if (!pageEncoding.Equals(Encoding.Default))
            {
                pageText = pageEncoding.GetString(pageBytes);
            }
        }

        return pageText;
    }
    catch (Exception)
    {
        // Deliberate best-effort: every failure is reported through the "error" sentinel.
        return "error";
    }
}
复制代码
|
|