|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法，用下面这个：
/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes the bytes into text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the page's character encoding. Pass null or "" to detect it from the
/// page's own <c>&lt;meta ... charset=...&gt;</c> declaration.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, the page is decoded as UTF-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" if the download or the
/// decoding fails for any reason.
/// </returns>
/// <remarks>
/// Some pages cannot be fetched this way (for example, they may require a cookie).
/// Handle those case by case, e.g. by adding a "Cookie" request header, or by
/// assigning a NetworkCredential when the server asks for a user name and password.
/// </remarks>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the bytes are in memory.
        using (WebClient client = new WebClient())
        {
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Credentials used to authenticate the request against the server.
            client.Credentials = CredentialCache.DefaultCredentials;

            pageBytes = client.DownloadData(url);
        }

        // First pass with the system default encoding: good enough to read the
        // ASCII-only <meta> charset declaration, and used as the result when no
        // other encoding applies.
        string pageText = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the charset token itself. Stopping at whitespace, quotes,
            // '/', ';', '<' or '>' keeps trailing attributes out of the encoding name
            // (e.g. <meta charset="utf-8" id="x"> must yield "utf-8").
            Match charSetMatch = Regex.Match(
                pageText,
                "<meta[^<>]*charset\\s*=\\s*[\"']?([^\\s\"'<>/;]+)",
                RegexOptions.IgnoreCase);
            charSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : "";
        }

        // A caller-supplied name may still carry quotes or padding.
        charSet = charSet.Replace("\"", "").Trim();

        if (UseUTF8CharSet && charSet.Length == 0)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            // Resolve the encoding once; an unknown name throws and is reported
            // through the "error" sentinel below.
            Encoding pageEncoding = Encoding.GetEncoding(charSet);
            if (!pageEncoding.Equals(Encoding.Default))
            {
                pageText = pageEncoding.GetString(pageBytes);
            }
        }

        return pageText;
    }
    catch (Exception)
    {
        // Callers rely on the "error" sentinel instead of an exception.
        return "error";
    }
}
复制代码
6 j+ t" [5 C3 u3 ?7 Y( e1 Q$ E) X5 r2 x! M
|
|