|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns it decoded as text.
/// </summary>
/// <param name="url">Address of the page to fetch.</param>
/// <param name="charSet">
/// Name of the page's encoding. Pass null or "" to have the encoding detected
/// from the charset declaration in the page's meta tag.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, the page is decoded as utf-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" if the download or the
/// decoding fails for any reason (callers test for this sentinel).
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the bytes are in memory.
        using (WebClient client = new WebClient())
        {
            // Note: some pages will not download as-is (they may need a cookie, etc.).
            // Handle those case by case, e.g. client.Headers.Add("Cookie", cookie);
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Credentials used to authenticate the request. If the server requires a
            // user name and password, assign a NetworkCredential here instead.
            client.Credentials = CredentialCache.DefaultCredentials;

            pageBytes = client.DownloadData(url);
        }

        // First pass: decode with the system default code page so the (ASCII) meta tag
        // can be searched even when the real encoding is not yet known.
        string html = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the charset token itself. Both <meta charset="x"> and
            // <meta http-equiv=... content="text/html; charset=x"> are handled, and
            // trailing attributes on the same tag no longer leak into the capture.
            Match charSetMatch = Regex.Match(
                html,
                "<meta[^>]*?charset\\s*=\\s*[\"']?\\s*([^\\s\"'/>;]+)",
                RegexOptions.IgnoreCase);
            charSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : "";
        }
        else
        {
            // Tolerate a caller-supplied name that still carries quote characters.
            charSet = charSet.Replace("\"", "");
        }

        if (charSet.Length == 0 && UseUTF8CharSet)
        {
            charSet = "utf-8";
        }

        if (charSet.Length == 0)
        {
            // Nothing supplied, nothing detected: keep the default-code-page decoding.
            return html;
        }

        // Resolve the encoding once; an unknown name throws and is reported as "error".
        Encoding targetEncoding = Encoding.GetEncoding(charSet);

        // Re-decode only when the target differs from what the first pass already used.
        return targetEncoding.Equals(Encoding.Default)
            ? html
            : targetEncoding.GetString(pageBytes);
    }
    catch (Exception)
    {
        // Best-effort contract kept for existing callers: every failure maps to "error".
        return "error";
    }
}
复制代码
* x, D8 j( P6 i, ~% N |
|