|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes the bytes to text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the encoding used by the target page. Pass null or "" to have the
/// encoding detected from the page's &lt;meta ... charset=...&gt; declaration.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, the page is decoded as UTF-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" when the download
/// (or any other step) throws.
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    string strWebData = "error";

    try
    {
        byte[] myDataBuffer;

        // WebClient is IDisposable; release it as soon as the bytes are in memory.
        using (WebClient myWebClient = new WebClient())
        {
            // Some pages cannot be fetched without extra request state (cookies,
            // authentication, ...). Handle those case by case, for example:
            //   myWebClient.Headers.Add("Cookie", cookie);
            //   myWebClient.Credentials = new NetworkCredential(user, password);
            myWebClient.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Credentials used to authenticate the request to the Internet resource.
            myWebClient.Credentials = CredentialCache.DefaultCredentials;

            // Download the resource as a raw byte array.
            myDataBuffer = myWebClient.DownloadData(url);
        }

        // First pass: decode with the system default encoding. This is enough to
        // read the ASCII-only charset declaration even if the body is mis-decoded.
        strWebData = Encoding.Default.GetString(myDataBuffer);

        if (string.IsNullOrEmpty(charSet))
        {
            // Accepts both <meta http-equiv=... content="text/html; charset=xxx">
            // and <meta charset=xxx>, with double, single or no quotes, and
            // captures only the encoding name itself.
            Match charSetMatch = Regex.Match(
                strWebData,
                @"<meta[^<>]*?charset\s*=\s*[""']?\s*([\w\-.:]+)",
                RegexOptions.IgnoreCase);
            charSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : "";
        }

        // A caller-supplied name may still carry quotes; strip them.
        charSet = charSet.Replace("\"", "").Trim();

        if (UseUTF8CharSet && charSet.Length == 0)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            Encoding pageEncoding = null;
            try
            {
                pageEncoding = Encoding.GetEncoding(charSet);
            }
            catch (ArgumentException)
            {
                // Unknown or unsupported encoding name: keep the default-decoded
                // text rather than discarding a page that was downloaded fine.
            }

            // Re-decode only when the page encoding differs from the first pass.
            if (pageEncoding != null && !pageEncoding.Equals(Encoding.Default))
            {
                strWebData = pageEncoding.GetString(myDataBuffer);
            }
        }
    }
    catch (Exception)
    {
        // Contract with existing callers: any failure yields the "error" sentinel.
        strWebData = "error";
    }

    return strWebData;
}
复制代码
|
|