|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns it decoded as text.
/// </summary>
/// <remarks>
/// Some pages cannot be fetched this way (they may require cookies, authentication, etc.).
/// Handle those case by case, e.g. by adding a "Cookie" request header or by assigning a
/// NetworkCredential(user, password) to the client's Credentials; an overload may be needed.
/// </remarks>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the page's character encoding. When null or empty, the encoding is detected
/// from the page's own &lt;meta ... charset=...&gt; declaration.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no encoding was supplied or detected, the page is decoded as UTF-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" when the download fails.
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    string strWebData = "error";
    try
    {
        byte[] myDataBuffer;

        // WebClient is IDisposable; release it as soon as the bytes are in memory.
        using (WebClient myWebClient = new WebClient())
        {
            // Present a regular browser user agent; some sites reject unknown clients.
            myWebClient.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Network credentials used to authenticate the request.
            myWebClient.Credentials = CredentialCache.DefaultCredentials;

            // Download the resource as a raw byte array.
            myDataBuffer = myWebClient.DownloadData(url);
        }

        // First pass: decode with the system default encoding. This is enough to read
        // the ASCII-only charset declaration and doubles as the fallback result.
        strWebData = Encoding.Default.GetString(myDataBuffer);

        if (string.IsNullOrEmpty(charSet))
        {
            // Read the charset declared in a <meta> tag. \x22 and \x27 are the double
            // and single quote, so quoted and unquoted declarations are all accepted,
            // and only the encoding name itself is captured.
            Match charSetMatch = Regex.Match(
                strWebData,
                @"<meta[^<>]*?charset\s*=\s*[\x22\x27]?\s*([\w.:-]+)",
                RegexOptions.IgnoreCase);
            charSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : "";
        }

        // Strip stray double quotes (and surrounding whitespace) from the encoding name.
        charSet = charSet.Replace("\"", "").Trim();

        if (UseUTF8CharSet && charSet.Length == 0)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            Encoding targetEncoding;
            try
            {
                targetEncoding = Encoding.GetEncoding(charSet);
            }
            catch (ArgumentException)
            {
                // Unknown or malformed encoding name: keep the default-decoded text
                // instead of throwing away a page that was downloaded successfully.
                targetEncoding = null;
            }
            catch (NotSupportedException)
            {
                // Encoding exists but is not available on this platform.
                targetEncoding = null;
            }

            // Re-decode only when the target differs from what was already used.
            if (targetEncoding != null && !targetEncoding.Equals(Encoding.Default))
            {
                strWebData = targetEncoding.GetString(myDataBuffer);
            }
        }
    }
    catch (Exception)
    {
        // Deliberate best-effort contract: callers receive "error" on any failure.
        strWebData = "error";
    }

    return strWebData;
}
复制代码
& o6 z: Y2 Q0 h1 W2 \ |
|