|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its content decoded as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the encoding used by the target page. When null or empty, the name is
/// taken from the page's own <c>&lt;meta ... charset=...&gt;</c> declaration, if any.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true, "utf-8" is used if no encoding name was supplied or detected.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" if the download or the
/// decoding fails for any reason (this method does not throw).
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the download is done.
        using (WebClient client = new WebClient())
        {
            // Some pages cannot be fetched with these settings alone (they may need
            // a cookie, explicit credentials, and so on). Handle those case by case,
            // e.g. client.Headers.Add("Cookie", cookie) or assigning a
            // NetworkCredential(user, password) to client.Credentials.
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Credentials used to authenticate the request.
            client.Credentials = CredentialCache.DefaultCredentials;

            pageBytes = client.DownloadData(url);
        }

        // First pass: decode with the system default encoding so the markup can be
        // searched for a charset declaration.
        string html = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the charset token itself, so trailing attributes or
            // quotes inside the <meta> tag never end up in the encoding name.
            Match declared = Regex.Match(
                html,
                "<meta[^>]*?charset\\s*=\\s*[\"']?([\\w.:-]+)",
                RegexOptions.IgnoreCase);

            // Groups[1].Value is the empty string when nothing matched.
            charSet = declared.Groups[1].Value;
        }

        // A caller-supplied name may still carry quotes or padding.
        charSet = charSet.Replace("\"", "").Trim();

        if (UseUTF8CharSet && charSet.Length == 0)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            // Look the encoding up once; an unknown name throws and is reported
            // as "error" by the catch block below.
            Encoding pageEncoding = Encoding.GetEncoding(charSet);

            // Decoding again is only needed when it differs from the first pass.
            if (!pageEncoding.Equals(Encoding.Default))
            {
                html = pageEncoding.GetString(pageBytes);
            }
        }

        return html;
    }
    catch (Exception)
    {
        // Deliberate catch-all: callers rely on the "error" sentinel instead of
        // an exception for any download or decoding failure.
        return "error";
    }
}
复制代码
# |/ D9 b7 [0 X6 D' J9 X. `
% H% n) s" V! D# l; p0 t, b ]9 F! p* | |
|