|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个gethtml,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its content decoded as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the page's character encoding (for example "utf-8" or "gb2312").
/// Pass null or "" to have the encoding detected from the page's
/// <c>&lt;meta ... charset=...&gt;</c> declaration.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, the page is decoded as UTF-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" when the download fails.
/// If the charset name is unknown or unsupported on this platform, the text decoded
/// with <see cref="Encoding.Default"/> is returned instead of failing.
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    string strWebData = "error";
    try
    {
        byte[] myDataBuffer;

        // WebClient is IDisposable; release it as soon as the bytes are downloaded.
        using (WebClient myWebClient = new WebClient())
        {
            // Note: some pages cannot be fetched as-is (they may require a cookie, etc.).
            // Handle those case by case, e.g. by adding a header:
            //     myWebClient.Headers.Add("Cookie", cookie);
            // which may call for additional overloads of this method.
            myWebClient.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");
            // Alternative user agent:
            //     "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"

            // Credentials used to authenticate the request.
            // If the server requires a user name and password, use instead:
            //     myWebClient.Credentials = new NetworkCredential(struser, strpassword);
            myWebClient.Credentials = CredentialCache.DefaultCredentials;

            // Download the resource as raw bytes so it can be decoded (twice if necessary).
            myDataBuffer = myWebClient.DownloadData(url);
        }

        // First pass: decode with the system default encoding. This is good enough to
        // read the ASCII-only <meta charset> declaration, and is the final result when
        // no better encoding can be determined.
        strWebData = Encoding.Default.GetString(myDataBuffer);

        if (string.IsNullOrEmpty(charSet))
        {
            // Detect the charset declared by the page. Only the encoding name itself is
            // captured (the match stays inside the tag and stops at the closing quote),
            // so additional attributes after charset do not leak into the name.
            Match charSetMatch = Regex.Match(
                strWebData,
                @"<meta[^<>]*?charset\s*=\s*[""']?\s*([A-Za-z0-9_.:\-]+)",
                RegexOptions.IgnoreCase);
            charSet = charSetMatch.Groups[1].Value; // "" when there is no match
        }
        else
        {
            // Tolerate a caller-supplied name that still carries quotes.
            charSet = charSet.Replace("\"", "").Trim();
        }

        if (UseUTF8CharSet && charSet.Length == 0)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            Encoding pageEncoding = null;
            try
            {
                pageEncoding = Encoding.GetEncoding(charSet);
            }
            catch (ArgumentException)
            {
                // Unknown/unsupported encoding name: keep the default-decoded text
                // rather than throwing away a successful download.
            }

            // Second pass: re-decode only if the page encoding differs from the default.
            if (pageEncoding != null && !pageEncoding.Equals(Encoding.Default))
            {
                strWebData = pageEncoding.GetString(myDataBuffer);
            }
        }
    }
    catch (Exception)
    {
        // Contract relied on by callers: any failure is reported as "error".
        strWebData = "error";
    }

    return strWebData;
}
复制代码
|