|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes it to a string.
/// The bytes are first decoded with <see cref="Encoding.Default"/>; if a different
/// character set is supplied or detected, the bytes are decoded again with it.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Character set name of the target page. When null or empty, the name is taken
/// from the first <c>&lt;meta ... charset=...&gt;</c> declaration found in the page.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no character set was supplied or detected, "utf-8" is used.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" when the download or the
/// decoding fails for any reason (including an unknown character set name).
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    const string failureResult = "error";

    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the download finishes.
        using (WebClient client = new WebClient())
        {
            // Some pages cannot be fetched without extra request headers (for
            // example a cookie: client.Headers.Add("Cookie", cookie)); add them
            // here if a particular site requires it.
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Credentials used to authenticate the request. Replace with a
            // NetworkCredential if the server asks for a user name and password.
            client.Credentials = CredentialCache.DefaultCredentials;

            pageBytes = client.DownloadData(url);
        }

        // First pass with the default encoding: enough to read the ASCII-only
        // <meta> charset declaration, and the final result if nothing else applies.
        string html = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the charset token itself, so that quotes and any
            // attributes following it in the same tag are not included.
            Match declared = Regex.Match(
                html,
                @"<meta[^>]*?charset\s*=\s*[""']?\s*([\w\-]+)",
                RegexOptions.IgnoreCase);
            charSet = declared.Success ? declared.Groups[1].Value : string.Empty;
        }
        else
        {
            // Tolerate a caller-supplied name that still carries quotes.
            charSet = charSet.Replace("\"", "").Trim();
        }

        if (UseUTF8CharSet && charSet.Length == 0)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            // Throws ArgumentException for an unknown name; handled below.
            Encoding pageEncoding = Encoding.GetEncoding(charSet);
            if (!pageEncoding.Equals(Encoding.Default))
            {
                html = pageEncoding.GetString(pageBytes);
            }
        }

        return html;
    }
    catch (Exception)
    {
        // Deliberate best-effort contract: callers receive "error" instead of
        // an exception for any download or decoding failure.
        return failureResult;
    }
}
复制代码
1 X1 `% ~0 ]% ` |
|