|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个gethtml,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its content decoded as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the character encoding of the target page. When null or empty, the encoding is
/// taken from the page's own <c>&lt;meta ... charset=..."&gt;</c> declaration, if one is found.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no encoding was supplied or detected, the page is decoded as UTF-8 instead
/// of with <see cref="Encoding.Default"/>.
/// </param>
/// <returns>
/// The decoded page text, or the literal string <c>"error"</c> when any step fails
/// (download failure, unknown encoding name, ...).
/// </returns>
/// <remarks>
/// Some pages cannot be fetched this way, for example when they require a cookie; in that case
/// a header such as <c>Headers.Add("Cookie", cookie)</c> has to be added before the download,
/// which may call for an additional overload. If the server asks for a user name and password,
/// assign a <c>NetworkCredential</c> to <c>Credentials</c> instead of the default credentials.
/// </remarks>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the bytes have been fetched.
        using (WebClient client = new WebClient())
        {
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Credentials used to authenticate the request against the Internet resource.
            client.Credentials = CredentialCache.DefaultCredentials;

            // Download the resource as a raw byte array so it can be decoded more than once.
            pageBytes = client.DownloadData(url);
        }

        // First pass: decode with the system default encoding. This is good enough to read
        // the ASCII-only charset declaration, and is the result when no other encoding applies.
        string html = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            // No encoding supplied by the caller: look for the page's own charset declaration.
            Match charSetMatch = Regex.Match(
                html,
                "<meta([^<]*)charset=([^<]*)\"",
                RegexOptions.IgnoreCase | RegexOptions.Multiline);

            // Groups[2].Value is an empty string (never null) when nothing matched.
            charSet = charSetMatch.Groups[2].Value;
        }

        // The captured value may still carry quote characters or surrounding whitespace.
        charSet = charSet.Replace("\"", "").Trim();

        if (charSet.Length == 0 && UseUTF8CharSet)
        {
            charSet = "utf-8";
        }

        if (charSet.Length == 0)
        {
            return html;
        }

        // Throws ArgumentException for an unknown name; the catch below maps that to "error".
        Encoding pageEncoding = Encoding.GetEncoding(charSet);

        // Re-decode only when the page encoding differs from the one already used.
        return pageEncoding.Equals(Encoding.Default) ? html : pageEncoding.GetString(pageBytes);
    }
    catch (Exception)
    {
        // Deliberate best-effort contract: callers receive the sentinel "error" on any failure.
        return "error";
    }
}
复制代码
& ?2 B! g1 {3 {7 P
( F. n6 L( d3 z3 a$ J; a) x |
|