|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its content decoded as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the page's character encoding (for example "utf-8" or "gb2312").
/// Pass <c>null</c> or an empty string to detect it from the page's
/// <c>&lt;meta ... charset=...&gt;</c> declaration.
/// </param>
/// <param name="UseUTF8CharSet">
/// When <c>true</c> and no charset was supplied or detected, the bytes are decoded as UTF-8;
/// when <c>false</c> the text decoded with <see cref="Encoding.Default"/> is returned in that case.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" if the download or the decoding
/// fails for any reason (including an unknown charset name).
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    string strWebData = "error";
    try
    {
        byte[] myDataBuffer;

        // WebClient is IDisposable; dispose it as soon as the download has finished.
        using (WebClient myWebClient = new WebClient())
        {
            // Some pages cannot be fetched without extra request headers (cookies, etc.).
            // Add them here when needed, e.g. myWebClient.Headers.Add("Cookie", cookie);
            myWebClient.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Credentials used to authenticate the request. If the server requires a
            // user name and password, assign a NetworkCredential instead.
            myWebClient.Credentials = CredentialCache.DefaultCredentials;

            myDataBuffer = myWebClient.DownloadData(url);
        }

        // First pass: decode with the default encoding so the <meta> charset
        // declaration (which is plain ASCII) can be located.
        strWebData = Encoding.Default.GetString(myDataBuffer);

        string effectiveCharSet = charSet;
        if (string.IsNullOrEmpty(effectiveCharSet))
        {
            // Capture only the charset token itself: optional quote, then the name.
            // This handles <meta charset="x">, <meta charset=x> and
            // <meta http-equiv=... content="text/html; charset=x"> without swallowing
            // any attributes that follow the charset value.
            Match charSetMatch = Regex.Match(
                strWebData,
                @"<meta[^>]*?charset\s*=\s*[""']?\s*([\w.:\-]+)",
                RegexOptions.IgnoreCase | RegexOptions.Multiline);
            effectiveCharSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : "";
        }

        // Callers may pass the value still wrapped in double quotes; strip them.
        effectiveCharSet = effectiveCharSet.Replace("\"", "").Trim();

        if (UseUTF8CharSet && effectiveCharSet.Length == 0)
        {
            effectiveCharSet = "utf-8";
        }

        if (effectiveCharSet.Length > 0)
        {
            // Throws for an unknown charset name; the catch below turns that into "error".
            Encoding targetEncoding = Encoding.GetEncoding(effectiveCharSet);

            // Re-decode only when the target differs from the encoding already used.
            if (!targetEncoding.Equals(Encoding.Default))
            {
                strWebData = targetEncoding.GetString(myDataBuffer);
            }
        }
    }
    catch (Exception)
    {
        // Deliberate best-effort contract: callers receive the sentinel "error"
        // instead of an exception.
        strWebData = "error";
    }

    return strWebData;
}
复制代码
, h) T/ \- v9 O6 s( U/ b, p
. [# s* K) `( F7 W3 U, z( J R2 j v |
|