|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns it decoded as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the encoding to decode the page with. Pass <c>null</c> or <c>""</c> to have the
/// encoding detected from the first <c>&lt;meta ... charset=...&gt;</c> declaration in the page.
/// </param>
/// <param name="UseUTF8CharSet">
/// When <c>true</c> and no charset was supplied or detected, the page is decoded as UTF-8.
/// When <c>false</c> in that situation, the <see cref="Encoding.Default"/> decoding is returned.
/// </param>
/// <returns>
/// The decoded page text, or the literal string <c>"error"</c> if anything fails
/// (download failure, unknown encoding name, ...).
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the bytes have been fetched.
        using (WebClient client = new WebClient())
        {
            // Some pages cannot be fetched this simply (they may need cookies, other headers, ...).
            // Handle those case by case, e.g. client.Headers.Add("Cookie", cookie);
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Credentials used to authenticate the request. If the server needs an explicit
            // user name / password, assign a NetworkCredential here instead.
            client.Credentials = CredentialCache.DefaultCredentials;

            pageBytes = client.DownloadData(url);
        }

        // First pass: decode with the default encoding. The markup around the charset
        // declaration is ASCII, so this is good enough to sniff the real encoding.
        string html = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the encoding token itself, so that both
            //   <meta charset="utf-8" ...>  and
            //   <meta http-equiv="Content-Type" content="text/html; charset=gb2312">
            // yield a clean name even when further attributes follow.
            Match charSetMatch = Regex.Match(
                html,
                "<meta[^>]*?charset\\s*=\\s*[\"']?\\s*([\\w.:-]+)",
                RegexOptions.IgnoreCase);
            charSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : "";
        }

        // A caller-supplied name may still carry quotes; strip them as the original did.
        charSet = charSet.Replace("\"", "").Trim();

        if (charSet.Length == 0 && UseUTF8CharSet)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            // Throws for an unknown encoding name; that is reported as "error" below.
            Encoding pageEncoding = Encoding.GetEncoding(charSet);

            // Re-decode only when the target encoding differs from the first pass.
            if (!pageEncoding.Equals(Encoding.Default))
            {
                html = pageEncoding.GetString(pageBytes);
            }
        }

        return html;
    }
    catch (Exception)
    {
        // Deliberate best-effort contract: callers get the sentinel "error" on any failure.
        return "error";
    }
}
复制代码
* E4 H E: h6 L) u& Y! P, P% t% {: _; }, `9 ?, h
|
|