|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个gethtml,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes it to a string.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the page's character set (for example "utf-8" or "gb2312").
/// Pass null or "" to detect it from the page's &lt;meta ... charset=...&gt; tag.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no character set was supplied or detected, UTF-8 is assumed.
/// When false, the text decoded with <see cref="Encoding.Default"/> is returned in that case.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" when the download fails.
/// </returns>
/// <remarks>
/// Some pages cannot be fetched this way (they may require cookies, for instance);
/// in that case add the needed header before downloading, e.g.
/// Headers.Add("Cookie", cookie), or supply a NetworkCredential instead of the
/// default credentials when the server asks for a user name and password.
/// </remarks>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    string strWebData = "error";
    try
    {
        byte[] myDataBuffer;

        // WebClient is IDisposable; release it as soon as the bytes are in memory.
        using (WebClient myWebClient = new WebClient())
        {
            myWebClient.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Credentials used to authenticate the request.
            myWebClient.Credentials = CredentialCache.DefaultCredentials;

            myDataBuffer = myWebClient.DownloadData(url);
        }

        // First pass with the default encoding: good enough to read the ASCII
        // <meta> tag, and it is the result when no better encoding is known.
        strWebData = Encoding.Default.GetString(myDataBuffer);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the charset token itself so that surrounding quotes and
            // any attributes following it inside the tag are not swallowed.
            Match charSetMatch = Regex.Match(
                strWebData,
                "<meta[^<>]*charset\\s*=\\s*[\"']?\\s*([A-Za-z0-9_.:-]+)",
                RegexOptions.IgnoreCase);
            charSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : "";
        }
        else
        {
            // Tolerate callers that pass the value still wrapped in quotes.
            charSet = charSet.Replace("\"", "").Trim();
        }

        if (charSet.Length == 0 && UseUTF8CharSet)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            Encoding pageEncoding = null;
            try
            {
                pageEncoding = Encoding.GetEncoding(charSet);
            }
            catch (ArgumentException)
            {
                // Unknown charset name: keep the default-encoding text rather than
                // throwing away a page that downloaded successfully.
            }
            catch (NotSupportedException)
            {
                // Charset not available on this platform: same fallback.
            }

            if (pageEncoding != null)
            {
                strWebData = pageEncoding.GetString(myDataBuffer);
            }
        }
    }
    catch (Exception)
    {
        // Contract with callers: any download failure is reported as "error".
        strWebData = "error";
    }

    return strWebData;
}
复制代码
|
|