|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个gethtml,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns it as text,
/// decoded with the requested or auto-detected character set.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the encoding used by the target page. When null or empty, the
/// charset declared in the page's HTML meta tag is used instead.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, the page is decoded as UTF-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" when the download
/// (or any other step) throws.
/// </returns>
/// <remarks>
/// Some pages cannot be fetched this way (for example when they require a
/// cookie or server-specific credentials); such cases need extra request
/// headers or a <c>NetworkCredential</c> and are not handled here.
/// </remarks>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the bytes are downloaded.
        using (WebClient client = new WebClient())
        {
            // Present a browser user agent; some sites reject requests without one.
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");
            // Authenticate with the credentials of the current security context.
            client.Credentials = CredentialCache.DefaultCredentials;
            pageBytes = client.DownloadData(url);
        }

        // First pass: decode with the system default encoding. The meta tag is
        // plain ASCII, so it can be located even if the rest of the text is garbled.
        string html = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the charset token itself, so trailing attributes or
            // quotes (e.g. <meta charset="utf-8" name="x">) are not swallowed.
            Match metaMatch = Regex.Match(
                html,
                @"<meta[^>]*?charset\s*=\s*[""']?\s*([\w\-:.]+)",
                RegexOptions.IgnoreCase);
            charSet = metaMatch.Success ? metaMatch.Groups[1].Value : "";
        }

        // Callers may pass a quoted name; strip the quotes before the lookup.
        charSet = charSet.Replace("\"", "").Trim();

        if (UseUTF8CharSet && charSet.Length == 0)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            Encoding pageEncoding = null;
            try
            {
                pageEncoding = Encoding.GetEncoding(charSet);
            }
            catch (ArgumentException)
            {
                // Unknown or unsupported charset name: keep the default-decoded
                // text rather than discarding a successfully downloaded page.
            }

            // Re-decode only when the resolved encoding differs from the one already used.
            if (pageEncoding != null && !pageEncoding.Equals(Encoding.Default))
            {
                html = pageEncoding.GetString(pageBytes);
            }
        }

        return html;
    }
    catch (Exception)
    {
        // Existing contract: callers check for the "error" sentinel instead of
        // handling exceptions, so every failure is mapped to it.
        return "error";
    }
}
复制代码
& s6 w9 ?$ q7 B
8 }& L1 U! {$ h2 m |
|