|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its content decoded as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the page's character encoding (for example "utf-8" or "gb2312"). Pass null or ""
/// to have the encoding detected from the page's &lt;meta ... charset=...&gt; declaration.
/// Double quotes around the name are ignored.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no encoding was supplied or detected, the page is decoded as UTF-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" when the download fails or a
/// caller-supplied encoding name is not recognised.
/// </returns>
/// <remarks>
/// Some pages cannot be fetched this way (they may require a cookie, explicit credentials,
/// and so on). Handle those case by case, e.g. by adding a "Cookie" request header or by
/// assigning a NetworkCredential to the client; that may call for additional overloads.
/// </remarks>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable: release it as soon as the raw bytes are in hand.
        using (WebClient client = new WebClient())
        {
            // Present a desktop-browser User-Agent on the request.
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Authenticate with the credentials of the current security context.
            client.Credentials = CredentialCache.DefaultCredentials;

            pageBytes = client.DownloadData(url);
        }

        // First pass with the system default encoding. The charset declaration itself is
        // plain ASCII, so it can be located in this text before the real encoding is known.
        string html = Encoding.Default.GetString(pageBytes);

        bool charSetFromCaller = !string.IsNullOrEmpty(charSet);
        if (!charSetFromCaller)
        {
            charSet = DetectDeclaredCharSet(html);
        }

        charSet = charSet.Replace("\"", "").Trim();

        if (UseUTF8CharSet && charSet.Length == 0)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            Encoding pageEncoding = ResolvePageEncoding(charSet, charSetFromCaller);

            // Decode again only when the resolved encoding differs from the first pass.
            if (pageEncoding != null && !pageEncoding.Equals(Encoding.Default))
            {
                html = pageEncoding.GetString(pageBytes);
            }
        }

        return html;
    }
    catch (Exception)
    {
        // Deliberate best-effort contract: every failure is reported to the caller as the
        // sentinel string "error" instead of an exception.
        return "error";
    }
}

/// <summary>
/// Returns the charset name declared in the first &lt;meta&gt; tag that carries one, or ""
/// when there is none. Accepts double-quoted, single-quoted and unquoted values, and never
/// reads past the end of the tag.
/// </summary>
private static string DetectDeclaredCharSet(string html)
{
    Match declaration = Regex.Match(
        html,
        @"<meta[^>]*?charset\s*=\s*[""']?\s*([A-Za-z0-9_.:\-]+)",
        RegexOptions.IgnoreCase);

    return declaration.Success ? declaration.Groups[1].Value : "";
}

/// <summary>
/// Looks up an encoding by name. An unknown name yields null, unless
/// <paramref name="rethrowWhenUnknown"/> is true, in which case the ArgumentException
/// from the lookup propagates to the caller.
/// </summary>
private static Encoding ResolvePageEncoding(string name, bool rethrowWhenUnknown)
{
    try
    {
        return Encoding.GetEncoding(name);
    }
    catch (ArgumentException)
    {
        if (rethrowWhenUnknown)
        {
            throw;
        }

        // A bogus charset declared by the page should not discard a successful download;
        // the caller keeps the default-decoded text.
        return null;
    }
}
复制代码
|
|