|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes it into a string.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the encoding used by the target page. When null or empty, the
/// encoding is detected from the first <c>&lt;meta ... charset=...&gt;</c>
/// declaration found in the downloaded markup.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, the bytes are decoded as UTF-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string <c>"error"</c> if the download
/// or the decoding fails for any reason (including an unknown charset name).
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] rawBytes;

        // WebClient is IDisposable; dispose it as soon as the bytes are in memory.
        using (WebClient client = new WebClient())
        {
            // Some pages cannot be fetched without extra request headers (for
            // example a cookie: client.Headers.Add("Cookie", cookie)). Add an
            // overload for such cases when needed.
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Credentials used to authenticate the request. If the server needs
            // an explicit user name / password, assign a NetworkCredential here.
            client.Credentials = CredentialCache.DefaultCredentials;

            rawBytes = client.DownloadData(url);
        }

        // First pass: decode with the system default encoding. The <meta> tag is
        // ASCII, so this is good enough to locate the charset declaration.
        string html = Encoding.Default.GetString(rawBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the charset token itself (optionally quoted), so that
            // attributes following it inside the same tag are not swallowed.
            Match charSetMatch = Regex.Match(
                html,
                "<meta[^>]*?charset\\s*=\\s*[\"']?([\\w.:\\-]+)",
                RegexOptions.IgnoreCase);
            charSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : "";
        }
        else
        {
            // Tolerate a caller-supplied value that still carries quotes.
            charSet = charSet.Replace("\"", "").Trim();
        }

        if (UseUTF8CharSet && string.IsNullOrEmpty(charSet))
        {
            charSet = "utf-8";
        }

        if (!string.IsNullOrEmpty(charSet))
        {
            // Throws for an unknown encoding name; the catch below turns that
            // into the "error" result.
            Encoding targetEncoding = Encoding.GetEncoding(charSet);

            // Re-decode only when the target differs from the first-pass encoding.
            if (!targetEncoding.Equals(Encoding.Default))
            {
                html = targetEncoding.GetString(rawBytes);
            }
        }

        return html;
    }
    catch (Exception)
    {
        // Deliberate best-effort contract: every failure is reported to the
        // caller as the sentinel string "error" instead of an exception.
        return "error";
    }
}
复制代码
|
|