|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,可以用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes the bytes to text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the page encoding. When null or empty, the name is taken from the
/// page's own &lt;meta ... charset=...&gt; declaration, if one is present.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, the page is decoded as UTF-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" when the download fails.
/// This method does not throw; callers detect failure by comparing against "error".
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the bytes are in memory.
        using (WebClient client = new WebClient())
        {
            // Some pages cannot be fetched without extra request state (cookies, credentials, ...).
            // Handle those case by case, e.g. client.Headers.Add("Cookie", cookie) or
            // client.Credentials = new NetworkCredential(user, password).
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Credentials used to authenticate the request to the Internet resource.
            client.Credentials = CredentialCache.DefaultCredentials;

            pageBytes = client.DownloadData(url);
        }

        // First pass: decode with the default encoding so the markup can be searched
        // for a charset declaration (the ASCII part of the markup survives this).
        string pageText = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            charSet = DetectMetaCharSet(pageText);
        }

        // Tolerate quoted or padded names such as "\"utf-8\"".
        charSet = charSet.Replace("\"", "").Trim();

        if (UseUTF8CharSet && charSet.Length == 0)
        {
            charSet = "utf-8";
        }

        if (charSet.Length == 0)
        {
            return pageText;
        }

        Encoding pageEncoding;
        try
        {
            pageEncoding = Encoding.GetEncoding(charSet);
        }
        catch (ArgumentException)
        {
            // Unknown or misspelled charset name: keep the default-decoded text
            // instead of discarding a page that was downloaded successfully.
            return pageText;
        }

        // Second pass: re-decode only when the declared encoding differs from the first pass.
        if (!pageEncoding.Equals(Encoding.Default))
        {
            pageText = pageEncoding.GetString(pageBytes);
        }

        return pageText;
    }
    catch (Exception)
    {
        // Contract kept from the original: any failure (bad URL, network error, ...)
        // is reported through the sentinel return value rather than an exception.
        return "error";
    }
}

/// <summary>
/// Extracts the charset name from the first &lt;meta&gt; tag that declares one.
/// </summary>
/// <param name="html">Page markup, decoded well enough for its ASCII portion to be readable.</param>
/// <returns>The declared charset name, or an empty string when no declaration is found.</returns>
private static string DetectMetaCharSet(string html)
{
    // Matches both <meta charset="utf-8"> and
    // <meta http-equiv="Content-Type" content="text/html; charset=gb2312">.
    // The capture stops at the first character that cannot be part of an encoding name,
    // so attributes following the charset are never swallowed.
    Match declaration = Regex.Match(
        html,
        "<meta[^>]*?charset\\s*=\\s*[\"']?\\s*([A-Za-z0-9_.:-]+)",
        RegexOptions.IgnoreCase);

    return declaration.Success ? declaration.Groups[1].Value : "";
}
复制代码
|
|