|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个gethtml,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes it into a string.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the page's character encoding (for example "utf-8" or "gb2312").
/// When null or empty, the encoding is taken from the page's own
/// <c>&lt;meta ... charset=...&gt;</c> declaration, if one is found.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no encoding was supplied or detected, the bytes are decoded as UTF-8.
/// When false in that situation, the text decoded with <see cref="Encoding.Default"/> is returned.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" if anything fails
/// (download failure, unknown encoding name, and so on).
/// </returns>
/// <remarks>
/// Some pages cannot be fetched this way, e.g. because they require a cookie.
/// Handle those case by case, for instance by adding a "Cookie" request header
/// or by assigning a <see cref="NetworkCredential"/> when the server asks for a
/// user name and password; that may call for additional overloads.
/// </remarks>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the download is done.
        using (WebClient client = new WebClient())
        {
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Authenticate the request with the credentials of the current security context.
            client.Credentials = CredentialCache.DefaultCredentials;

            pageBytes = client.DownloadData(url);
        }

        // First pass with the default encoding: good enough to read the ASCII-only
        // <meta> charset declaration, and it is the result when no other encoding applies.
        string html = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the encoding token itself so that attributes following the
            // charset declaration inside the same tag do not end up in the encoding name.
            Match charSetMatch = Regex.Match(
                html,
                @"<meta[^>]*?charset\s*=\s*[""']?\s*([A-Za-z0-9_\-]+)",
                RegexOptions.IgnoreCase);
            charSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : "";
        }

        // Callers may pass the name still wrapped in quotes; strip them.
        charSet = charSet.Replace("\"", "").Trim();

        if (UseUTF8CharSet && charSet.Length == 0)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            // Look the encoding up once; decoding again is harmless when it
            // happens to be the default encoding.
            html = Encoding.GetEncoding(charSet).GetString(pageBytes);
        }

        return html;
    }
    catch (Exception)
    {
        // Callers rely on the "error" sentinel instead of an exception.
        return "error";
    }
}
复制代码
|
|