|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个gethtml,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its content decoded as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the encoding used by the target page. Pass null or "" to detect it
/// from the page's own <c>&lt;meta ... charset=...&gt;</c> declaration.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, the page is decoded as UTF-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" when the download fails.
/// If the charset name is not recognised, the text decoded with
/// <see cref="Encoding.Default"/> is returned instead of discarding the download.
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the bytes are downloaded.
        using (WebClient client = new WebClient())
        {
            // Note: some pages still cannot be fetched (they may need a cookie, have
            // encoding quirks, etc.). Handle those case by case, for example:
            //   client.Headers.Add("Cookie", cookie);
            // which may call for additional overloads of this method.
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Credentials used to authenticate the request. If the server needs an
            // explicit user name and password, assign a NetworkCredential instead.
            client.Credentials = CredentialCache.DefaultCredentials;

            pageBytes = client.DownloadData(url);
        }

        // First pass: decode with the default encoding. This is enough to read the
        // ASCII-only charset declaration even if the rest of the text is garbled.
        string html = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the charset token itself, so attributes that follow it
            // inside the same <meta> tag do not leak into the encoding name.
            Match declared = Regex.Match(
                html,
                "<meta[^>]*?charset\\s*=\\s*[\"']?\\s*([\\w.:-]+)",
                RegexOptions.IgnoreCase);
            charSet = declared.Success ? declared.Groups[1].Value : "";
        }
        else
        {
            // Tolerate a caller-supplied value that still carries quotes.
            charSet = charSet.Replace("\"", "").Trim();
        }

        if (charSet.Length == 0 && UseUTF8CharSet)
        {
            charSet = "utf-8";
        }

        if (charSet.Length == 0)
        {
            return html;
        }

        Encoding pageEncoding;
        try
        {
            pageEncoding = Encoding.GetEncoding(charSet);
        }
        catch (ArgumentException)
        {
            // Unknown or unsupported charset name: keep the default-decoded text
            // rather than throwing away a successful download.
            return html;
        }

        // Re-decode only when the page encoding differs from the first pass.
        return pageEncoding.Equals(Encoding.Default)
            ? html
            : pageEncoding.GetString(pageBytes);
    }
    catch (Exception)
    {
        // Deliberate best-effort contract: callers test for the "error" sentinel,
        // so any download failure is reported this way instead of being thrown.
        return "error";
    }
}
复制代码
|
|