|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the page at <paramref name="url"/> and decodes its bytes into a string.
/// </summary>
/// <param name="url">Address of the page to fetch.</param>
/// <param name="charSet">
/// Name of the page's character encoding. When null or empty, the charset is
/// detected from the page's own <c>&lt;meta ... charset=...&gt;</c> tag.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, the page is decoded as UTF-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" when the download fails.
/// </returns>
/// <remarks>
/// Some pages cannot be fetched as-is (for example they require a cookie). Handle
/// those case by case, e.g. by adding a "Cookie" header or supplying a
/// NetworkCredential instead of the default credentials; that may need an overload.
/// </remarks>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    string strWebData = "error";
    try
    {
        byte[] myDataBuffer;

        // WebClient is IDisposable; release it as soon as the bytes are in memory.
        using (WebClient myWebClient = new WebClient())
        {
            // Present a browser-like User-agent on the request.
            myWebClient.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Network credentials used to authenticate the request.
            myWebClient.Credentials = CredentialCache.DefaultCredentials;

            // Download the resource as a raw byte array.
            myDataBuffer = myWebClient.DownloadData(url);
        }

        // First pass: decode with the system default encoding. This is good enough
        // to read the ASCII meta tag and is the fallback result.
        strWebData = Encoding.Default.GetString(myDataBuffer);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the charset token itself so that attributes following it
            // in the same tag (name=..., content=...) are not swallowed.
            Match charSetMatch = Regex.Match(
                strWebData,
                "<meta[^<>]*charset\\s*=\\s*[\"']?([^\"'<>\\s;/]+)",
                RegexOptions.IgnoreCase);
            charSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : "";
        }

        // Strip any quotes left around the charset name.
        charSet = charSet.Replace("\"", "").Trim();

        if (UseUTF8CharSet && charSet.Length == 0)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            try
            {
                // Look the encoding up once; re-decode only when it differs from
                // the default encoding already used above.
                Encoding pageEncoding = Encoding.GetEncoding(charSet);
                if (!pageEncoding.Equals(Encoding.Default))
                {
                    strWebData = pageEncoding.GetString(myDataBuffer);
                }
            }
            catch (ArgumentException)
            {
                // Unknown charset name: keep the default-decoded text instead of
                // discarding a page that downloaded successfully.
            }
            catch (NotSupportedException)
            {
                // Encoding not supported on this platform: keep the default-decoded text.
            }
        }
    }
    catch (Exception)
    {
        // Callers rely on the "error" sentinel for any download failure.
        strWebData = "error";
    }

    return strWebData;
}
复制代码
8 a2 j4 n% K% O |
|