|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its content decoded as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the page's character encoding. When null or empty, the encoding name is
/// detected from the page's <c>&lt;meta ... charset=...&gt;</c> declaration.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, the page is decoded as UTF-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" when the download or the
/// decoding fails for any reason (callers test for this sentinel).
/// </returns>
/// <remarks>
/// Some pages cannot be fetched as-is (for example they require a cookie). In that case
/// add the needed header before downloading, e.g. <c>Headers.Add("Cookie", cookie)</c>,
/// or assign a <c>NetworkCredential</c> to <c>Credentials</c> when the server asks for a
/// user name and password.
/// </remarks>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] myDataBuffer;

        // WebClient is IDisposable; release it as soon as the bytes are downloaded.
        using (WebClient myWebClient = new WebClient())
        {
            // Present a browser user agent on the request.
            myWebClient.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Credentials used to authenticate the request.
            myWebClient.Credentials = CredentialCache.DefaultCredentials;

            // Download the resource as raw bytes so it can be decoded with the right encoding.
            myDataBuffer = myWebClient.DownloadData(url);
        }

        // First pass: decode with the system default encoding. The charset declaration is
        // ASCII, so it can be located in this text even if other characters come out garbled.
        string strWebData = Encoding.Default.GetString(myDataBuffer);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the encoding name itself (quoted or unquoted), so additional
            // attributes following the charset in the same tag are not swallowed.
            Match charSetMatch = Regex.Match(
                strWebData,
                "<meta[^<>]*?charset\\s*=\\s*[\"']?\\s*([A-Za-z0-9_.:\\-]+)",
                RegexOptions.IgnoreCase);
            charSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : "";
        }

        // Strip stray double quotes (and surrounding whitespace) from the encoding name.
        charSet = charSet.Replace("\"", "").Trim();

        if (UseUTF8CharSet && charSet.Length == 0)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            // Throws ArgumentException for an unknown encoding name; the catch below
            // turns that into the "error" sentinel.
            Encoding pageEncoding = Encoding.GetEncoding(charSet);

            // Re-decode only when the page's encoding differs from the one already used.
            if (!pageEncoding.Equals(Encoding.Default))
            {
                strWebData = pageEncoding.GetString(myDataBuffer);
            }
        }

        return strWebData;
    }
    catch (Exception)
    {
        // Deliberate catch-all: this method reports every failure via the "error" sentinel.
        return "error";
    }
}
复制代码
|
|