|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its content decoded as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the page's character encoding (for example "utf-8" or "gb2312").
/// Pass null or "" to have the encoding detected from the page's &lt;meta ... charset=...&gt; tag.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, the bytes are decoded as UTF-8.
/// When false, the system default encoding is used in that case.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" when the download fails.
/// If the charset name is not recognized by the runtime, the text decoded with the
/// system default encoding is returned instead of failing.
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    byte[] myDataBuffer;
    try
    {
        // WebClient is IDisposable; the using block releases it even when the download throws.
        using (WebClient myWebClient = new WebClient())
        {
            // Some sites need extra request headers (for example a "Cookie" header) before they
            // will serve the page; add them here when required.
            myWebClient.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Authenticate with the credentials of the current security context. Replace with a
            // NetworkCredential if the server requires an explicit user name and password.
            myWebClient.Credentials = CredentialCache.DefaultCredentials;

            myDataBuffer = myWebClient.DownloadData(url);
        }
    }
    catch (Exception)
    {
        // Existing callers test for this sentinel, so any download failure
        // (bad address, network error, null url) is still reported as "error".
        return "error";
    }

    // First pass with the system default encoding: good enough to read the ASCII meta tag.
    string strWebData = Encoding.Default.GetString(myDataBuffer);

    if (string.IsNullOrEmpty(charSet))
    {
        // Capture only the charset token itself, with or without surrounding quotes, so that
        // attributes following it inside the same tag are not swallowed into the name.
        Match charSetMatch = Regex.Match(
            strWebData,
            "<meta[^>]*?charset\\s*=\\s*[\"']?([A-Za-z0-9_.:\\-]+)",
            RegexOptions.IgnoreCase);
        charSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : "";
    }
    else
    {
        // Tolerate callers that pass the name with quotes copied from markup.
        charSet = charSet.Replace("\"", "").Trim();
    }

    if (UseUTF8CharSet && charSet.Length == 0)
    {
        charSet = "utf-8";
    }

    if (charSet.Length > 0)
    {
        try
        {
            Encoding pageEncoding = Encoding.GetEncoding(charSet);

            // Re-decode only when the target encoding differs from the one already used.
            if (!pageEncoding.Equals(Encoding.Default))
            {
                strWebData = pageEncoding.GetString(myDataBuffer);
            }
        }
        catch (ArgumentException)
        {
            // Unknown or unsupported charset name: keep the default-decoded text rather
            // than discarding a page that was downloaded successfully.
        }
    }

    return strWebData;
}
复制代码
# r4 S& M- w O/ |& I+ r2 L! m* i& R- n6 C
|
|