|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the page at <paramref name="url"/> and returns its content decoded to a string.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the page's character encoding. When null or empty, the encoding is taken
/// from the page's own <c>&lt;meta ... charset=...&gt;</c> declaration, if one is found.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no encoding could be determined, the page is decoded as UTF-8;
/// when false it stays decoded with <see cref="Encoding.Default"/>.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" when the download or the
/// decoding fails for any reason (including an unknown encoding name).
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    const string ErrorResult = "error";

    // Nothing to download; report failure the same way every other failure is reported.
    if (string.IsNullOrEmpty(url))
    {
        return ErrorResult;
    }

    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the bytes are in memory.
        using (WebClient client = new WebClient())
        {
            // Some pages cannot be fetched as-is (they may need a cookie, for example).
            // Handle those case by case, e.g. client.Headers.Add("Cookie", cookie);
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Credentials used to authenticate the request. If the server requires an
            // explicit user name and password, assign a NetworkCredential here instead.
            client.Credentials = CredentialCache.DefaultCredentials;

            pageBytes = client.DownloadData(url);
        }

        // First pass: decode with the system default encoding so the markup can be
        // searched for a charset declaration.
        string html = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the encoding token itself, so extra attributes after the
            // charset (e.g. <meta charset="utf-8" name="x">) are not swallowed.
            Match declared = Regex.Match(
                html,
                @"<meta[^>]*?charset\s*=\s*[""']?([\w.:-]+)",
                RegexOptions.IgnoreCase);
            charSet = declared.Success ? declared.Groups[1].Value : "";
        }

        // Callers may pass the name still wrapped in quotes; strip them.
        charSet = charSet.Replace("\"", "").Replace("'", "").Trim();

        if (charSet.Length == 0 && UseUTF8CharSet)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            // Throws for an unknown encoding name; the catch below maps that to "error".
            Encoding pageEncoding = Encoding.GetEncoding(charSet);

            // Re-decode only when the target encoding differs from the first pass.
            if (pageEncoding.CodePage != Encoding.Default.CodePage)
            {
                html = pageEncoding.GetString(pageBytes);
            }
        }

        return html;
    }
    catch (Exception)
    {
        // Deliberate best-effort contract: callers check for the "error" sentinel.
        return ErrorResult;
    }
}
复制代码
|
|