|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its content decoded to text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the character encoding of the target page. Pass null or "" to use the
/// charset declared in the page's own &lt;meta&gt; tag instead.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected in the page, the content is decoded as UTF-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" when the download or the decoding
/// throws (including an encoding name that <see cref="Encoding.GetEncoding(string)"/> rejects).
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    string strWebData = "error";
    try
    {
        byte[] myDataBuffer;

        // WebClient is IDisposable; release it as soon as the bytes are downloaded.
        using (WebClient myWebClient = new WebClient())
        {
            // Note: some pages cannot be fetched with these settings (they may need a cookie,
            // e.g. myWebClient.Headers.Add("Cookie", cookie), or other headers); handle
            // those cases individually, possibly via an overload.
            myWebClient.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Credentials used to authenticate the request. If the server requires a
            // user name and password, assign a NetworkCredential here instead.
            myWebClient.Credentials = CredentialCache.DefaultCredentials;

            // Download the resource as a raw byte array.
            myDataBuffer = myWebClient.DownloadData(url);
        }

        // First pass: decode with the system default encoding so the markup can be searched.
        strWebData = Encoding.Default.GetString(myDataBuffer);

        // Read the charset declared in a <meta> tag. Only the encoding token itself is
        // captured (optionally quoted with " or '), and the search never leaves the tag,
        // so attributes that follow the charset are not swallowed into the name.
        Match charSetMatch = Regex.Match(
            strWebData,
            @"<meta[^<>]*?charset\s*=\s*[""']?\s*([\w.:-]+)",
            RegexOptions.IgnoreCase);
        string webCharSet = charSetMatch.Groups[1].Value; // "" when nothing matched

        if (string.IsNullOrEmpty(charSet))
        {
            charSet = webCharSet;
        }

        // A caller-supplied name may still carry double quotes; strip them.
        charSet = charSet.Replace("\"", "");

        if (UseUTF8CharSet && charSet.Length == 0)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            // Second pass: re-decode the raw bytes only when the target encoding
            // differs from the one already used above.
            Encoding pageEncoding = Encoding.GetEncoding(charSet);
            if (!pageEncoding.Equals(Encoding.Default))
            {
                strWebData = pageEncoding.GetString(myDataBuffer);
            }
        }
    }
    catch (Exception)
    {
        // Contract of this helper: every failure is reported as the string "error".
        strWebData = "error";
    }

    return strWebData;
}
复制代码
1 ] {% k" X4 \9 h- M! l$ Z+ T9 T |
|