|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns it decoded as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the page's character encoding. When null or empty, the name is taken
/// from the first <c>&lt;meta ... charset=...&gt;</c> declaration found in the page.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, the page is decoded as UTF-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string <c>"error"</c> when the download
/// or decoding fails for any reason.
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the bytes are fetched.
        using (WebClient client = new WebClient())
        {
            // Some pages cannot be fetched without extra headers (for example a
            // cookie: client.Headers.Add("Cookie", cookie)); add an overload if needed.
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Credentials used to authenticate the request. If the server needs an
            // explicit user name/password, assign a NetworkCredential here instead.
            client.Credentials = CredentialCache.DefaultCredentials;

            pageBytes = client.DownloadData(url);
        }

        // First pass: decode with the system default encoding. This is enough to
        // read an ASCII-compatible <meta charset> declaration.
        string html = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            // Accepts quoted ("..." or '...') and unquoted values, and stops at the
            // end of the encoding name so trailing attributes are not captured.
            Match declared = Regex.Match(
                html,
                @"<meta[^>]*?charset\s*=\s*[""']?\s*([A-Za-z0-9_.:\-]+)",
                RegexOptions.IgnoreCase);
            charSet = declared.Success ? declared.Groups[1].Value : "";
        }

        // Callers may pass a value that still carries quotes or padding.
        charSet = charSet.Replace("\"", "").Trim();

        if (UseUTF8CharSet && charSet.Length == 0)
        {
            charSet = "utf-8";
        }

        // Second pass: re-decode only when a usable, different encoding is known.
        // An unrecognized name keeps the first-pass text instead of failing.
        Encoding pageEncoding = ResolveEncodingOrNull(charSet);
        if (pageEncoding != null && !pageEncoding.Equals(Encoding.Default))
        {
            html = pageEncoding.GetString(pageBytes);
        }

        return html;
    }
    catch (Exception)
    {
        // Contract: every failure (bad URL, network error, ...) is reported to the
        // caller as the sentinel string "error" rather than as an exception.
        return "error";
    }
}

/// <summary>
/// Looks up an encoding by name, returning null when the name is empty or is not
/// a code page name known to the platform.
/// </summary>
private static Encoding ResolveEncodingOrNull(string name)
{
    if (string.IsNullOrEmpty(name))
    {
        return null;
    }

    try
    {
        return Encoding.GetEncoding(name);
    }
    catch (ArgumentException)
    {
        // Encoding.GetEncoding throws ArgumentException for invalid/unsupported names.
        return null;
    }
}
复制代码
3 L" i9 z9 F5 |; \4 X* H
2 A T( B8 {. M# }3 _' G$ _2 _ |
|