|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns it decoded as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the page's character encoding (for example "utf-8" or "gb2312").
/// Pass null or "" to have the encoding detected from the page's
/// <c>&lt;meta ... charset=...&gt;</c> declaration.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, the page is decoded as UTF-8;
/// when false, the system default encoding is used in that situation.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" if the download or the
/// decoding fails for any reason (this method does not throw).
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    string strWebData = "error";
    try
    {
        // WebClient is IDisposable; dispose it deterministically instead of leaking it.
        using (WebClient myWebClient = new WebClient())
        {
            // Some sites refuse requests without a browser-like User-Agent. Pages that
            // additionally need cookies or explicit credentials require extra headers
            // (e.g. Headers.Add("Cookie", ...)) or a NetworkCredential instead.
            myWebClient.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Authenticate with the credentials of the current security context.
            myWebClient.Credentials = CredentialCache.DefaultCredentials;

            byte[] myDataBuffer = myWebClient.DownloadData(url);

            // First pass: decode with the system default encoding. The markup around the
            // charset declaration is ASCII, so this is good enough to locate it.
            strWebData = Encoding.Default.GetString(myDataBuffer);

            if (string.IsNullOrEmpty(charSet))
            {
                // Capture only the charset token itself: stop at whitespace, quotes,
                // ';', '/' or '>' so that trailing attributes are never swallowed.
                Match charSetMatch = Regex.Match(
                    strWebData,
                    "<meta\\b[^>]*?charset\\s*=\\s*[\"']?\\s*([^\\s\"'/>;]+)",
                    RegexOptions.IgnoreCase);
                charSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : "";
            }

            // Tolerate callers that pass a quoted or padded charset name.
            charSet = charSet.Replace("\"", "").Trim();

            if (UseUTF8CharSet && charSet.Length == 0)
            {
                charSet = "utf-8";
            }

            if (charSet.Length > 0)
            {
                // Resolve the encoding once; re-decode only if it differs from the
                // encoding already used for the first pass.
                Encoding targetEncoding = Encoding.GetEncoding(charSet);
                if (!targetEncoding.Equals(Encoding.Default))
                {
                    strWebData = targetEncoding.GetString(myDataBuffer);
                }
            }
        }
    }
    catch (Exception)
    {
        // Deliberate best-effort contract: callers detect failure through the "error"
        // sentinel (network failure, unknown charset name, invalid URL, ...).
        strWebData = "error";
    }

    return strWebData;
}
复制代码
|