|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个gethtml,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its content decoded as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the encoding used by the target page. When null or empty, the charset
/// declared in the page's meta tag (if any) is used instead.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, the page is decoded as UTF-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" when the download or the
/// decoding fails for any reason (including an unknown charset name).
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the bytes are downloaded.
        using (WebClient client = new WebClient())
        {
            // Note: some pages cannot be downloaded as-is (they may require a cookie,
            // a different user agent, explicit credentials, ...). Handle those case by
            // case, e.g. client.Headers.Add("Cookie", cookie) or
            // client.Credentials = new NetworkCredential(user, password).
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Credentials used to authenticate the request to the Internet resource.
            client.Credentials = CredentialCache.DefaultCredentials;

            // Download the resource as a raw byte array.
            pageBytes = client.DownloadData(url);
        }

        // First pass: decode with the system default encoding. This text is used to
        // sniff the declared charset and is also the result when no re-decoding is needed.
        string pageText = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the charset token inside a meta tag, so that attributes
            // following it (e.g. <meta charset="utf-8" name="x">) are not swallowed
            // into the encoding name.
            Match charSetMatch = Regex.Match(
                pageText,
                "<meta[^<>]*?charset\\s*=\\s*[\"']?\\s*([A-Za-z0-9_.:-]+)",
                RegexOptions.IgnoreCase);
            charSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : "";
        }
        else
        {
            // Tolerate a caller-supplied name that still carries quote characters.
            charSet = charSet.Replace("\"", "");
        }

        if (UseUTF8CharSet && charSet.Length == 0)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            // Throws for an unknown encoding name; the catch below maps that to "error".
            Encoding pageEncoding = Encoding.GetEncoding(charSet);

            // Re-decode only when the target encoding differs from the one already used.
            if (!pageEncoding.Equals(Encoding.Default))
            {
                pageText = pageEncoding.GetString(pageBytes);
            }
        }

        return pageText;
    }
    catch (Exception)
    {
        // Callers rely on the "error" sentinel rather than on exceptions, so every
        // failure (network, invalid URL, unknown charset) is reported this way.
        return "error";
    }
}
复制代码
3 q- n+ r- ^5 I! U9 P( b" M8 O: Y# J; x
- u @; e9 P- P9 `/ F |
|