|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes it into a string.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the encoding used to decode the page (e.g. "utf-8", "gb2312").
/// When null or empty, the encoding is taken from the page's
/// <c>&lt;meta ... charset=...&gt;</c> declaration, if one is present.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no encoding was supplied or detected, the page is decoded as UTF-8;
/// otherwise the system default encoding is used in that case.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" when the download or the
/// decoding fails (including when the encoding name is not recognised).
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    string strWebData = "error";
    try
    {
        // WebClient is IDisposable; release it as soon as the download is done.
        using (WebClient myWebClient = new WebClient())
        {
            // Some pages cannot be fetched with these defaults (they may require a
            // cookie, a different user agent, explicit credentials, ...). Handle such
            // cases individually, e.g. myWebClient.Headers.Add("Cookie", cookie) or
            // myWebClient.Credentials = new NetworkCredential(user, password).
            myWebClient.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Authenticate with the credentials of the current security context.
            myWebClient.Credentials = CredentialCache.DefaultCredentials;

            byte[] myDataBuffer = myWebClient.DownloadData(url);

            // First pass: decode with the system default encoding. The charset
            // declaration is plain ASCII, so it can be located in this text even
            // when the page actually uses a different encoding.
            strWebData = Encoding.Default.GetString(myDataBuffer);

            if (string.IsNullOrEmpty(charSet))
            {
                // Matches both <meta charset="utf-8"> and
                // <meta http-equiv="Content-Type" content="text/html; charset=gb2312">,
                // with double quotes, single quotes or no quotes around the value.
                // Only the encoding name itself is captured, so trailing attributes
                // in the same tag cannot leak into the name.
                Match charSetMatch = Regex.Match(
                    strWebData,
                    "<meta[^<>]*?charset\\s*=\\s*[\"']?\\s*([A-Za-z0-9._:-]+)",
                    RegexOptions.IgnoreCase);
                charSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : "";
            }
            else
            {
                // Tolerate a caller-supplied name that still carries quotes.
                charSet = charSet.Replace("\"", "").Replace("'", "").Trim();
            }

            if (charSet.Length == 0 && UseUTF8CharSet)
            {
                charSet = "utf-8";
            }

            if (charSet.Length > 0)
            {
                // Resolve the encoding once; an unknown name throws and is reported
                // as "error" by the catch block below.
                Encoding targetEncoding = Encoding.GetEncoding(charSet);

                // Re-decode only when the target differs from the encoding already used.
                if (!targetEncoding.Equals(Encoding.Default))
                {
                    strWebData = targetEncoding.GetString(myDataBuffer);
                }
            }
        }
    }
    catch (Exception)
    {
        // Callers rely on the "error" sentinel rather than on exceptions, so every
        // failure (network, invalid URL, unknown encoding) is mapped to it.
        strWebData = "error";
    }

    return strWebData;
}
复制代码
|
|