|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个gethtml,用下面这个:
" @$ O& G2 a3 n. w- public string getHtml(string url, string charSet, bool UseUTF8CharSet)//url是要访问的网站地址,charSet是目标网页的编码,如果传入的是null或者"",那就自动分析网页的编码 8 i% A; P- N) h, y! m2 o7 @
- {4 \9 R, N3 V! b0 h
- string strWebData = "error";
X2 K9 l4 B8 P - try$ p- i$ Z9 L' u K% u/ W! [, B
- {
# d# F+ a( l2 V2 n - WebClient myWebClient = new WebClient(); //创建WebClient实例myWebClient
2 g5 T ~# l1 ^0 x8 Z - // 需要注意的: . I7 g$ `* X, H) X$ ^: l% f
- //有的网页可能下不下来,有种种原因比如需要cookie,编码问题等等
0 w1 H4 Z" b' p - //这是就要具体问题具体分析比如在头部加入cookie " X; l" M, B" l5 m) \
- // webclient.Headers.Add("Cookie", cookie); ' f, V6 r* `' B# J8 P2 {+ L
- //这样可能需要一些重载方法。根据需要写就可以了
" s% c2 ^1 {0 s5 T' K - myWebClient.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");
. A% z: j% r/ R6 D - //myWebClient.Headers.Add("User-agent", "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)");
) c" o' |# G& A2 i, b; w5 N8 o7 P3 [$ Y - //获取或设置用于对向 Internet 资源的请求进行身份验证的网络凭据。
' K( l0 U. e. ] u% d" i5 u - myWebClient.Credentials = CredentialCache.DefaultCredentials;
! Q' Q/ R" @1 c5 f - //如果服务器要验证用户名,密码 6 c: q- I1 Y' z. n
- //NetworkCredential mycred = new NetworkCredential(struser, strpassword); / _8 r# E, L+ N7 |# u
- //myWebClient.Credentials = mycred; / k; I- t5 L6 U5 ]- v! G9 I, H2 {
- //从资源下载数据并返回字节数组。(加@是因为网址中间有"/"符号)
( `5 `7 o. F$ i. v$ T - byte[] myDataBuffer = myWebClient.DownloadData(url);% U" a) k o$ [; e% ~; o/ ~! K
- strWebData = Encoding.Default.GetString(myDataBuffer); T2 T/ h5 t' m+ X! \! D
9 V4 p/ M9 {% ^6 X- //获取网页字符编码描述信息
3 D4 k4 T2 H3 |# D* {6 f - Match charSetMatch = Regex.Match(strWebData, "<meta([^<]*)charset=([^<]*)"", RegexOptions.IgnoreCase | RegexOptions.Multiline);
* B& p- _ Q2 n# X" \, L - string webCharSet = charSetMatch.Groups[2].Value;
& z/ E! ]3 j+ b2 Y1 n3 @+ ~" X! e2 i3 t - if (charSet == null || charSet == "")
3 q- a! |$ Y& y2 E* S9 _, J - charSet = webCharSet;/ A+ N- E1 T/ M- N5 ?5 ?, M# B; ~7 u
- if (charSet.Length > 0)4 y9 c F5 O* j7 q O
- {/ h# F5 w5 d! V5 K* p7 ~6 Z
- charSet = charSet.Replace(""", "");
1 W" H; ^5 Z; \6 f: k+ Y - }
$ [5 O$ L2 \4 ~ M# A2 C - if (UseUTF8CharSet), V$ _% J( Z5 q2 X5 u# L
- {. z- O, ]( l* H
- if (charSet == null || charSet.Length == 0)
' m6 W3 c/ `0 g; D - {2 A/ w7 \; [9 R- c
- charSet = "utf-8"; _! f2 v! l q; o+ X4 K# P
- }
. k2 X7 c& W1 o7 `& ~ - }7 @/ ~4 w; }7 Q
- if (charSet != null && charSet != "" && Encoding.GetEncoding(charSet) != Encoding.Default)( d8 L8 G: n2 M; t4 G
- strWebData = Encoding.GetEncoding(charSet).GetString(myDataBuffer);# b& q( A: N! [7 |4 p
- + Y7 W; U2 f7 y% P
- }/ C9 F" k) ~, r. R: ]
- catch (Exception)+ M8 u+ x: l( q8 i, d
- {- _0 \3 \! d8 V1 \/ J |; G5 r( b) l
- strWebData = "error";9 D* n5 X, Y! w$ P9 K, r8 \! g- {
- }
0 @. C# V* b4 |" ] - # h2 t m$ ]4 @/ `
- return strWebData;% |$ G2 E& ?6 a, l( r& j0 n
- }
复制代码
|
|