|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个gethtml,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns it decoded as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the encoding used by the target page. When null or empty, the
/// encoding is detected from the page's <c>&lt;meta ... charset=...&gt;</c> tag.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, the bytes are decoded as UTF-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string <c>"error"</c> when the download
/// (or any other step) fails.
/// </returns>
/// <remarks>
/// Some pages cannot be fetched as-is (for example, they require a cookie). In that
/// case add the necessary header before downloading, e.g.
/// <c>client.Headers.Add("Cookie", cookie)</c>, or supply explicit credentials via
/// <c>new NetworkCredential(user, password)</c>.
/// </remarks>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the bytes are in memory.
        using (WebClient client = new WebClient())
        {
            // Present a browser-like User-Agent; some servers reject unknown clients.
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Credentials used to authenticate the request against the server.
            client.Credentials = CredentialCache.DefaultCredentials;

            pageBytes = client.DownloadData(url);
        }

        // First pass: decode with the system default encoding. This is good enough
        // to read the ASCII-only <meta charset> declaration.
        string html = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            charSet = DetectCharSet(html);
        }

        // Callers (or sloppy markup) may leave quotes around the name.
        charSet = charSet.Replace("\"", "").Trim();

        if (UseUTF8CharSet && charSet.Length == 0)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            Encoding pageEncoding = TryGetEncoding(charSet);

            // Re-decode only when the page encoding is known and differs from the
            // encoding already used for the first pass.
            if (pageEncoding != null && !pageEncoding.Equals(Encoding.Default))
            {
                html = pageEncoding.GetString(pageBytes);
            }
        }

        return html;
    }
    catch (Exception)
    {
        // Callers rely on the "error" sentinel rather than on exceptions.
        return "error";
    }
}

/// <summary>
/// Extracts the charset name declared in a <c>&lt;meta&gt;</c> tag of <paramref name="html"/>.
/// </summary>
/// <returns>The declared charset name, or an empty string when none is found.</returns>
private static string DetectCharSet(string html)
{
    // Capture only the encoding-name token so trailing attributes or quotes in the
    // same tag (e.g. charset="utf-8" name="x") are not swallowed into the name.
    Match declaration = Regex.Match(
        html,
        "<meta[^<>]*charset\\s*=\\s*[\"']?\\s*([A-Za-z0-9_.:\\-]+)",
        RegexOptions.IgnoreCase);

    return declaration.Success ? declaration.Groups[1].Value : "";
}

/// <summary>
/// Resolves an encoding by name without throwing on unknown names.
/// </summary>
/// <returns>The encoding, or null when <paramref name="name"/> is not a supported encoding name.</returns>
private static Encoding TryGetEncoding(string name)
{
    try
    {
        return Encoding.GetEncoding(name);
    }
    catch (ArgumentException)
    {
        // Unknown or unsupported charset declared by the page: keep the
        // default-decoded text instead of discarding a successful download.
        return null;
    }
}
复制代码
; I) U# ]( {/ h: z& Q
3 I, n2 _" U( v" t' q0 ?* t6 w |
|