|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
|
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes the bytes to text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the encoding used by the target page. When null or empty, the charset
/// declared in the page's own meta tag is used instead.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, the page is decoded as UTF-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" when the download or the
/// decoding fails for any reason.
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the bytes are in memory.
        using (WebClient client = new WebClient())
        {
            // Some pages cannot be fetched as-is (they may need a cookie, a different
            // user agent, ...). Handle those case by case, for example:
            //   client.Headers.Add("Cookie", cookie);
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Credentials used to authenticate the request. If the server requires
            // a user name and password, assign a NetworkCredential here instead.
            client.Credentials = CredentialCache.DefaultCredentials;

            pageBytes = client.DownloadData(url);
        }

        // First pass with the system default encoding: enough to read the ASCII
        // meta tag and, when no other charset applies, also the final result.
        string pageText = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            charSet = DetectMetaCharSet(pageText);
        }
        else
        {
            // Callers sometimes pass the name still wrapped in quotes.
            charSet = charSet.Replace("\"", "").Trim();
        }

        if (UseUTF8CharSet && string.IsNullOrEmpty(charSet))
        {
            charSet = "utf-8";
        }

        if (string.IsNullOrEmpty(charSet))
        {
            return pageText;
        }

        // Resolve the encoding once; an unknown name throws and is reported as
        // "error" by the catch below, as before.
        Encoding pageEncoding = Encoding.GetEncoding(charSet);
        return pageEncoding.Equals(Encoding.Default)
            ? pageText
            : pageEncoding.GetString(pageBytes);
    }
    catch (Exception)
    {
        // Deliberate contract: every failure is reported as the string "error"
        // rather than as an exception.
        return "error";
    }
}

// Returns the charset name declared in the first <meta ... charset=...> tag of
// the given HTML, or an empty string when no such declaration is found.
private static string DetectMetaCharSet(string html)
{
    // Capture only the charset token itself so that attributes following it in
    // the same tag (e.g. <meta charset="utf-8" name="x">) are not included.
    Match declaration = Regex.Match(
        html,
        "<meta[^>]*?charset\\s*=\\s*[\"']?([\\w.:-]+)",
        RegexOptions.IgnoreCase);

    return declaration.Success ? declaration.Groups[1].Value : "";
}
复制代码
|
|