|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its content decoded as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the encoding to decode the page with. When null or empty, the name is
/// taken from the page's own &lt;meta ... charset=...&gt; declaration, if one is found.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no encoding name was supplied or detected, the page is decoded as UTF-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" when the download or the
/// decoding throws (including when the encoding name is not recognised).
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    string strWebData = "error";
    try
    {
        byte[] myDataBuffer;

        // WebClient is IDisposable; release it as soon as the download has finished.
        using (WebClient myWebClient = new WebClient())
        {
            // Some pages cannot be fetched without extra request headers (for example a
            // "Cookie" header). Add an overload that sets them if a site requires it.
            myWebClient.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Authenticate with the default credentials. Assign a NetworkCredential
            // here instead if the server asks for an explicit user name and password.
            myWebClient.Credentials = CredentialCache.DefaultCredentials;

            myDataBuffer = myWebClient.DownloadData(url);
        }

        // First pass: decode with the system default encoding so that the ASCII-only
        // <meta> charset declaration can be searched for.
        strWebData = Encoding.Default.GetString(myDataBuffer);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the charset token itself. Stopping at the first character
            // that cannot be part of an encoding name keeps following attributes of the
            // same tag out of the capture, and also accepts unquoted or single-quoted
            // declarations. Groups[1].Value is "" when nothing matches.
            Match charSetMatch = Regex.Match(
                strWebData,
                "<meta[^<>]*?charset\\s*=\\s*[\"']?\\s*([A-Za-z0-9_.:\\-]+)",
                RegexOptions.IgnoreCase);
            charSet = charSetMatch.Groups[1].Value;
        }

        // A caller-supplied name may still carry quotes or padding.
        charSet = charSet.Replace("\"", "").Trim();

        if (UseUTF8CharSet && charSet.Length == 0)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            // Resolve the encoding once; re-decode only when it differs from the
            // encoding already used for the first pass.
            Encoding pageEncoding = Encoding.GetEncoding(charSet);
            if (pageEncoding != Encoding.Default)
            {
                strWebData = pageEncoding.GetString(myDataBuffer);
            }
        }
    }
    catch (Exception)
    {
        // Deliberate best-effort contract: any failure is reported to the caller
        // through the "error" sentinel rather than by throwing.
        strWebData = "error";
    }

    return strWebData;
}
复制代码
- h" K2 i q5 ^- M' h9 z
/ w1 p/ k g9 y |
|