|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Matches the charset token declared inside an HTML meta tag, covering both
/// the short form (meta charset="utf-8") and the http-equiv form
/// (content="text/html; charset=gb2312"). Group 1 holds only the charset
/// name, so neighbouring attributes and quotes are never captured.
/// </summary>
private static readonly Regex MetaCharsetRegex = new Regex(
    "<meta[^>]*?charset\\s*=\\s*[\"']?\\s*([\\w.:-]+)",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);

/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns it decoded as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Encoding name of the target page. When null or empty, the encoding is
/// detected from the page's meta charset declaration.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, the page is decoded as UTF-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" when the download
/// (or any other step) fails. If the charset name is not a known encoding,
/// the text decoded with <see cref="Encoding.Default"/> is returned instead.
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the download completes.
        using (WebClient client = new WebClient())
        {
            // Some pages cannot be fetched as-is (they may need a cookie, special
            // headers, etc.). Handle those case by case, e.g. by adding a "Cookie"
            // header here or by introducing an overload.
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Credentials used to authenticate the request. If the server requires
            // a user name and password, assign a NetworkCredential here instead.
            client.Credentials = CredentialCache.DefaultCredentials;

            pageBytes = client.DownloadData(url);
        }

        // First pass with the system default encoding: this is enough to read
        // the ASCII-only meta tag that declares the real encoding.
        string html = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            Match declared = MetaCharsetRegex.Match(html);
            charSet = declared.Success ? declared.Groups[1].Value : "";
        }
        else
        {
            // Callers sometimes pass the attribute value with its quotes still attached.
            charSet = charSet.Replace("\"", "").Trim();
        }

        if (UseUTF8CharSet && string.IsNullOrEmpty(charSet))
        {
            charSet = "utf-8";
        }

        // Decode again with the requested/declared encoding when it resolves.
        Encoding pageEncoding = ResolveEncoding(charSet);
        if (pageEncoding != null)
        {
            html = pageEncoding.GetString(pageBytes);
        }

        return html;
    }
    catch (Exception)
    {
        // Callers rely on the "error" sentinel and on this method never throwing,
        // so the broad catch is kept on purpose.
        return "error";
    }
}

/// <summary>
/// Resolves an encoding name to an <see cref="Encoding"/>; returns null when the
/// name is null, empty, or not a recognised encoding.
/// </summary>
private static Encoding ResolveEncoding(string charSet)
{
    if (string.IsNullOrEmpty(charSet))
    {
        return null;
    }

    try
    {
        return Encoding.GetEncoding(charSet);
    }
    catch (ArgumentException)
    {
        // Unknown or unsupported charset name: let the caller keep the
        // default-decoded text instead of discarding a successful download.
        return null;
    }
}
复制代码
|
|