|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes it into a string.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the encoding of the target page. When null or empty, the encoding is
/// taken from the page's own &lt;meta ... charset=...&gt; declaration, if any.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no encoding was supplied or detected, the page is decoded as UTF-8.
/// When false in that situation, the text decoded with <see cref="Encoding.Default"/> is returned.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" if the download or the
/// decoding fails for any reason.
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the bytes are downloaded.
        using (WebClient client = new WebClient())
        {
            // Note: some pages cannot be fetched this way (they may require cookies,
            // special headers, etc.). Handle those case by case, for example:
            //   client.Headers.Add("Cookie", cookie);
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Credentials used to authenticate the request. If the server requires a
            // user name and password, assign a NetworkCredential here instead.
            client.Credentials = CredentialCache.DefaultCredentials;

            pageBytes = client.DownloadData(url);
        }

        // First pass: decode with the default encoding. This is enough to read the
        // ASCII-only <meta> charset declaration and is the result when no other
        // encoding applies.
        string pageText = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the charset token itself, so that quotes, trailing
            // attributes or a self-closing "/" never end up in the encoding name.
            Match charSetMatch = Regex.Match(
                pageText,
                "<meta[^<>]*charset\\s*=\\s*[\"']?([^\\s\"'<>/;]+)",
                RegexOptions.IgnoreCase);
            charSet = charSetMatch.Success ? charSetMatch.Groups[1].Value : "";
        }
        else
        {
            // Callers may pass the name with surrounding quotes; strip them.
            charSet = charSet.Replace("\"", "");
        }

        if (UseUTF8CharSet && charSet.Length == 0)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            // Throws ArgumentException for an unknown name; that is reported as
            // "error" by the catch below.
            Encoding pageEncoding = Encoding.GetEncoding(charSet);
            if (!pageEncoding.Equals(Encoding.Default))
            {
                pageText = pageEncoding.GetString(pageBytes);
            }
        }

        return pageText;
    }
    catch (Exception)
    {
        // Deliberate best-effort contract: every failure (network, bad URL,
        // unknown encoding name) is reported to the caller as "error".
        return "error";
    }
}
复制代码
0 u; K, d7 J) G% \ |
|