|
|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its content decoded as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the encoding used by the target page. Pass <c>null</c> or an empty string
/// to use the charset declared in the page's own <c>&lt;meta&gt;</c> tag.
/// </param>
/// <param name="UseUTF8CharSet">
/// When <c>true</c> and no charset was supplied or detected, the page is decoded as UTF-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string <c>"error"</c> if the download or
/// the decoding fails for any reason.
/// </returns>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the bytes are downloaded.
        using (WebClient client = new WebClient())
        {
            // Note: some pages cannot be fetched without extra request headers
            // (for example a cookie). Add them here when needed, e.g.
            // client.Headers.Add("Cookie", cookie);
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Credentials used to authenticate the request. If the server requires
            // a user name and password, assign a NetworkCredential instead.
            client.Credentials = CredentialCache.DefaultCredentials;

            pageBytes = client.DownloadData(url);
        }

        // First pass: decode with the system default encoding so the markup can be
        // searched for a charset declaration.
        string pageText = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            // Capture only the charset token itself. This handles both
            // <meta charset="utf-8"> and
            // <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
            // without swallowing attributes that follow the charset value.
            Match charSetMatch = Regex.Match(
                pageText,
                "<meta[^<>]*?charset\\s*=\\s*[\"']?\\s*([A-Za-z0-9_.:\\-]+)",
                RegexOptions.IgnoreCase);

            // Groups[1].Value is an empty string when nothing matched.
            charSet = charSetMatch.Groups[1].Value;
        }

        if (charSet.Length > 0)
        {
            // Strip stray double quotes from the charset name.
            charSet = charSet.Replace("\"", "");
        }

        if (UseUTF8CharSet && charSet.Length == 0)
        {
            charSet = "utf-8";
        }

        if (charSet.Length > 0)
        {
            // GetEncoding throws for an unknown name; the catch below turns that into "error".
            Encoding pageEncoding = Encoding.GetEncoding(charSet);

            // Re-decode only when the page encoding differs from the one already used.
            if (!pageEncoding.Equals(Encoding.Default))
            {
                pageText = pageEncoding.GetString(pageBytes);
            }
        }

        return pageText;
    }
    catch (Exception)
    {
        // Deliberate best-effort contract: callers receive the sentinel "error"
        // instead of an exception.
        return "error";
    }
}
复制代码
, w2 h# ^& o+ E# d' ]
5 Q5 c" Y# U9 f9 Y |
|