|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Crawl all links on a page

// Matches <a ... href=...>inner</a>. The "url" group captures the href value and the
// "text" group the anchor's inner markup. Built once because the pattern never changes.
private static readonly Regex AnchorHrefRegex = new Regex(
    @"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");

/// <summary>
/// Fetches the page at <paramref name="url"/> through <c>soso.getHtml</c> and returns the
/// href target of every anchor element found in the returned markup.
/// </summary>
/// <param name="url">
/// Absolute address of the page to scan. It is also the base against which relative hrefs
/// are resolved.
/// </param>
/// <returns>
/// One entry per matched anchor, in document order. Hrefs that start with "http" are returned
/// verbatim; all others are resolved against <paramref name="url"/>. Duplicates are kept.
/// The list is empty (never null) when <paramref name="url"/> is not an absolute URI, when
/// the fetch yields no markup or the literal "error", or when the fetch throws.
/// </returns>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();

    // The base URI is loop-invariant; without a valid one no link can be resolved.
    Uri baseUri;
    if (!Uri.TryCreate(url, UriKind.Absolute, out baseUri))
    {
        return allHref;
    }

    try
    {
        string strhtml = soso.getHtml(url, "", true);

        // NOTE(review): "error" is presumably the failure marker returned by soso.getHtml - confirm.
        if (string.IsNullOrEmpty(strhtml) || strhtml == "error")
        {
            return allHref;
        }

        foreach (Match m in AnchorHrefRegex.Matches(strhtml))
        {
            string href = m.Groups["url"].Value;

            if (href.StartsWith("http", StringComparison.Ordinal))
            {
                // Already absolute: keep the text exactly as it appears in the page.
                allHref.Add(href);
                continue;
            }

            // Skip an href that cannot be resolved instead of aborting the whole scan.
            Uri resolved;
            if (Uri.TryCreate(baseUri, href, out resolved))
            {
                allHref.Add(resolved.ToString());
            }
        }
    }
    catch (Exception ex)
    {
        // Best-effort contract: callers get whatever was collected so far, but the failure
        // is traced rather than silently discarded.
        System.Diagnostics.Trace.TraceWarning("GetAllHref failed for " + url + ": " + ex.Message);
    }

    return allHref;
}

#endregion
复制代码
' z, ~+ r" ?0 J5 P; Y6 y/ Z" D
- }0 b- R' u! t( D4 v0 F4 F
1 P4 g* p8 e* }: O6 r8 _! A! p, \
% n5 U/ e5 ?9 E3 R; v) y- 9 {" T9 `7 q5 Q9 H8 O6 F( z* J
#region Deduplicate data

/// <summary>
/// Returns a new list containing the distinct strings of <paramref name="list"/>, keeping
/// the first occurrence of each value and preserving the original order.
/// </summary>
/// <param name="list">
/// Strings to deduplicate. The list itself is not modified. It may be null and may contain
/// null elements (a null element is treated as one more distinct value).
/// </param>
/// <returns>
/// A new list of the distinct values; empty when <paramref name="list"/> is null or empty.
/// Comparison is ordinal and case-sensitive.
/// </returns>
public static List<string> getUnqueList(List<string> list)
{
    List<string> unique = new List<string>();
    if (list == null)
    {
        return unique;
    }

    // HashSet.Add reports whether the value was new, so one lookup per element suffices.
    // Unlike Hashtable it also accepts null, so a null entry no longer throws.
    HashSet<string> seen = new HashSet<string>();
    foreach (string item in list)
    {
        if (seen.Add(item))
        {
            unique.Add(item);
        }
    }

    return unique;
}

#endregion
复制代码 0 X( ~6 a( S/ @
: G) A P# n& u |
|