|
|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Crawl all links on a page
/// <summary>
/// Downloads the page at <paramref name="url"/> and collects the target of every
/// anchor element (&lt;a href=...&gt;) found in the returned HTML, in document order.
/// </summary>
/// <param name="url">
/// Absolute URL of the page to scan. It is also used as the base when resolving
/// relative hrefs.
/// </param>
/// <returns>
/// One entry per matched anchor (duplicates are kept). Hrefs that start with
/// "http" are returned verbatim; all others are resolved against
/// <paramref name="url"/>. Hrefs that cannot be parsed as a URI are skipped.
/// The list is empty when the page yields no content or when fetching/parsing fails.
/// </returns>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();
    try
    {
        string strhtml = soso.getHtml(url, "", true);

        // The literal "error" is treated as "no usable page content"
        // (presumably the failure sentinel of soso.getHtml - confirm against that helper).
        if (strhtml != "error")
        {
            Regex reg = new Regex(@"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");

            // The base URI does not change per link, so build it once.
            Uri baseUri = new Uri(url);

            foreach (Match m in reg.Matches(strhtml))
            {
                string href = m.Groups["url"].Value;

                // Resolve defensively: a single malformed href must not abort the
                // whole scan and silently drop every link that follows it.
                Uri resolved;
                if (!Uri.TryCreate(baseUri, href, out resolved))
                {
                    continue;
                }

                // Links that already look absolute are kept exactly as written in the
                // page; everything else is replaced by its resolved absolute form.
                string fullUrl = href.StartsWith("http", StringComparison.Ordinal)
                    ? href
                    : resolved.ToString();

                allHref.Add(fullUrl);
            }
        }
    }
    catch (Exception ex)
    {
        // Deliberately best-effort: callers get whatever was collected so far,
        // but the failure is no longer invisible.
        System.Diagnostics.Trace.TraceWarning("GetAllHref failed for '" + url + "': " + ex.Message);
    }

    return allHref;
}
#endregion
复制代码
5 o( P4 ]9 ~/ a6 w% l: j
" f5 G& p, p, J, J H) a
. [ [4 [' ^4 ~% s- # B7 [4 y( c3 r, G6 K* ?
- ; V& U8 I9 ]7 h0 |* E: y$ @
#region Deduplicate data
/// <summary>
/// Removes duplicate entries from a list of strings while keeping the order
/// in which each distinct value first appears.
/// </summary>
/// <param name="list">
/// The strings to de-duplicate. May contain null entries; a null list is
/// treated as empty.
/// </param>
/// <returns>
/// A new list holding the first occurrence of every distinct string
/// (ordinal, case-sensitive comparison). The input list is not modified.
/// </returns>
public static List<string> getUnqueList(List<string> list)
{
    List<string> unique = new List<string>();
    if (list == null)
    {
        return unique;
    }

    // HashSet accepts a null element, unlike Hashtable whose ContainsKey(null)
    // throws ArgumentNullException.
    HashSet<string> seen = new HashSet<string>();
    foreach (string s in list)
    {
        // Add returns false when the value is already present.
        if (seen.Add(s))
        {
            unique.Add(s);
        }
    }

    return unique;
}
#endregion
复制代码 ' M) J" h* m( `
]9 _" C) Z5 t/ H' P! \* o
|
|