|
|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Collect all links on a page
/// <summary>
/// Matches <c>&lt;a ... href=...&gt;...&lt;/a&gt;</c> elements. The <c>url</c> group holds the raw
/// href value and the <c>text</c> group holds the anchor's inner markup.
/// Built once and reused, because constructing a Regex re-parses the pattern every time.
/// </summary>
private static readonly Regex AnchorHrefRegex = new Regex(
    @"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");

/// <summary>
/// Downloads the page at <paramref name="url"/> via <c>soso.getHtml</c> and returns the href of
/// every anchor element found in it, with relative links resolved against <paramref name="url"/>.
/// </summary>
/// <param name="url">Address of the page to scan; also the base used to resolve relative links.</param>
/// <returns>
/// The links in document order, duplicates included. Never null. The list is empty when the
/// download yields "error", null or an empty string, or when <paramref name="url"/> is not an
/// absolute URI. If an unexpected exception occurs, the links collected so far are returned.
/// </returns>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();
    try
    {
        string strhtml = soso.getHtml(url, "", true);
        if (string.IsNullOrEmpty(strhtml) || strhtml == "error")
        {
            return allHref;
        }

        // The base URI does not change between links, so parse it once up front.
        Uri baseUri;
        if (!Uri.TryCreate(url, UriKind.Absolute, out baseUri))
        {
            return allHref;
        }

        foreach (Match m in AnchorHrefRegex.Matches(strhtml))
        {
            string href = m.Groups["url"].Value;

            // A single malformed href must not discard the remaining links of the page,
            // so resolve with TryCreate and skip entries that cannot be parsed.
            Uri resolved;
            if (!Uri.TryCreate(baseUri, href, out resolved))
            {
                continue;
            }

            // Links that are already absolute http(s) URLs are kept verbatim; everything else
            // is replaced by its resolved form. The scheme separator is checked explicitly so
            // that a relative path such as "httpd/index.html" is still resolved.
            bool isAbsoluteHttp =
                href.StartsWith("http://", StringComparison.Ordinal) ||
                href.StartsWith("https://", StringComparison.Ordinal);

            allHref.Add(isAbsoluteHttp ? href : resolved.ToString());
        }
    }
    catch (Exception ex)
    {
        // Best effort: callers always get a list back, but the failure is traced
        // instead of being silently discarded.
        System.Diagnostics.Trace.TraceWarning("GetAllHref failed for '{0}': {1}", url, ex);
    }

    return allHref;
}
#endregion
复制代码
#region De-duplicate data
/// <summary>
/// Removes duplicate entries from a list of strings, keeping the first occurrence of each
/// value and preserving the original order.
/// </summary>
/// <param name="list">The strings to de-duplicate. May be null and may contain null entries.</param>
/// <returns>
/// A new list holding each distinct value once (ordinal, case-sensitive comparison).
/// An empty list is returned when <paramref name="list"/> is null.
/// </returns>
public static List<string> getUnqueList(List<string> list)
{
    List<string> unique = new List<string>();
    if (list == null)
    {
        return unique;
    }

    // HashSet.Add reports whether the value was new, so one lookup per item is enough.
    // Unlike Hashtable, it also accepts null as a member.
    HashSet<string> seen = new HashSet<string>();
    foreach (string s in list)
    {
        if (seen.Add(s))
        {
            unique.Add(s);
        }
    }

    return unique;
}
#endregion
复制代码
|
|