|
|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Crawl all links on a page

// Matches <a ... href=...>text</a>. Group "url" is the raw href value and
// group "text" is the anchor's inner markup. Built once instead of per call.
private static readonly Regex AnchorHrefRegex = new Regex(
    @"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");

/// <summary>
/// Downloads the page at <paramref name="url"/> and returns the target of every
/// anchor tag found in it, with relative targets resolved against the page address.
/// </summary>
/// <param name="url">
/// Absolute address of the page to scan; also used as the base for relative hrefs.
/// </param>
/// <returns>
/// The links in document order (duplicates are kept). Hrefs that start with
/// "http" are returned exactly as written in the page; all others are returned
/// in resolved, absolute form. The list is empty when the page cannot be
/// downloaded or <paramref name="url"/> is not a valid absolute URI.
/// </returns>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();
    try
    {
        string strhtml = soso.getHtml(url, "", true);

        // NOTE(review): "error" is presumably soso.getHtml's failure sentinel - confirm.
        if (strhtml == null || strhtml == "error")
        {
            return allHref;
        }

        // The base address is the same for every link, so parse it once.
        Uri baseUri;
        if (!Uri.TryCreate(url, UriKind.Absolute, out baseUri))
        {
            return allHref;
        }

        foreach (Match m in AnchorHrefRegex.Matches(strhtml))
        {
            string href = m.Groups["url"].Value;

            // Skip a malformed href instead of letting it abort the whole scan.
            Uri resolved;
            if (!Uri.TryCreate(baseUri, href, out resolved))
            {
                continue;
            }

            // Ordinal comparison: URL schemes are not culture-sensitive text.
            bool alreadyAbsolute = href.StartsWith("http", StringComparison.Ordinal);
            allHref.Add(alreadyAbsolute ? href : resolved.ToString());
        }
    }
    catch (Exception ex)
    {
        // Best effort by design: callers get whatever was collected so far,
        // but the failure is no longer invisible.
        System.Diagnostics.Trace.TraceWarning(
            "GetAllHref failed for '" + url + "': " + ex.Message);
    }
    return allHref;
}

#endregion
复制代码
#region Data de-duplication

/// <summary>
/// Removes duplicate strings from a list while keeping the first occurrence
/// of each value in its original position.
/// </summary>
/// <param name="list">
/// The strings to de-duplicate. May be null and may contain null elements;
/// the input list itself is never modified.
/// </param>
/// <returns>
/// A new list holding each distinct value once, in first-seen order.
/// Comparison is ordinal and case-sensitive. Returns an empty list when
/// <paramref name="list"/> is null.
/// </returns>
public static List<string> getUnqueList(List<string> list)
{
    List<string> unique = new List<string>();
    if (list == null)
    {
        return unique;
    }

    // HashSet accepts null elements (Hashtable keys do not) and its Add
    // reports whether the value was new, so one lookup per item suffices.
    HashSet<string> seen = new HashSet<string>();
    foreach (string item in list)
    {
        if (seen.Add(item))
        {
            unique.Add(item);
        }
    }
    return unique;
}

#endregion
复制代码
|
|