|
|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Crawl all links on a page

// Matches an anchor element with an href attribute. Group "url" is the href value
// (optionally quoted), group "text" is the markup between the opening and closing tags.
// Built once and reused because the pattern never changes between calls.
private static readonly Regex AnchorHrefRegex = new Regex(@"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");

/// <summary>
/// Collects the href target of every anchor found in the HTML that
/// <c>soso.getHtml</c> returns for <paramref name="url"/>.
/// </summary>
/// <param name="url">
/// Page address passed to <c>soso.getHtml</c>; it is also the base against which
/// relative hrefs are resolved.
/// </param>
/// <returns>
/// One entry per matched anchor, in match order; duplicates are kept. An href that
/// starts with "http" is returned exactly as written, any other href is resolved
/// against <paramref name="url"/>. Hrefs that cannot be resolved to a URI are skipped.
/// The list is empty when <c>soso.getHtml</c> returns null or "error", when
/// <paramref name="url"/> is not an absolute URI, or when an unexpected exception occurs.
/// </returns>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();
    try
    {
        string strhtml = soso.getHtml(url, "", true);

        // "error" is the value this method treats as a failed fetch.
        if (strhtml == null || strhtml == "error")
        {
            return allHref;
        }

        // The base URI is the same for every link, so parse it once; without a
        // valid absolute base no href can be resolved.
        Uri baseUri;
        if (!Uri.TryCreate(url, UriKind.Absolute, out baseUri))
        {
            return allHref;
        }

        foreach (Match m in AnchorHrefRegex.Matches(strhtml))
        {
            string href = m.Groups["url"].Value;

            // A single malformed href must not discard the rest of the page's links.
            Uri resolved;
            if (!Uri.TryCreate(baseUri, href, out resolved))
            {
                continue;
            }

            // Ordinal comparison: this is a protocol prefix, not linguistic text.
            bool alreadyAbsolute = href.StartsWith("http", StringComparison.Ordinal);
            allHref.Add(alreadyAbsolute ? href : resolved.ToString());
        }
    }
    catch (Exception ex)
    {
        // Best-effort crawl: keep whatever was collected so far, but leave a trace
        // of the failure instead of swallowing it silently.
        System.Diagnostics.Trace.TraceWarning("GetAllHref failed for '{0}': {1}", url, ex);
    }

    return allHref;
}
#endregion
复制代码

#region Deduplicate data
/// <summary>
/// Removes duplicate strings from a list, keeping the first occurrence of each
/// value and preserving the original order.
/// </summary>
/// <param name="list">
/// Strings to deduplicate. May be null or contain null entries; a null entry is
/// treated as one more distinct value.
/// </param>
/// <returns>
/// A new list holding each distinct value once (case-sensitive, ordinal equality).
/// Empty when <paramref name="list"/> is null or empty. The input list is not modified.
/// </returns>
public static List<string> getUnqueList(List<string> list)
{
    List<string> unique = new List<string>();
    if (list == null)
    {
        return unique;
    }

    // HashSet.Add reports whether the value was new, so one lookup both tests
    // and records membership; unlike Hashtable it also accepts null.
    HashSet<string> seen = new HashSet<string>();
    foreach (string s in list)
    {
        if (seen.Add(s))
        {
            unique.Add(s);
        }
    }

    return unique;
}
#endregion
复制代码

|
|