|
|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Collect all links on a page

// Matches <a ... href=...>...</a> elements. Named group "url" captures the raw href value
// (with optional surrounding quotes stripped); named group "text" captures the anchor's
// inner markup. Built once and reused so the pattern is not re-parsed on every call.
private static readonly Regex AnchorHrefRegex = new Regex(
    @"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");

/// <summary>
/// Fetches the page at <paramref name="url"/> via <c>soso.getHtml</c> and returns the
/// target of every anchor found in it, in document order. Duplicates are not removed.
/// </summary>
/// <param name="url">Absolute address of the page to scan; also used as the base for
/// resolving relative hrefs.</param>
/// <returns>
/// One entry per usable anchor: hrefs that start with "http" are returned verbatim, all
/// others are resolved against <paramref name="url"/>. The list is empty when the fetch
/// yields the string "error" or when any exception occurs before links are collected.
/// This method never throws.
/// </returns>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();
    try
    {
        string strhtml = soso.getHtml(url, "", true);

        // NOTE(review): "error" is presumably the helper's failure sentinel - confirm
        // against soso.getHtml.
        if (strhtml == "error")
        {
            return allHref;
        }

        // The base address is loop-invariant, so parse it once.
        Uri baseUri = new Uri(url);

        foreach (Match m in AnchorHrefRegex.Matches(strhtml))
        {
            string href = m.Groups["url"].Value;

            // Resolve per link and skip the ones that cannot be parsed. Previously a
            // single malformed href threw out of the loop and discarded every link
            // that followed it.
            Uri resolved;
            if (!Uri.TryCreate(baseUri, href, out resolved))
            {
                continue;
            }

            // Already-absolute http(s) links are kept exactly as written in the page;
            // everything else is reported in its resolved, absolute form.
            if (href.StartsWith("http", StringComparison.Ordinal))
            {
                allHref.Add(href);
            }
            else
            {
                allHref.Add(resolved.ToString());
            }
        }
    }
    catch (Exception ex)
    {
        // Deliberate best-effort: callers get whatever was collected so far rather than
        // an exception. Leave a trace so failures are at least visible while debugging.
        System.Diagnostics.Debug.WriteLine(ex);
    }
    return allHref;
}

#endregion
复制代码
#region De-duplicate data

/// <summary>
/// Returns a new list containing the distinct strings of <paramref name="list"/>,
/// keeping the first occurrence of each value and preserving the original order.
/// Comparison is ordinal (case-sensitive). A null element is treated as an ordinary
/// value and is kept once.
/// </summary>
/// <param name="list">The strings to de-duplicate. The list itself is not modified.</param>
/// <returns>A new list with duplicates removed; empty when the input is empty.</returns>
/// <exception cref="ArgumentNullException"><paramref name="list"/> is null.</exception>
public static List<string> getUnqueList(List<string> list)
{
    if (list == null)
    {
        throw new ArgumentNullException("list");
    }

    List<string> unique = new List<string>();

    // HashSet accepts null members, unlike Hashtable whose ContainsKey(null) throws.
    HashSet<string> seen = new HashSet<string>(StringComparer.Ordinal);

    foreach (string s in list)
    {
        // Add returns false when the value is already present, so a single lookup
        // both tests and records membership.
        if (seen.Add(s))
        {
            unique.Add(s);
        }
    }

    return unique;
}

#endregion
复制代码
|
|