|
|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Collect all links on a page

// Matches <a ... href=...>...</a>. The "url" group captures the href value and
// the "text" group captures the anchor's inner markup. Built once and reused
// instead of being re-created on every call.
private static readonly Regex AnchorHrefRegex = new Regex(@"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");

/// <summary>
/// Fetches the markup for <paramref name="url"/> through <c>soso.getHtml</c> and
/// returns the target of every anchor tag found in it.
/// </summary>
/// <param name="url">
/// Address of the page to scan. It is also used as the base against which
/// relative hrefs are resolved, so it must be an absolute URI.
/// </param>
/// <returns>
/// The links in document order, duplicates included. An href that starts with
/// "http" is returned verbatim; any other href is resolved against
/// <paramref name="url"/>. Hrefs that cannot be resolved are skipped. The list
/// is empty when <c>soso.getHtml</c> returns nothing or the literal "error",
/// when <paramref name="url"/> is not an absolute URI, or when an unexpected
/// exception occurs.
/// </returns>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();
    try
    {
        string strhtml = soso.getHtml(url, "", true);
        if (string.IsNullOrEmpty(strhtml) || strhtml == "error")
        {
            return allHref;
        }

        // The base URI does not change per link, so build it once.
        Uri baseUri;
        if (!Uri.TryCreate(url, UriKind.Absolute, out baseUri))
        {
            return allHref;
        }

        foreach (Match m in AnchorHrefRegex.Matches(strhtml))
        {
            string href = m.Groups["url"].Value;

            // A single malformed href must not abort the whole scan:
            // skip it and keep collecting the remaining links.
            Uri resolved;
            if (!Uri.TryCreate(baseUri, href, out resolved))
            {
                continue;
            }

            string fullUrl = href.StartsWith("http", StringComparison.Ordinal)
                ? href
                : resolved.ToString();
            allHref.Add(fullUrl);
        }
    }
    catch (Exception ex)
    {
        // Link collection is best-effort: keep whatever was gathered so far,
        // but leave a trace instead of swallowing the failure silently.
        System.Diagnostics.Debug.WriteLine("GetAllHref failed for " + url + ": " + ex);
    }
    return allHref;
}
#endregion
复制代码
#region Deduplicate data
/// <summary>
/// Removes duplicate strings from a list while keeping the first occurrence
/// of each value in its original position.
/// </summary>
/// <param name="list">The strings to deduplicate. Null elements are allowed.</param>
/// <returns>
/// A new list containing each distinct value of <paramref name="list"/> once,
/// in order of first appearance. The input list is not modified.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="list"/> is null.</exception>
public static List<string> getUnqueList(List<string> list)
{
    if (list == null)
    {
        throw new ArgumentNullException("list");
    }

    List<string> unique = new List<string>();
    // HashSet accepts null entries (Hashtable keys do not) and Add reports
    // whether the value was new, so a single lookup per element is enough.
    HashSet<string> seen = new HashSet<string>();
    foreach (string s in list)
    {
        if (seen.Add(s))
        {
            unique.Add(s);
        }
    }
    return unique;
}
#endregion
复制代码
|
|