|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Collect every link on a page
// Matches <a ... href=...>text</a>. The "url" group is the raw href value and the
// "text" group is the anchor's inner markup. Built once and reused across calls.
private static readonly Regex AnchorHrefRegex = new Regex(
    @"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>",
    RegexOptions.Compiled);

/// <summary>
/// Downloads the page at <paramref name="url"/> and returns the target of every
/// anchor tag found in it, in document order (duplicates are kept).
/// </summary>
/// <param name="url">Absolute address of the page to scan; also used as the base
/// for resolving relative links.</param>
/// <returns>
/// Links that already start with "http" are returned verbatim; every other link is
/// resolved against <paramref name="url"/>. The list is empty when the page cannot
/// be fetched, when the download helper reports "error", or when
/// <paramref name="url"/> is not a valid absolute URI. Never null.
/// </returns>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();

    string strhtml;
    try
    {
        strhtml = soso.getHtml(url, "", true);
    }
    catch (Exception)
    {
        // Best effort: a failed download yields an empty result, as it always has.
        return allHref;
    }

    if (strhtml == null || strhtml == "error")
    {
        return allHref;
    }

    // Parse the base address once instead of once per match.
    Uri baseUri;
    if (!Uri.TryCreate(url, UriKind.Absolute, out baseUri))
    {
        return allHref;
    }

    foreach (Match m in AnchorHrefRegex.Matches(strhtml))
    {
        string href = m.Groups["url"].Value;

        if (href.StartsWith("http", StringComparison.Ordinal))
        {
            // Already absolute: keep it exactly as written in the page.
            allHref.Add(href);
            continue;
        }

        // Skip a malformed relative link instead of abandoning the rest of the page.
        Uri resolved;
        if (Uri.TryCreate(baseUri, href, out resolved))
        {
            allHref.Add(resolved.ToString());
        }
    }

    return allHref;
}
#endregion
复制代码
#region De-duplicate data
/// <summary>
/// Removes duplicate strings from a list, keeping the first occurrence of each
/// value and preserving the original order.
/// </summary>
/// <param name="list">The strings to de-duplicate. May contain null entries; a
/// null list is treated as empty.</param>
/// <returns>A new list holding each distinct value once (ordinal, case-sensitive
/// comparison). The input list is not modified.</returns>
public static List<string> getUnqueList(List<string> list)
{
    List<string> unique = new List<string>();
    if (list == null)
    {
        return unique;
    }

    // HashSet.Add returns false for a value it already holds, so one lookup both
    // tests and records membership; unlike Hashtable it also accepts null.
    HashSet<string> seen = new HashSet<string>();
    foreach (string s in list)
    {
        if (seen.Add(s))
        {
            unique.Add(s);
        }
    }

    return unique;
}
#endregion
复制代码
|
|