|
|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Crawl all links on a page

/// <summary>
/// Matches <c>&lt;a ... href=...&gt;...&lt;/a&gt;</c> elements (case-insensitive, dot matches
/// newline). The <c>url</c> group captures the href value and the <c>text</c> group the
/// anchor's inner content. Built once and shared, since the pattern never changes.
/// </summary>
private static readonly Regex AnchorRegex = new Regex(
    @"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");

/// <summary>
/// Fetches the page at <paramref name="url"/> and returns the href of every anchor
/// element found in it, with relative links resolved against <paramref name="url"/>.
/// </summary>
/// <param name="url">Absolute address of the page to scan.</param>
/// <returns>
/// The links in document order (duplicates are kept). The list is empty when
/// <paramref name="url"/> is not a valid absolute URI or the page could not be fetched;
/// it may be partial if an unexpected error interrupts processing. Never <c>null</c>.
/// </returns>
/// <remarks>
/// This method is best-effort and does not throw: failures are written to the debug
/// output instead of being propagated.
/// </remarks>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();

    // The base URI is the same for every link, so parse it once up front.
    // Without a valid absolute base, relative links cannot be resolved.
    Uri baseUri;
    if (!Uri.TryCreate(url, UriKind.Absolute, out baseUri))
    {
        return allHref;
    }

    try
    {
        // NOTE(review): soso.getHtml is defined elsewhere; the check below treats the
        // literal "error" as its failure marker. The meaning of the "" and true
        // arguments is not visible from here.
        string strhtml = soso.getHtml(url, "", true);
        if (string.IsNullOrEmpty(strhtml) || strhtml == "error")
        {
            return allHref;
        }

        foreach (Match m in AnchorRegex.Matches(strhtml))
        {
            string href = m.Groups["url"].Value;

            // Links that already start with "http" are kept exactly as written.
            if (href.StartsWith("http", StringComparison.Ordinal))
            {
                allHref.Add(href);
                continue;
            }

            // Resolve everything else against the page address. A malformed href is
            // skipped on its own instead of aborting the rest of the page.
            Uri resolved;
            if (Uri.TryCreate(baseUri, href, out resolved))
            {
                allHref.Add(resolved.ToString());
            }
        }
    }
    catch (Exception ex)
    {
        // Deliberately best-effort: return whatever was collected so far, but leave a
        // trace so the failure is not completely invisible.
        System.Diagnostics.Debug.WriteLine("GetAllHref failed for '" + url + "': " + ex);
    }

    return allHref;
}

#endregion
复制代码

#region Deduplicate data

/// <summary>
/// Returns a new list containing the distinct strings of <paramref name="list"/>,
/// keeping the first occurrence of each value in its original order.
/// </summary>
/// <param name="list">
/// The strings to de-duplicate. May be <c>null</c> or contain <c>null</c> elements.
/// </param>
/// <returns>
/// A new list without duplicates (compared with ordinal, case-sensitive string equality).
/// Empty when <paramref name="list"/> is <c>null</c> or empty. The input list is not modified.
/// </returns>
public static List<string> getUnqueList(List<string> list)
{
    List<string> unique = new List<string>();
    if (list == null)
    {
        return unique;
    }

    // HashSet accepts null elements, so a null entry in the input is treated as an
    // ordinary value instead of raising an exception.
    HashSet<string> seen = new HashSet<string>();
    foreach (string s in list)
    {
        // Add returns false when the value is already present.
        if (seen.Add(s))
        {
            unique.Add(s);
        }
    }

    return unique;
}

#endregion
复制代码
|
|