|
|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Crawl all links on a page
/// <summary>
/// Matches anchor tags, capturing the href value in the "url" group and the
/// inner markup in the "text" group. Built once and reused across calls.
/// </summary>
private static readonly Regex AnchorHrefRegex = new Regex(
    @"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");

/// <summary>
/// Fetches the page at <paramref name="url"/> through <c>soso.getHtml</c> and
/// returns the href of every anchor tag found in the returned markup, with
/// relative links resolved against <paramref name="url"/>.
/// </summary>
/// <param name="url">Absolute address of the page to scan.</param>
/// <returns>
/// The links in document order, duplicates included. The list is empty when
/// the fetch yields the "error" sentinel, when <paramref name="url"/> is not an
/// absolute URI, or when an unexpected exception occurs. Never null.
/// </returns>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();
    try
    {
        string strhtml = soso.getHtml(url, "", true);
        if (strhtml == "error")
        {
            // NOTE(review): "error" is presumably the downloader's failure sentinel - confirm.
            return allHref;
        }

        // The base address is loop-invariant, so parse it once. Without a valid
        // absolute base no relative link can be resolved.
        Uri baseUri;
        if (!Uri.TryCreate(url, UriKind.Absolute, out baseUri))
        {
            return allHref;
        }

        foreach (Match m in AnchorHrefRegex.Matches(strhtml))
        {
            string href = m.Groups["url"].Value;

            // Links that already carry an http(s) scheme are kept verbatim.
            // Checking for "://" keeps relative paths such as "http-info.html"
            // from being mistaken for absolute URLs.
            bool isAbsoluteHttp =
                href.StartsWith("http://", StringComparison.Ordinal) ||
                href.StartsWith("https://", StringComparison.Ordinal);
            if (isAbsoluteHttp)
            {
                allHref.Add(href);
                continue;
            }

            // A single malformed href must not abort the scan of the remaining
            // links, so resolve without throwing and skip what cannot be resolved.
            Uri resolved;
            if (Uri.TryCreate(baseUri, href, out resolved))
            {
                allHref.Add(resolved.ToString());
            }
        }
    }
    catch (Exception ex)
    {
        // Deliberately best-effort: callers get whatever was collected so far.
        // The failure is reported to the debug output instead of vanishing.
        System.Diagnostics.Debug.WriteLine("GetAllHref failed for " + url + ": " + ex.Message);
    }
    return allHref;
}
#endregion
复制代码
#region Data de-duplication
/// <summary>
/// Removes duplicate strings from a list while keeping the first occurrence
/// of each value in its original position.
/// </summary>
/// <param name="list">Strings to de-duplicate; may be null or contain null entries.</param>
/// <returns>
/// A new list holding each distinct value once, in first-seen order.
/// Returns an empty list when <paramref name="list"/> is null.
/// </returns>
public static List<string> getUnqueList(List<string> list)
{
    List<string> unique = new List<string>();
    if (list == null)
    {
        return unique;
    }

    // HashSet accepts a null element (Hashtable keys do not), and Add reports
    // whether the value was new, so one lookup per item is enough.
    HashSet<string> seen = new HashSet<string>();
    foreach (string s in list)
    {
        if (seen.Add(s))
        {
            unique.Add(s);
        }
    }
    return unique;
}
#endregion
复制代码
|
|