|
|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Collect all links on a page
/// <summary>
/// Downloads the page at <paramref name="url"/> and returns the target of every
/// <c>&lt;a href=...&gt;...&lt;/a&gt;</c> element matched in its HTML.
/// </summary>
/// <param name="url">
/// Absolute address of the page to scan. It is also used as the base when
/// resolving relative links.
/// </param>
/// <returns>
/// The link targets in the order they were matched, duplicates included.
/// Targets that already start with "http" are returned exactly as written;
/// all others are resolved against <paramref name="url"/>. Targets that cannot
/// be resolved are skipped. The list is empty when the page could not be
/// fetched, and may be partial when processing failed part-way.
/// </returns>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();
    try
    {
        // NOTE(review): soso.getHtml is defined elsewhere; the only contract relied
        // on here is that it returns the literal "error" on failure. The meaning of
        // the second and third arguments should be confirmed against that helper.
        string strhtml = soso.getHtml(url, "", true);
        if (string.IsNullOrEmpty(strhtml) || strhtml == "error")
        {
            return allHref;
        }

        // The base URI never changes, so build it once instead of per match.
        Uri baseUri = new Uri(url);

        Regex reg = new Regex(@"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");
        foreach (Match m in reg.Matches(strhtml))
        {
            string href = m.Groups["url"].Value;

            // Already absolute: keep it exactly as the page wrote it.
            if (href.StartsWith("http", StringComparison.Ordinal))
            {
                allHref.Add(href);
                continue;
            }

            // TryCreate instead of the throwing constructor, so a single malformed
            // href only drops that link rather than aborting the whole scan.
            Uri resolved;
            if (Uri.TryCreate(baseUri, href, out resolved))
            {
                allHref.Add(resolved.ToString());
            }
        }
    }
    catch (Exception ex)
    {
        // Best-effort by design: callers still receive whatever was collected,
        // but the failure is recorded instead of being silently discarded.
        System.Diagnostics.Trace.TraceWarning("GetAllHref failed for '{0}': {1}", url, ex);
    }
    return allHref;
}
#endregion
复制代码

#region De-duplicate data
/// <summary>
/// Removes duplicate strings from a list, keeping the first occurrence of each
/// value and preserving the original order.
/// </summary>
/// <param name="list">
/// The strings to de-duplicate. May be <c>null</c> or contain <c>null</c>
/// elements; the input list itself is never modified.
/// </param>
/// <returns>
/// A new list holding each distinct value once (comparison is case-sensitive).
/// An empty list is returned when <paramref name="list"/> is <c>null</c> or empty.
/// </returns>
public static List<string> getUnqueList(List<string> list)
{
    List<string> unique = new List<string>();
    if (list == null)
    {
        return unique;
    }

    // HashSet.Add reports whether the value was new, so one lookup both tests
    // and records membership; unlike Hashtable it also accepts null entries.
    HashSet<string> seen = new HashSet<string>();
    foreach (string s in list)
    {
        if (seen.Add(s))
        {
            unique.Add(s);
        }
    }
    return unique;
}
#endregion
复制代码
|
|