|
|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Collect all links on a page
// Matches <a ... href=...>...</a>. Named groups: "url" is the raw href value,
// "text" is the markup between the opening and closing tag.
// Built once and reused because the pattern never changes between calls.
private static readonly Regex AnchorHrefRegex = new Regex(@"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");

/// <summary>
/// Fetches the page at <paramref name="url"/> and returns the target of every
/// anchor (<c>&lt;a href=...&gt;</c>) found in it, in document order.
/// </summary>
/// <param name="url">
/// Absolute URL of the page to scan. It is also the base against which
/// relative links are resolved.
/// </param>
/// <returns>
/// One entry per anchor whose link could be resolved. Links that start with
/// "http" are returned exactly as written in the page; all others are resolved
/// against <paramref name="url"/>. Duplicates are not removed. The list is empty
/// when the page could not be fetched, when <paramref name="url"/> is not an
/// absolute URI, or when an unexpected error occurs.
/// </returns>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();
    try
    {
        // NOTE(review): soso.getHtml is defined elsewhere. From its use here it
        // appears to return the page HTML, or the literal "error" on failure - confirm.
        string strhtml = soso.getHtml(url, "", true);
        if (string.IsNullOrEmpty(strhtml) || strhtml == "error")
        {
            return allHref;
        }

        // The base URI is the same for every link, so parse it once up front.
        Uri baseUri;
        if (!Uri.TryCreate(url, UriKind.Absolute, out baseUri))
        {
            return allHref;
        }

        foreach (Match m in AnchorHrefRegex.Matches(strhtml))
        {
            string href = m.Groups["url"].Value;

            // Absolute http(s) links are kept verbatim, without normalization.
            if (href.StartsWith("http", StringComparison.Ordinal))
            {
                allHref.Add(href);
                continue;
            }

            // A single malformed href must not abort the scan: skip just that
            // link instead of throwing and losing every link that follows it.
            Uri resolved;
            if (Uri.TryCreate(baseUri, href, out resolved))
            {
                allHref.Add(resolved.ToString());
            }
        }
    }
    catch (Exception ex)
    {
        // Deliberate best effort: callers get whatever was collected so far.
        // The failure is still reported in debug builds so it is not invisible.
        System.Diagnostics.Debug.WriteLine("GetAllHref failed for '" + url + "': " + ex);
    }
    return allHref;
}
#endregion
复制代码

#region De-duplicate data
/// <summary>
/// Returns a new list holding the distinct strings of <paramref name="list"/>,
/// in the order of their first occurrence. Strings are compared with the
/// default (ordinal, case-sensitive) string equality.
/// </summary>
/// <param name="list">
/// The strings to de-duplicate. The list itself is not modified. May be null
/// and may contain null elements; a null element is treated as one more
/// distinct value.
/// </param>
/// <returns>
/// A new list without duplicates; empty when <paramref name="list"/> is null
/// or empty.
/// </returns>
public static List<string> getUnqueList(List<string> list)
{
    List<string> unique = new List<string>();
    if (list == null)
    {
        return unique;
    }

    // HashSet.Add reports whether the value was new, so one lookup per element
    // is enough, and unlike Hashtable it accepts null as a member.
    HashSet<string> seen = new HashSet<string>();
    foreach (string s in list)
    {
        if (seen.Add(s))
        {
            unique.Add(s);
        }
    }
    return unique;
}
#endregion
复制代码

|
|