|
|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Fetch all links on a page

// Matches <a ... href=...>text</a>. Group "url" captures the href value and
// group "text" captures the anchor's inner text. Built once because the
// pattern never changes between calls.
private static readonly Regex AnchorHrefRegex = new Regex(
    @"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");

/// <summary>
/// Downloads the page at <paramref name="url"/> and returns the target of every
/// anchor (<c>&lt;a href=...&gt;</c>) found in it, in document order.
/// </summary>
/// <param name="url">Absolute address of the page to scan; also used as the base for resolving relative links.</param>
/// <returns>
/// The collected links. Hrefs that start with "http" are returned as written; all
/// others are resolved against <paramref name="url"/>. Duplicates are not removed.
/// The list is empty when the page could not be fetched, when <paramref name="url"/>
/// is not an absolute URI, or when the page has no anchors. Never null.
/// </returns>
/// <remarks>
/// This method is best-effort and does not throw: any failure is written to
/// <see cref="System.Diagnostics.Trace"/> and the links gathered so far are returned.
/// </remarks>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();
    try
    {
        string strhtml = soso.getHtml(url, "", true);

        // NOTE(review): "error" looks like the failure sentinel of soso.getHtml - confirm against that helper.
        if (string.IsNullOrEmpty(strhtml) || strhtml == "error")
        {
            return allHref;
        }

        // The base URI is the same for every link, so parse it once up front.
        Uri baseUri;
        if (!Uri.TryCreate(url, UriKind.Absolute, out baseUri))
        {
            return allHref;
        }

        foreach (Match m in AnchorHrefRegex.Matches(strhtml))
        {
            string href = m.Groups["url"].Value;

            // Skip an unparsable href instead of letting it abort the whole scan
            // and lose every link that follows it.
            Uri resolved;
            if (!Uri.TryCreate(baseUri, href, out resolved))
            {
                continue;
            }

            // Ordinal comparison: this is a protocol prefix, not linguistic text.
            string fullUrl = href.StartsWith("http", StringComparison.Ordinal)
                ? href
                : resolved.ToString();
            allHref.Add(fullUrl);
        }
    }
    catch (Exception ex)
    {
        // Keep the best-effort contract, but leave a trace instead of hiding the failure.
        System.Diagnostics.Trace.TraceWarning("GetAllHref failed for '{0}': {1}", url, ex.Message);
    }

    return allHref;
}

#endregion
复制代码

#region Deduplicate data

/// <summary>
/// Removes duplicate strings from a list, keeping the first occurrence of each
/// value and preserving the original order.
/// </summary>
/// <param name="list">The strings to deduplicate. May be null and may contain null elements.</param>
/// <returns>
/// A new <see cref="List{T}"/> holding each distinct value once (compared ordinally,
/// case-sensitive). Empty when <paramref name="list"/> is null or empty. The input
/// list is not modified.
/// </returns>
public static List<string> getUnqueList(List<string> list)
{
    List<string> unique = new List<string>();
    if (list == null)
    {
        return unique;
    }

    HashSet<string> seen = new HashSet<string>();
    foreach (string s in list)
    {
        // HashSet.Add returns false for a value already present, so one call
        // both tests and records membership. Unlike Hashtable, it accepts null.
        if (seen.Add(s))
        {
            unique.Add(s);
        }
    }

    return unique;
}

#endregion
复制代码

|
|