|
|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Crawl all links on a page
/// <summary>
/// Downloads the page at <paramref name="url"/> via <c>soso.getHtml</c> and returns the
/// target of every <c>&lt;a href=...&gt;</c> element found in it, in document order.
/// </summary>
/// <param name="url">
/// Absolute address of the page to scan. It is also the base against which relative
/// links are resolved.
/// </param>
/// <returns>
/// The collected links; duplicates are kept. The list is empty when
/// <paramref name="url"/> is not an absolute URI, when the download yields the
/// sentinel string <c>"error"</c>, or when the download throws.
/// </returns>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();

    // The base URI is the same for every link, so parse it once up front.
    // A URL that cannot serve as a base can never yield resolvable links.
    Uri baseUri;
    if (!Uri.TryCreate(url, UriKind.Absolute, out baseUri))
    {
        return allHref;
    }

    try
    {
        string strhtml = soso.getHtml(url, "", true);
        if (strhtml == null || strhtml == "error")
        {
            return allHref;
        }

        // Group "url" captures the href value (optionally quoted); group "text" captures the anchor's inner markup.
        Regex reg = new Regex(@"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");

        foreach (Match m in reg.Matches(strhtml))
        {
            string href = m.Groups["url"].Value;

            // Links that already carry an http/https scheme are kept exactly as written.
            // Testing for the full "scheme://" prefix avoids mistaking a relative path
            // such as "http-guide.html" for an absolute URL.
            if (href.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
                href.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
            {
                allHref.Add(href);
                continue;
            }

            // Resolve everything else against the page address. A malformed href is
            // skipped on its own instead of aborting the scan of the remaining links.
            Uri resolved;
            if (Uri.TryCreate(baseUri, href, out resolved))
            {
                allHref.Add(resolved.ToString());
            }
        }
    }
    catch (Exception ex)
    {
        // Best effort by design: callers receive whatever was collected so far.
        // The failure is traced rather than silently discarded.
        System.Diagnostics.Trace.TraceWarning("GetAllHref failed for " + url + ": " + ex.Message);
    }

    return allHref;
}
#endregion
复制代码
#region De-duplication
/// <summary>
/// Removes duplicate strings from a list while keeping the first occurrence of each
/// value in its original position.
/// </summary>
/// <param name="list">The strings to filter. The list itself is not modified.</param>
/// <returns>
/// A new list holding each distinct value of <paramref name="list"/> once, in order of
/// first appearance. Comparison is ordinal and case-sensitive. A <c>null</c> element
/// counts as a value and is kept once.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="list"/> is <c>null</c>.</exception>
public static List<string> getUnqueList(List<string> list)
{
    if (list == null)
    {
        throw new ArgumentNullException("list");
    }

    List<string> unique = new List<string>(list.Count);

    // A typed set replaces the former Hashtable: a single lookup per element, and
    // null elements are accepted instead of raising ArgumentNullException.
    HashSet<string> seen = new HashSet<string>();
    foreach (string s in list)
    {
        // Add returns false when the value was already present.
        if (seen.Add(s))
        {
            unique.Add(s);
        }
    }

    return unique;
}
#endregion
复制代码
|
|