|
|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Collect all links on a page
/// <summary>
/// Matches <c>&lt;a ... href=...&gt;...&lt;/a&gt;</c> elements (case-insensitive, dot matches newline).
/// The href value is captured in the group <c>url</c> and the anchor's inner markup in <c>text</c>.
/// Built once and reused, because the pattern never changes between calls.
/// </summary>
private static readonly Regex AnchorHrefRegex = new Regex(
    @"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");

/// <summary>
/// Extracts the href of every anchor tag found in the HTML obtained for <paramref name="url"/>
/// and returns them as absolute URLs.
/// </summary>
/// <param name="url">
/// Absolute address of the page to scan. It is passed to <c>soso.getHtml</c> to obtain the markup
/// and is also used as the base for resolving relative links.
/// </param>
/// <returns>
/// The links in document order; duplicates are kept. The list is empty when the HTML could not be
/// obtained, when <paramref name="url"/> is not an absolute URI, or when the page has no anchors.
/// Never <c>null</c>.
/// </returns>
/// <remarks>
/// This method is best-effort: it never throws. Any unexpected failure is written to the debug
/// output and whatever was collected up to that point is returned.
/// </remarks>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();
    try
    {
        // NOTE(review): the meaning of the "" and true arguments is defined by soso.getHtml,
        // which is not visible here; they are passed through unchanged.
        string strhtml = soso.getHtml(url, "", true);

        // The literal string "error" is the failure sentinel this code checks for.
        if (string.IsNullOrEmpty(strhtml) || strhtml == "error")
        {
            return allHref;
        }

        // The base URI is the same for every link, so build it once instead of per match.
        Uri baseUri;
        if (!Uri.TryCreate(url, UriKind.Absolute, out baseUri))
        {
            return allHref;
        }

        foreach (Match m in AnchorHrefRegex.Matches(strhtml))
        {
            string href = m.Groups["url"].Value;

            // Already-absolute web links are kept exactly as written in the page.
            // The scheme separator is required so that relative paths which merely begin
            // with "http" (e.g. "http-docs/index.html") are still resolved below.
            if (href.StartsWith("http://", StringComparison.Ordinal)
                || href.StartsWith("https://", StringComparison.Ordinal))
            {
                allHref.Add(href);
                continue;
            }

            // Resolve everything else against the page address. A malformed href is skipped
            // on its own instead of aborting the scan of the remaining links.
            Uri resolved;
            if (Uri.TryCreate(baseUri, href, out resolved))
            {
                allHref.Add(resolved.ToString());
            }
        }
    }
    catch (Exception ex)
    {
        // Keep the best-effort contract, but leave a trace instead of hiding the failure.
        System.Diagnostics.Debug.WriteLine("GetAllHref failed for '" + url + "': " + ex);
    }
    return allHref;
}
#endregion
复制代码
#region Deduplicate data
/// <summary>
/// Returns a new list holding the distinct strings of <paramref name="list"/>,
/// in the order in which each value first appears.
/// </summary>
/// <param name="list">The strings to de-duplicate. The list itself is not modified.</param>
/// <returns>
/// A new list without repeated values. Strings are compared by ordinal, case-sensitive equality.
/// A <c>null</c> element is treated as an ordinary value and kept once.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="list"/> is <c>null</c>.</exception>
public static List<string> getUnqueList(List<string> list)
{
    if (list == null)
    {
        throw new ArgumentNullException("list");
    }

    List<string> unique = new List<string>();

    // HashSet accepts null (Hashtable keys do not) and Add reports whether the value was new,
    // so a single lookup both tests and records membership.
    HashSet<string> seen = new HashSet<string>();
    foreach (string s in list)
    {
        if (seen.Add(s))
        {
            unique.Add(s);
        }
    }
    return unique;
}
#endregion
复制代码
|
|