|
|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Extract all links from a page
/// <summary>
/// Matches an anchor element, capturing its href value in the "url" group and
/// its inner markup in the "text" group. Built once and reused across calls.
/// </summary>
private static readonly Regex AnchorHrefRegex = new Regex(@"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");

/// <summary>
/// Fetches the page at <paramref name="url"/> through <c>soso.getHtml</c> and
/// collects the href target of every anchor element found in the markup.
/// </summary>
/// <param name="url">Address of the page to scan; also the base against which non-"http" hrefs are resolved.</param>
/// <returns>
/// The links in document order (duplicates are kept). Hrefs that start with
/// "http" are returned verbatim; all others are resolved against
/// <paramref name="url"/>. The list is empty when the fetch yields "error",
/// when <paramref name="url"/> is not an absolute URI, or when nothing matches.
/// </returns>
/// <remarks>
/// Best-effort: this method does not throw. An href that cannot be resolved
/// is skipped on its own instead of aborting the whole scan, and any
/// unexpected failure is written to the trace listeners before the links
/// gathered so far are returned.
/// </remarks>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();
    try
    {
        string strhtml = soso.getHtml(url, "", true);

        // The fetch helper's "error" result is treated as "no page available"
        // (presumably its failure sentinel; confirm against soso.getHtml).
        if (strhtml == null || strhtml == "error")
        {
            return allHref;
        }

        // The base URI is the same for every link, so build it once.
        Uri baseUri;
        if (!Uri.TryCreate(url, UriKind.Absolute, out baseUri))
        {
            return allHref;
        }

        foreach (Match m in AnchorHrefRegex.Matches(strhtml))
        {
            string href = m.Groups["url"].Value;

            // Links that already carry an http/https scheme are kept as written.
            if (href.StartsWith("http", StringComparison.Ordinal))
            {
                allHref.Add(href);
                continue;
            }

            // Resolve relative links; skip only the ones that cannot be parsed.
            Uri resolved;
            if (Uri.TryCreate(baseUri, href, out resolved))
            {
                allHref.Add(resolved.ToString());
            }
        }
    }
    catch (Exception ex)
    {
        // Keep the best-effort contract but leave a trace instead of hiding the failure.
        System.Diagnostics.Trace.TraceWarning("GetAllHref failed for " + url + ": " + ex.Message);
    }
    return allHref;
}
#endregion
复制代码
#region Deduplicate data
/// <summary>
/// Removes duplicate entries from a list of strings.
/// </summary>
/// <param name="list">The strings to filter. May be <c>null</c> or contain <c>null</c> entries.</param>
/// <returns>
/// A new list holding the first occurrence of each distinct string, in the
/// order it appeared in <paramref name="list"/>. Comparison is ordinal and
/// case-sensitive. An empty list is returned when <paramref name="list"/> is
/// <c>null</c>. The input list is not modified.
/// </returns>
public static List<string> getUnqueList(List<string> list)
{
    List<string> unique = new List<string>();
    if (list == null)
    {
        return unique;
    }

    // HashSet<string>.Add reports whether the value was new, so a single
    // lookup both tests and records membership; it also accepts null entries.
    HashSet<string> seen = new HashSet<string>();
    foreach (string s in list)
    {
        if (seen.Add(s))
        {
            unique.Add(s);
        }
    }
    return unique;
}
#endregion
复制代码
|
|