|
|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Collect all links on a page
/// <summary>
/// Downloads the page at <paramref name="url"/> via <c>soso.getHtml</c> and returns the
/// target of every <c>&lt;a href=...&gt;</c> element found in the returned HTML.
/// </summary>
/// <param name="url">
/// Address of the page to scan. It is also used as the base when resolving relative hrefs,
/// so it must be an absolute URI.
/// </param>
/// <returns>
/// The links in document order, duplicates included. An href that starts with "http" is
/// returned exactly as written; any other href is resolved against <paramref name="url"/>.
/// Hrefs that cannot be resolved to a URI are skipped. The list is empty when the download
/// yields no HTML or the literal "error", and holds whatever was collected so far if an
/// unexpected failure occurs.
/// </returns>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();
    try
    {
        // NOTE(review): "error" appears to be getHtml's failure sentinel - confirm against soso.getHtml.
        string strhtml = soso.getHtml(url, "", true);
        if (string.IsNullOrEmpty(strhtml) || strhtml == "error")
        {
            return allHref;
        }

        // The base URI does not change per match, so build it once.
        Uri baseUri = new Uri(url);

        Regex reg = new Regex(@"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");
        foreach (Match m in reg.Matches(strhtml))
        {
            string href = m.Groups["url"].Value;

            // One malformed href must not abort the scan and lose every link after it.
            Uri resolved;
            if (!Uri.TryCreate(baseUri, href, out resolved))
            {
                continue;
            }

            // Links that already look absolute are kept verbatim; everything else is completed.
            string fullUrl = href.StartsWith("http", StringComparison.Ordinal)
                ? href
                : resolved.ToString();
            allHref.Add(fullUrl);
        }
    }
    catch (Exception ex)
    {
        // Best effort: keep returning what was collected, but leave a trace instead of hiding the failure.
        System.Diagnostics.Trace.TraceWarning("GetAllHref failed for '{0}': {1}", url, ex);
    }
    return allHref;
}
#endregion
复制代码
2 B* y; Z# w$ X: v1 w) b8 X- s& H. e5 x5 a/ N* ]9 _: z' V
- " X, N- i3 M+ S' P" _7 r0 Y- r: @. R
8 |7 H# ]! m% y% ~- 9 N& g+ L; g# n) U
#region Remove duplicates
/// <summary>
/// Removes duplicate entries from a list of strings, keeping the first occurrence of
/// each value and preserving the original order.
/// </summary>
/// <param name="list">
/// The strings to de-duplicate. The list itself is not modified. May be <c>null</c>,
/// and may contain <c>null</c> elements (a <c>null</c> element is kept once).
/// </param>
/// <returns>
/// A new list holding each distinct string once, compared with the default
/// (ordinal, case-sensitive) string equality. Empty when <paramref name="list"/> is <c>null</c>.
/// </returns>
public static List<string> getUnqueList(List<string> list)
{
    List<string> unique = new List<string>();
    if (list == null)
    {
        return unique;
    }

    HashSet<string> seen = new HashSet<string>();
    foreach (string s in list)
    {
        // HashSet.Add returns false when the value is already present, so one call
        // both tests membership and records the value.
        if (seen.Add(s))
        {
            unique.Add(s);
        }
    }
    return unique;
}
#endregion
复制代码
+ r: w1 @. t; M
; S" i' j" q2 Q8 ^" o# O |
|