|
|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Crawl all links on a page

/// <summary>
/// Matches an anchor element and captures its <c>href</c> value (group "url")
/// and its inner markup (group "text"). Built once and reused, because
/// constructing a <see cref="Regex"/> on every call re-parses the pattern.
/// </summary>
private static readonly Regex AnchorTagRegex = new Regex(
    @"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");

/// <summary>
/// Downloads the page at <paramref name="url"/> and returns the target of every
/// anchor tag found in it, in document order.
/// </summary>
/// <param name="url">
/// Address of the page to scan. It is also used as the base address for
/// resolving relative links, so it must be an absolute URI for any links
/// to be returned.
/// </param>
/// <returns>
/// One entry per anchor tag (duplicates are kept). Links whose href starts with
/// "http" are returned verbatim; all other links are resolved against
/// <paramref name="url"/>. Links that cannot be resolved to a URI are skipped.
/// The list is empty when the page cannot be fetched or any other error occurs;
/// this method does not throw.
/// </returns>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();
    try
    {
        // NOTE(review): soso.getHtml is defined elsewhere; the meaning of the
        // "" and true arguments is not visible here. It presumably returns the
        // literal "error" on failure -- confirm against its implementation.
        string strhtml = soso.getHtml(url, "", true);
        if (string.IsNullOrEmpty(strhtml) || strhtml == "error")
        {
            return allHref;
        }

        // The base address is loop-invariant, so parse it once. Without a
        // valid absolute base no link can be resolved.
        Uri baseUri;
        if (!Uri.TryCreate(url, UriKind.Absolute, out baseUri))
        {
            return allHref;
        }

        foreach (Match m in AnchorTagRegex.Matches(strhtml))
        {
            string href = m.Groups["url"].Value;

            // Links that already carry an http/https scheme are kept exactly
            // as written in the page.
            if (href.StartsWith("http", StringComparison.Ordinal))
            {
                allHref.Add(href);
                continue;
            }

            // TryCreate instead of the Uri constructor: a single malformed
            // href must not abort extraction of the remaining links.
            Uri resolved;
            if (Uri.TryCreate(baseUri, href, out resolved))
            {
                allHref.Add(resolved.ToString());
            }
        }
    }
    catch (Exception ex)
    {
        // Best-effort contract: callers receive whatever was collected so far.
        // Leave a trace so failures are not completely invisible.
        System.Diagnostics.Debug.WriteLine("GetAllHref failed for '" + url + "': " + ex);
    }
    return allHref;
}

#endregion
复制代码
#region Data deduplication

/// <summary>
/// Returns a new list containing the distinct strings of <paramref name="list"/>,
/// keeping the first occurrence of each value and preserving the original order.
/// Comparison is ordinal and case-sensitive. A null element is treated as a
/// value like any other and is kept once.
/// </summary>
/// <param name="list">The strings to deduplicate. The list itself is not modified.</param>
/// <returns>A new list with duplicates removed.</returns>
/// <exception cref="ArgumentNullException"><paramref name="list"/> is null.</exception>
public static List<string> getUnqueList(List<string> list)
{
    if (list == null)
    {
        throw new ArgumentNullException("list");
    }

    List<string> unique = new List<string>();
    HashSet<string> seen = new HashSet<string>();
    foreach (string s in list)
    {
        // HashSet.Add returns false when the value is already present, so a
        // single lookup both tests and records membership.
        if (seen.Add(s))
        {
            unique.Add(s);
        }
    }
    return unique;
}

#endregion
复制代码
|
|