|
|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Collect all links on a page

/// <summary>
/// Matches anchor tags, capturing the href value in group "url" and the
/// inner text in group "text". Built once and reused so the pattern is not
/// re-parsed on every call.
/// </summary>
private static readonly Regex AnchorHrefRegex = new Regex(
    @"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>",
    RegexOptions.Compiled);

/// <summary>
/// Fetches the page at <paramref name="url"/> through soso.getHtml and
/// returns the target of every anchor tag found in the returned markup.
/// </summary>
/// <param name="url">
/// Absolute address of the page to scan. It is also used as the base when
/// resolving relative href values.
/// </param>
/// <returns>
/// One entry per usable anchor, in match order, duplicates included.
/// href values that start with "http" are returned verbatim; all others are
/// resolved against <paramref name="url"/>. The list is empty when the page
/// could not be fetched or <paramref name="url"/> is not an absolute URI.
/// This method is best-effort and does not throw: unexpected failures are
/// traced and whatever was collected so far is returned.
/// </returns>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();
    try
    {
        string strhtml = soso.getHtml(url, "", true);

        // The literal "error" is treated as the fetch-failure marker; a null
        // page is handled the same way so Regex.Matches never sees null.
        if (strhtml == null || strhtml == "error")
        {
            return allHref;
        }

        // Parse the base address once instead of once per matched link.
        Uri baseUri;
        if (!Uri.TryCreate(url, UriKind.Absolute, out baseUri))
        {
            return allHref;
        }

        foreach (Match m in AnchorHrefRegex.Matches(strhtml))
        {
            string href = m.Groups["url"].Value;

            // Ordinal comparison: this is a protocol prefix, not linguistic text.
            if (href.StartsWith("http", StringComparison.Ordinal))
            {
                allHref.Add(href);
                continue;
            }

            // Skip an href that cannot be resolved rather than letting one
            // malformed link abort the scan and lose every link after it.
            Uri resolved;
            if (Uri.TryCreate(baseUri, href, out resolved))
            {
                allHref.Add(resolved.ToString());
            }
        }
    }
    catch (Exception ex)
    {
        // Keep the best-effort contract, but leave a trace instead of
        // discarding the failure silently.
        System.Diagnostics.Trace.TraceWarning("GetAllHref failed for '{0}': {1}", url, ex);
    }

    return allHref;
}

#endregion
复制代码
0 `6 O0 l- F7 K2 r; T d I* v1 r
# `7 R" h+ W- ]; S# M- 4 b0 ?( A- }3 p# Q8 Z" i( Q
- 5 a- k n/ M. ~/ D
#region De-duplicate data

/// <summary>
/// Returns the distinct strings of <paramref name="list"/>, keeping each
/// value at the position of its first occurrence.
/// </summary>
/// <param name="list">
/// Strings to de-duplicate. May be null and may contain null entries; a
/// null entry is treated as an ordinary value and kept once.
/// </param>
/// <returns>
/// A new list holding each distinct value exactly once, in first-seen order.
/// Empty when <paramref name="list"/> is null or empty. The input list is
/// not modified.
/// </returns>
public static List<string> getUnqueList(List<string> list)
{
    List<string> unique = new List<string>();
    if (list == null)
    {
        return unique;
    }

    // HashSet<string> accepts null members, whereas Hashtable rejects null
    // keys with ArgumentNullException. Both compare strings by value.
    HashSet<string> seen = new HashSet<string>();
    foreach (string s in list)
    {
        // Add returns false when the value is already present, so a single
        // call both tests membership and records the value.
        if (seen.Add(s))
        {
            unique.Add(s);
        }
    }

    return unique;
}

#endregion
复制代码
6 }1 p- Q P, @( }5 f0 Z |
|