|
|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Crawl all links on a page
// Matches <a ... href=...>text</a> (case-insensitive, dot matches newline).
// The href value is captured in the "url" group and the anchor's inner text in "text".
// Built once and reused across calls instead of being reconstructed per call.
private static readonly Regex AnchorHrefRegex = new Regex(
    @"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");

/// <summary>
/// Downloads the page at <paramref name="url"/> via <c>soso.getHtml</c> and returns the
/// target of every anchor tag found in it.
/// </summary>
/// <param name="url">Absolute URL of the page to scan; also the base for resolving relative links.</param>
/// <returns>
/// One entry per matched anchor, in document order (duplicates are kept).
/// Links that start with <c>http://</c> or <c>https://</c> are returned as written;
/// every other link is resolved against <paramref name="url"/>.
/// An empty list is returned when <paramref name="url"/> is not an absolute URI,
/// when the download yields <c>null</c> or the string "error", or when an exception occurs.
/// </returns>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();

    // Relative links cannot be resolved without an absolute base,
    // so an unusable url yields an empty result up front.
    Uri baseUri;
    if (!Uri.TryCreate(url, UriKind.Absolute, out baseUri))
    {
        return allHref;
    }

    try
    {
        string strhtml = soso.getHtml(url, "", true);

        // "error" is treated as the failed-download marker of soso.getHtml.
        if (strhtml == null || strhtml == "error")
        {
            return allHref;
        }

        foreach (Match m in AnchorHrefRegex.Matches(strhtml))
        {
            string rawHref = m.Groups["url"].Value;

            // Skip a malformed href instead of letting it abort the whole scan.
            Uri resolved;
            if (!Uri.TryCreate(baseUri, rawHref, out resolved))
            {
                continue;
            }

            // Require the scheme separator so that relative paths which merely begin
            // with "http" (e.g. "http-guide.html") are still resolved against the base.
            bool isAbsoluteHttp =
                rawHref.StartsWith("http://", StringComparison.Ordinal) ||
                rawHref.StartsWith("https://", StringComparison.Ordinal);

            allHref.Add(isAbsoluteHttp ? rawHref : resolved.ToString());
        }
    }
    catch (Exception ex)
    {
        // Best-effort API: callers get whatever was collected so far,
        // but the failure is recorded rather than silently discarded.
        System.Diagnostics.Trace.TraceWarning("GetAllHref failed for " + url + ": " + ex.Message);
    }

    return allHref;
}
#endregion
复制代码
#region Deduplicate data
/// <summary>
/// Removes duplicate strings from a list while preserving first-occurrence order.
/// </summary>
/// <param name="list">The list to deduplicate; it is not modified. May be <c>null</c>.</param>
/// <returns>
/// A new list holding each distinct value of <paramref name="list"/> once, in the order
/// of first appearance. Comparison is case-sensitive and ordinal. A <c>null</c> element
/// is kept (once); a <c>null</c> <paramref name="list"/> yields an empty list.
/// </returns>
public static List<string> getUnqueList(List<string> list)
{
    List<string> unique = new List<string>();
    if (list == null)
    {
        return unique;
    }

    // HashSet accepts null elements, unlike Hashtable keys.
    // Add returns false when the value was already present.
    HashSet<string> seen = new HashSet<string>();
    foreach (string s in list)
    {
        if (seen.Add(s))
        {
            unique.Add(s);
        }
    }

    return unique;
}
#endregion
复制代码
|
|