|
|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Collect every link on a page
/// <summary>
/// Matches anchor tags and captures the href value in the <c>url</c> group
/// and the inner text in the <c>text</c> group. Built once and reused because
/// constructing a <see cref="Regex"/> on every call re-parses the pattern.
/// </summary>
private static readonly Regex AnchorHrefRegex = new Regex(
    @"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");

/// <summary>
/// Downloads the page at <paramref name="url"/> through <c>soso.getHtml</c> and
/// returns the target of every anchor tag found in it.
/// </summary>
/// <param name="url">Address of the page to scan; also used as the base for resolving relative links.</param>
/// <returns>
/// One entry per anchor, in document order (duplicates are kept). Links whose
/// href starts with "http" are returned verbatim; all others are resolved
/// against <paramref name="url"/>. The list is empty when the download yields
/// the "error" sentinel, when <paramref name="url"/> is not an absolute URI,
/// or when anything unexpected fails. This method never throws.
/// </returns>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();
    try
    {
        // NOTE(review): soso.getHtml is defined elsewhere; this code only relies on
        // it returning the literal "error" when the page could not be fetched.
        string strhtml = soso.getHtml(url, "", true);
        if (string.IsNullOrEmpty(strhtml) || strhtml == "error")
        {
            return allHref;
        }

        // The base URI is loop-invariant, so parse it once instead of per link.
        Uri baseUri;
        if (!Uri.TryCreate(url, UriKind.Absolute, out baseUri))
        {
            return allHref;
        }

        foreach (Match m in AnchorHrefRegex.Matches(strhtml))
        {
            string href = m.Groups["url"].Value;

            // A single malformed href must not abort the whole scan: skip it
            // and keep collecting the remaining links.
            Uri resolved;
            if (!Uri.TryCreate(baseUri, href, out resolved))
            {
                continue;
            }

            // Absolute http(s) links are kept exactly as written in the page;
            // everything else is returned in resolved form.
            bool isAbsolute = href.StartsWith("http", StringComparison.Ordinal);
            allHref.Add(isAbsolute ? href : resolved.ToString());
        }
    }
    catch (Exception ex)
    {
        // Best-effort contract: callers get whatever was collected so far, but
        // the failure is traced instead of being silently discarded.
        System.Diagnostics.Trace.TraceWarning("GetAllHref failed for '{0}': {1}", url, ex);
    }
    return allHref;
}
#endregion
复制代码
+ T3 ~, y. w4 v# d! a. s5 K5 _% b1 b
: [- j- y. F* }- 2 Q9 b% K9 B' }
#region Remove duplicates
/// <summary>
/// Returns a new list containing the distinct strings of <paramref name="list"/>,
/// keeping the first occurrence of each value and preserving the original order.
/// </summary>
/// <param name="list">Strings to de-duplicate; may be null and may contain null entries.</param>
/// <returns>
/// A new list with duplicates removed (comparison is case-sensitive). Empty when
/// <paramref name="list"/> is null or empty. The input list is not modified.
/// </returns>
public static List<string> getUnqueList(List<string> list)
{
    List<string> unique = new List<string>();
    if (list == null)
    {
        return unique;
    }

    // HashSet<string> accepts null entries, unlike Hashtable keys, so a null
    // element is treated as just another value instead of throwing.
    HashSet<string> seen = new HashSet<string>();
    foreach (string s in list)
    {
        // Add returns false when the value was already present.
        if (seen.Add(s))
        {
            unique.Add(s);
        }
    }
    return unique;
}
#endregion
复制代码
$ _9 c3 G3 z1 K2 s0 b3 ~ |
|