|
|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Collect all links from a page
// Matches <a ... href=...>text</a>. The "url" group captures the href value and
// the "text" group captures the anchor's inner markup. Built once and reused,
// since the pattern never changes between calls.
private static readonly Regex AnchorHrefRegex = new Regex(@"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");

/// <summary>
/// Downloads the page at <paramref name="url"/> and returns the target of every
/// anchor tag found in it, in document order (duplicates are kept).
/// </summary>
/// <param name="url">Absolute address of the page to scan; also used as the base
/// for resolving relative links.</param>
/// <returns>
/// One entry per usable link. An href that starts with "http" is returned exactly as
/// written; any other href is resolved against <paramref name="url"/>. The list is
/// empty when the page cannot be fetched or <paramref name="url"/> is not an absolute URI.
/// </returns>
/// <remarks>
/// Best-effort: failures are traced and never propagate to the caller; whatever was
/// collected up to that point is returned.
/// </remarks>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();
    try
    {
        // NOTE(review): the meaning of the "" and true arguments is defined by
        // soso.getHtml, which is not visible here - confirm against that helper.
        string strhtml = soso.getHtml(url, "", true);

        // The literal "error" is treated as the fetch-failure sentinel.
        if (strhtml == null || strhtml == "error")
        {
            return allHref;
        }

        // The base address is the same for every link, so parse it once.
        Uri baseUri;
        if (!Uri.TryCreate(url, UriKind.Absolute, out baseUri))
        {
            return allHref;
        }

        foreach (Match m in AnchorHrefRegex.Matches(strhtml))
        {
            string href = m.Groups["url"].Value;

            // A malformed href must only cost us this one link, not the rest of the page.
            Uri resolved;
            if (!Uri.TryCreate(baseUri, href, out resolved))
            {
                continue;
            }

            // Links that are already absolute keep their original spelling.
            if (href.StartsWith("http", StringComparison.Ordinal))
            {
                allHref.Add(href);
            }
            else
            {
                allHref.Add(resolved.ToString());
            }
        }
    }
    catch (Exception ex)
    {
        // Deliberately best-effort, but leave a trace instead of failing silently.
        System.Diagnostics.Trace.WriteLine("GetAllHref failed for '" + url + "': " + ex.Message);
    }
    return allHref;
}
#endregion
复制代码
; O: h/ z$ C. ^: q4 w
2 Y( B, _/ X( `! [
6 ]" l. ]% H5 p3 g
6 H5 D# D; C D' r- 2 b" u W. |$ q3 z- n: o* g" P
#region De-duplicate data
/// <summary>
/// Removes duplicate strings from a list, keeping the first occurrence of each
/// value and preserving the original order.
/// </summary>
/// <param name="list">The list to de-duplicate. May be null and may contain null
/// entries; it is not modified.</param>
/// <returns>
/// A new list holding each distinct value once (compared with ordinal string
/// equality). Empty when <paramref name="list"/> is null or empty.
/// </returns>
public static List<string> getUnqueList(List<string> list)
{
    List<string> unique = new List<string>();
    if (list == null)
    {
        return unique;
    }

    // HashSet accepts a null element, unlike Hashtable keys, so a null entry
    // in the input no longer throws.
    HashSet<string> seen = new HashSet<string>();
    foreach (string s in list)
    {
        // Add returns false when the value is already present: one lookup per item.
        if (seen.Add(s))
        {
            unique.Add(s);
        }
    }
    return unique;
}
#endregion
复制代码
( i6 w* | c! M$ [6 Q S8 v+ I% o: e2 h
|
|