|
|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Crawl all links on a page
/// <summary>
/// Matches anchor tags. The <c>url</c> group captures the href value and the
/// <c>text</c> group captures the markup between the opening and closing tag.
/// Built once and reused, since the pattern never changes between calls.
/// </summary>
private static readonly Regex AnchorRegex = new Regex(@"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");

/// <summary>
/// Fetches the page at <paramref name="url"/> and returns the href of every
/// anchor tag found in it, with relative links resolved against
/// <paramref name="url"/>.
/// </summary>
/// <param name="url">Absolute address of the page to scan.</param>
/// <returns>
/// The collected links in document order (duplicates are kept). The list is
/// empty when the page could not be fetched, when <paramref name="url"/> is
/// not a valid absolute URI, or when an unexpected error occurs.
/// </returns>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();
    try
    {
        string strhtml = soso.getHtml(url, "", true);

        // The literal "error" is compared against here as a failure sentinel
        // (presumably what soso.getHtml returns on failure - confirm there).
        if (string.IsNullOrEmpty(strhtml) || strhtml == "error")
        {
            return allHref;
        }

        // Loop-invariant: parse the base address once, not once per link.
        Uri baseUri;
        if (!Uri.TryCreate(url, UriKind.Absolute, out baseUri))
        {
            return allHref;
        }

        foreach (Match m in AnchorRegex.Matches(strhtml))
        {
            string href = m.Groups["url"].Value;

            // Skip a single unparseable href instead of abandoning every
            // link that follows it.
            Uri resolved;
            if (!Uri.TryCreate(baseUri, href, out resolved))
            {
                continue;
            }

            // Absolute http(s) links are kept exactly as written in the page.
            // Requiring the "://" keeps relative names such as "httpd.html"
            // from being mistaken for absolute links.
            bool isAbsoluteHttp =
                href.StartsWith("http://", StringComparison.Ordinal) ||
                href.StartsWith("https://", StringComparison.Ordinal);

            allHref.Add(isAbsoluteHttp ? href : resolved.ToString());
        }
    }
    catch (Exception ex)
    {
        // Best-effort contract: callers always get a list back, but the
        // failure is recorded instead of vanishing silently.
        System.Diagnostics.Trace.TraceWarning("GetAllHref failed for '{0}': {1}", url, ex);
    }
    return allHref;
}
#endregion
复制代码
#region Data deduplication
/// <summary>
/// Returns a new list holding the distinct strings of <paramref name="list"/>,
/// keeping the first occurrence of each value in its original order.
/// Comparison is ordinal and case-sensitive; a null element is treated as an
/// ordinary value and kept once.
/// </summary>
/// <param name="list">The strings to deduplicate. Not modified.</param>
/// <returns>A new list without duplicate entries.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="list"/> is null.
/// </exception>
public static List<string> getUnqueList(List<string> list)
{
    if (list == null)
    {
        throw new ArgumentNullException("list");
    }

    List<string> unique = new List<string>(list.Count);
    HashSet<string> seen = new HashSet<string>();
    foreach (string s in list)
    {
        // Add returns false when the value is already present, so a single
        // lookup both tests and records membership. Unlike Hashtable keys,
        // a HashSet accepts null.
        if (seen.Add(s))
        {
            unique.Add(s);
        }
    }
    return unique;
}
#endregion
复制代码
|
|