|
|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Fetch all links on a page

// Matches <a ... href=...>text</a>. The named group "url" captures the href value
// (with or without surrounding quotes) and "text" captures the anchor's inner markup.
// The pattern is constant, so it is compiled once instead of on every call.
private static readonly Regex AnchorHrefRegex = new Regex(
    @"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");

/// <summary>
/// Downloads the page at <paramref name="url"/> via <c>soso.getHtml</c> and returns the
/// target of every <c>&lt;a href=...&gt;</c> element found in it, in document order.
/// </summary>
/// <param name="url">
/// Address of the page to scan. It must be an absolute URI, because it is also used
/// as the base for resolving relative hrefs.
/// </param>
/// <returns>
/// A list of link targets; never null and not de-duplicated. Hrefs that start with
/// "http" are returned exactly as written in the page; all others are resolved against
/// <paramref name="url"/>. The list is empty when the download yields null, an empty
/// string or the literal "error", or when <paramref name="url"/> is not an absolute URI.
/// </returns>
/// <remarks>
/// The method is best-effort: an href that cannot be parsed as a URI is skipped, and any
/// unexpected exception is traced and the links collected so far are returned.
/// </remarks>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();
    try
    {
        string strhtml = soso.getHtml(url, "", true);

        // "error" is the value this code has always treated as a failed download.
        if (string.IsNullOrEmpty(strhtml) || strhtml == "error")
        {
            return allHref;
        }

        // The base URI is the same for every link, so build it once, outside the loop.
        Uri baseUri;
        if (!Uri.TryCreate(url, UriKind.Absolute, out baseUri))
        {
            return allHref;
        }

        foreach (Match m in AnchorHrefRegex.Matches(strhtml))
        {
            string href = m.Groups["url"].Value;

            // A single malformed href must not abort the whole extraction:
            // skip it and keep processing the remaining links.
            Uri resolved;
            if (!Uri.TryCreate(baseUri, href, out resolved))
            {
                continue;
            }

            // Links that already start with "http" are kept verbatim (not normalized);
            // everything else is returned in its resolved, absolute form.
            string fullUrl = href.StartsWith("http", StringComparison.Ordinal)
                ? href
                : resolved.ToString();
            allHref.Add(fullUrl);
        }
    }
    catch (Exception ex)
    {
        // Deliberately best-effort: callers always get a list back. Leave a trace
        // instead of swallowing the failure without any record of it.
        System.Diagnostics.Trace.TraceWarning("GetAllHref failed for '" + url + "': " + ex.Message);
    }
    return allHref;
}
#endregion
复制代码
#region Deduplicate data
/// <summary>
/// Removes duplicate strings from a list, keeping the first occurrence of each value
/// and preserving the original order.
/// </summary>
/// <param name="list">
/// The strings to de-duplicate. The list itself is not modified. It may contain
/// null entries; at most one null is kept.
/// </param>
/// <returns>
/// A new list containing each distinct string from <paramref name="list"/> exactly once,
/// in order of first appearance. Strings are compared with the default (ordinal,
/// case-sensitive) string equality.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="list"/> is null.</exception>
public static List<string> getUnqueList(List<string> list)
{
    if (list == null)
    {
        throw new ArgumentNullException("list");
    }

    List<string> unique = new List<string>();
    HashSet<string> seen = new HashSet<string>();
    foreach (string s in list)
    {
        // HashSet<T>.Add returns false when the value is already present, so a single
        // call both tests membership and records the value.
        if (seen.Add(s))
        {
            unique.Add(s);
        }
    }
    return unique;
}
#endregion
复制代码
|
|