|
|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Fetch all links on a page
/// <summary>
/// Matches an anchor element and captures its href value (group "url")
/// and its inner markup (group "text"). Case-insensitive, single-line mode.
/// </summary>
private static readonly Regex AnchorHrefRegex = new Regex(@"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");

/// <summary>
/// Downloads the page at <paramref name="url"/> and returns the target of every
/// anchor tag found in it. Links that do not start with "http" are resolved
/// against <paramref name="url"/>; links that do are returned exactly as written.
/// </summary>
/// <param name="url">Absolute address of the page to scan.</param>
/// <returns>
/// The links in document order, duplicates included. The list is empty when the
/// download fails, when <paramref name="url"/> is not an absolute URI, or when the
/// page contains no anchors. This method does not throw.
/// </returns>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();

    string html;
    try
    {
        // NOTE(review): soso.getHtml is defined elsewhere; the check below assumes it
        // signals failure by returning the literal "error" - confirm against its source.
        html = soso.getHtml(url, "", true);
    }
    catch (Exception ex)
    {
        // Best-effort contract: callers get an empty list, but the failure is traced
        // instead of being silently discarded.
        System.Diagnostics.Trace.TraceWarning("GetAllHref: failed to download '{0}': {1}", url, ex);
        return allHref;
    }

    if (string.IsNullOrEmpty(html) || html == "error")
    {
        return allHref;
    }

    // The base address is loop-invariant, so parse it once up front.
    Uri baseUri;
    if (!Uri.TryCreate(url, UriKind.Absolute, out baseUri))
    {
        return allHref;
    }

    foreach (Match m in AnchorHrefRegex.Matches(html))
    {
        string href = m.Groups["url"].Value;

        // Skip a malformed href rather than letting it abort the whole scan
        // and lose every link that follows it on the page.
        Uri resolved;
        if (!Uri.TryCreate(baseUri, href, out resolved))
        {
            continue;
        }

        // Already-absolute links are kept verbatim; everything else is expanded.
        allHref.Add(href.StartsWith("http", StringComparison.Ordinal) ? href : resolved.ToString());
    }

    return allHref;
}
#endregion
复制代码
#region De-duplicate data
/// <summary>
/// Returns the distinct strings of <paramref name="list"/>, keeping the first
/// occurrence of each value and preserving the original order.
/// </summary>
/// <param name="list">Strings to de-duplicate. The list itself is not modified.</param>
/// <returns>A new list containing each distinct value (including null) once.</returns>
/// <exception cref="ArgumentNullException"><paramref name="list"/> is null.</exception>
public static List<string> getUnqueList(List<string> list)
{
    if (list == null)
    {
        throw new ArgumentNullException("list");
    }

    List<string> unique = new List<string>();
    HashSet<string> seen = new HashSet<string>();

    foreach (string s in list)
    {
        // HashSet.Add returns false for a value that is already present, so a single
        // call both tests and records membership. Unlike Hashtable it accepts null.
        if (seen.Add(s))
        {
            unique.Add(s);
        }
    }

    return unique;
}
#endregion
复制代码
|
|