|
|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Collect all links on a page

/// <summary>
/// Matches anchor tags. The named group "url" captures the href value and the
/// named group "text" captures the anchor's inner markup. Built once because the
/// pattern never changes between calls.
/// </summary>
private static readonly Regex AnchorRegex = new Regex(
    @"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");

/// <summary>
/// Fetches the page at <paramref name="url"/> through <c>soso.getHtml</c> and
/// returns the target of every anchor tag found in it as an absolute URL.
/// </summary>
/// <param name="url">Absolute address of the page to scan; also used as the base
/// for resolving relative links.</param>
/// <returns>
/// The link targets in document order (duplicates are kept). Targets that already
/// start with <c>http://</c> or <c>https://</c> are returned verbatim; everything
/// else is resolved against <paramref name="url"/>. The list is empty when the page
/// could not be fetched or <paramref name="url"/> is not an absolute URI.
/// </returns>
/// <remarks>
/// This method is best-effort and never throws: failures are written to
/// <see cref="System.Diagnostics.Trace"/> and whatever was collected so far is returned.
/// </remarks>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();
    try
    {
        string strhtml = soso.getHtml(url, "", true);

        // The literal "error" is treated as a failed fetch (presumably the failure
        // sentinel of soso.getHtml - confirm against that helper).
        if (string.IsNullOrEmpty(strhtml) || strhtml == "error")
        {
            return allHref;
        }

        // The base URI is the same for every link, so build it once up front.
        Uri baseUri;
        if (!Uri.TryCreate(url, UriKind.Absolute, out baseUri))
        {
            return allHref;
        }

        foreach (Match m in AnchorRegex.Matches(strhtml))
        {
            string href = m.Groups["url"].Value;

            // A single malformed href must not abort the whole scan: skip it.
            Uri resolved;
            if (!Uri.TryCreate(baseUri, href, out resolved))
            {
                continue;
            }

            // Check the full scheme prefix so that relative paths which merely
            // begin with "http" (e.g. "http-guide.html") are still resolved.
            bool isAbsoluteHttp =
                href.StartsWith("http://", StringComparison.Ordinal) ||
                href.StartsWith("https://", StringComparison.Ordinal);

            allHref.Add(isAbsoluteHttp ? href : resolved.ToString());
        }
    }
    catch (Exception ex)
    {
        // Keep the best-effort contract, but leave a trace instead of hiding the failure.
        System.Diagnostics.Trace.TraceWarning("GetAllHref failed for '" + url + "': " + ex.Message);
    }

    return allHref;
}

#endregion
复制代码
#region De-duplication

/// <summary>
/// Removes duplicate strings from a list while keeping the first occurrence of
/// each value in its original relative order.
/// </summary>
/// <param name="list">The strings to de-duplicate. The list itself is not modified.
/// It may contain <c>null</c> elements, which are treated as one distinct value.</param>
/// <returns>A new list containing each distinct value of <paramref name="list"/>
/// once, compared ordinally (case-sensitive).</returns>
/// <exception cref="ArgumentNullException"><paramref name="list"/> is <c>null</c>.</exception>
public static List<string> getUnqueList(List<string> list)
{
    if (list == null)
    {
        throw new ArgumentNullException("list");
    }

    List<string> unique = new List<string>(list.Count);
    HashSet<string> seen = new HashSet<string>(StringComparer.Ordinal);

    foreach (string s in list)
    {
        // HashSet.Add returns false for a value that is already present, so one
        // call covers both the membership test and the insert.
        if (seen.Add(s))
        {
            unique.Add(s);
        }
    }

    return unique;
}

#endregion
复制代码
|
|