|
|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Collect all links on a page
/// <summary>
/// Matches an anchor element and captures its <c>href</c> target in the
/// <c>url</c> group and its inner markup in the <c>text</c> group.
/// Built once and reused instead of being re-parsed on every call.
/// </summary>
private static readonly Regex AnchorHrefRegex = new Regex(
    @"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>",
    RegexOptions.Compiled);

/// <summary>
/// Fetches the HTML for <paramref name="url"/> through <c>soso.getHtml</c> and
/// returns the target of every anchor tag found in it.
/// </summary>
/// <param name="url">
/// Absolute address of the page; it is also the base against which relative
/// links are resolved.
/// </param>
/// <returns>
/// One entry per usable link, in document order and without de-duplication.
/// Links whose href starts with "http" are returned exactly as written; all
/// others are resolved against <paramref name="url"/>. The list is empty when
/// the page could not be fetched or contains no links. Never null.
/// </returns>
/// <remarks>
/// This method is best-effort and does not throw: any failure ends the scan
/// and whatever was collected so far is returned.
/// </remarks>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();
    try
    {
        string html = soso.getHtml(url, "", true);

        // NOTE(review): "error" appears to be the failure sentinel returned by
        // soso.getHtml; confirm against that helper.
        if (string.IsNullOrEmpty(html) || html == "error")
        {
            return allHref;
        }

        // The base URI does not change per link, so build it once.
        Uri baseUri = new Uri(url);

        foreach (Match m in AnchorHrefRegex.Matches(html))
        {
            string href = m.Groups["url"].Value;

            // A malformed href must only cost us that one link, not every
            // link that follows it on the page.
            Uri resolved;
            if (!Uri.TryCreate(baseUri, href, out resolved))
            {
                continue;
            }

            // Already-absolute links are kept verbatim so the caller sees the
            // exact text from the page rather than a normalized form.
            allHref.Add(href.StartsWith("http", StringComparison.Ordinal)
                ? href
                : resolved.ToString());
        }
    }
    catch (Exception ex)
    {
        // Deliberately swallowed to keep the best-effort contract; leave a
        // trace so failures are at least visible while debugging.
        System.Diagnostics.Debug.WriteLine("GetAllHref failed for '" + url + "': " + ex.Message);
    }
    return allHref;
}
#endregion
复制代码
& ?/ b0 m- z, C8 ~) a# o2 J7 B7 v% H/ w. ~( J/ @6 P. d
/ |" A; ~9 q4 W8 p# Z- - _* L; v) r, F
- ; |, i/ I* \) |8 O) j" S7 Y, p
#region De-duplicate data
/// <summary>
/// Removes duplicate strings from a list while keeping the first occurrence
/// of each value in its original position.
/// </summary>
/// <param name="list">
/// The strings to filter. May be null or empty, and may contain null
/// elements (null is treated as one distinct value).
/// </param>
/// <returns>
/// A new list holding each distinct value once, in first-seen order. The
/// input list is not modified. Returns an empty list when
/// <paramref name="list"/> is null.
/// </returns>
public static List<string> getUnqueList(List<string> list)
{
    List<string> unique = new List<string>();
    if (list == null)
    {
        return unique;
    }

    // HashSet<string>.Add reports whether the value was new, so a single
    // lookup both tests and records membership; unlike Hashtable it also
    // accepts null elements.
    HashSet<string> seen = new HashSet<string>();
    foreach (string s in list)
    {
        if (seen.Add(s))
        {
            unique.Add(s);
        }
    }
    return unique;
}
#endregion
复制代码
6 K. y" n2 V6 W1 w6 g: ~8 _. c
, `' X1 {3 b& I6 v |
|