|
|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Collect every link on a page

// Matches <a ... href=...>...</a>. The "url" group is the href value and the
// "text" group is the markup between the opening and closing tags.
// Built once: constructing a Regex on every call re-parses the pattern.
private static readonly Regex AnchorHrefRegex = new Regex(
    @"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");

/// <summary>
/// Fetches the HTML for <paramref name="url"/> through <c>soso.getHtml</c> and
/// returns the href of every anchor element found in it. Hrefs that already
/// start with <c>http://</c> or <c>https://</c> are returned verbatim; all
/// others are resolved against <paramref name="url"/>.
/// </summary>
/// <param name="url">Absolute address of the page to scan; also the base for relative links.</param>
/// <returns>
/// The links in document order, duplicates included. The list is empty when
/// <c>soso.getHtml</c> returns null or the string "error", when
/// <paramref name="url"/> is not a valid absolute URI, or when fetching fails
/// before any link was collected. This method never returns null.
/// </returns>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();
    try
    {
        string strhtml = soso.getHtml(url, "", true);
        if (strhtml == null || strhtml == "error")
        {
            return allHref;
        }

        // The base URI is the same for every link, so parse it once up front.
        Uri baseUri;
        if (!Uri.TryCreate(url, UriKind.Absolute, out baseUri))
        {
            return allHref;
        }

        foreach (Match m in AnchorHrefRegex.Matches(strhtml))
        {
            string href = m.Groups["url"].Value;

            // Require the scheme separator so that a relative link such as
            // "httpd.html" is resolved instead of being mistaken for absolute.
            bool isAbsoluteHttp =
                href.StartsWith("http://", StringComparison.Ordinal) ||
                href.StartsWith("https://", StringComparison.Ordinal);
            if (isAbsoluteHttp)
            {
                allHref.Add(href);
                continue;
            }

            // TryCreate skips a malformed href rather than throwing, so one bad
            // link no longer discards every link that follows it.
            Uri resolved;
            if (Uri.TryCreate(baseUri, href, out resolved))
            {
                allHref.Add(resolved.ToString());
            }
        }
    }
    catch (Exception ex)
    {
        // Best effort: keep whatever was collected, but leave a trace of the
        // failure instead of swallowing it silently.
        System.Diagnostics.Trace.TraceWarning("GetAllHref failed for '{0}': {1}", url, ex);
    }
    return allHref;
}
#endregion
复制代码
#region Remove duplicates
/// <summary>
/// Returns the distinct strings of <paramref name="list"/>, keeping the first
/// occurrence of each value in its original relative order. Strings are
/// compared with the default (ordinal, case-sensitive) string equality.
/// </summary>
/// <param name="list">Strings to de-duplicate. May be null and may contain null entries.</param>
/// <returns>
/// A new list holding each distinct value once; empty when
/// <paramref name="list"/> is null. The input list is not modified.
/// </returns>
public static List<string> getUnqueList(List<string> list)
{
    List<string> unique = new List<string>();
    if (list == null)
    {
        return unique;
    }

    // HashSet<string> accepts a null element, unlike Hashtable, whose
    // ContainsKey/Add throw ArgumentNullException for a null key.
    HashSet<string> seen = new HashSet<string>();
    foreach (string s in list)
    {
        // Add returns false when the value is already present, so a single
        // lookup both tests and records membership.
        if (seen.Add(s))
        {
            unique.Add(s);
        }
    }
    return unique;
}
#endregion
复制代码
) Z+ I- v* L& k& J |
|