|
|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Crawl all links on a page
/// <summary>
/// Downloads the page at <paramref name="url"/> and returns the target of every
/// <c>&lt;a href=...&gt;</c> anchor found in it.
/// </summary>
/// <param name="url">
/// Absolute URL of the page to scan. It is also the base against which relative
/// links are resolved.
/// </param>
/// <returns>
/// The links in document order, duplicates included. Links whose href starts with
/// "http" are returned exactly as written; all others are resolved against
/// <paramref name="url"/>. The list is empty (never null) when the page cannot be
/// fetched or <paramref name="url"/> is not a valid absolute URI.
/// </returns>
public static List<string> GetAllHref(string url)
{
    // Captures the href value in the "url" group and the anchor's inner markup in "text".
    const string anchorPattern = @"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>";

    List<string> allHref = new List<string>();
    try
    {
        // soso.getHtml is defined elsewhere; this code treats the literal string
        // "error" as its failure result. NOTE(review): the meaning of the "" and
        // true arguments is not visible here - confirm against the helper.
        string strhtml = soso.getHtml(url, "", true);
        if (string.IsNullOrEmpty(strhtml) || strhtml == "error")
        {
            return allHref;
        }

        // The base URI does not change per link, so build it once up front.
        Uri baseUri = new Uri(url);

        foreach (Match m in Regex.Matches(strhtml, anchorPattern))
        {
            string href = m.Groups["url"].Value;

            // A malformed href must only cost us that one link, not abort the
            // whole scan, so resolve with TryCreate instead of the throwing ctor.
            Uri resolved;
            if (!Uri.TryCreate(baseUri, href, out resolved))
            {
                continue;
            }

            // Ordinal comparison: this is a protocol prefix, not linguistic text.
            string fullUrl = href.StartsWith("http", StringComparison.Ordinal)
                ? href
                : resolved.ToString();
            allHref.Add(fullUrl);
        }
    }
    catch (Exception)
    {
        // Deliberate best-effort: a download or URI failure yields whatever
        // links were collected so far instead of propagating to the caller.
    }
    return allHref;
}
#endregion
复制代码
#region Deduplicate data
/// <summary>
/// Removes duplicate entries from a list of strings, keeping the first
/// occurrence of each value and preserving the original order.
/// </summary>
/// <param name="list">
/// The strings to deduplicate. May be null or empty, and may contain null
/// elements (a null element is kept once, like any other value).
/// </param>
/// <returns>
/// A new list holding each distinct value once, in order of first appearance.
/// Never null; empty when <paramref name="list"/> is null or empty. The input
/// list is not modified.
/// </returns>
public static List<string> getUnqueList(List<string> list)
{
    List<string> unique = new List<string>();
    if (list == null)
    {
        return unique;
    }

    // HashSet<string> accepts null and compares strings ordinally by default,
    // matching the equality the previous Hashtable-based lookup used.
    HashSet<string> seen = new HashSet<string>();
    foreach (string s in list)
    {
        // Add returns false when the value is already present, so a single
        // call does both the membership test and the insert.
        if (seen.Add(s))
        {
            unique.Add(s);
        }
    }
    return unique;
}
#endregion
复制代码
|
|