|
|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Extract all links from a page

/// <summary>
/// Matches anchor tags, capturing the href value in group "url" and the
/// inner text in group "text". Built once and reused across calls instead
/// of being re-parsed on every invocation.
/// </summary>
private static readonly Regex AllHrefAnchorRegex = new Regex(
    @"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");

/// <summary>
/// Downloads the page at <paramref name="url"/> via <c>soso.getHtml</c> and
/// returns the target of every anchor tag found in it.
/// </summary>
/// <param name="url">
/// Absolute address of the page to scan; it is also the base against which
/// relative links are resolved.
/// </param>
/// <returns>
/// The links in document order, duplicates included. An href that starts
/// with "http" is returned verbatim; any other href is resolved against
/// <paramref name="url"/>. The list is empty when the download yields
/// nothing or the "error" sentinel, when <paramref name="url"/> is not an
/// absolute URI, or when an unexpected exception occurs (best effort: the
/// method never throws to the caller).
/// </returns>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();
    try
    {
        string strhtml = soso.getHtml(url, "", true);

        // The literal "error" is treated as the download-failure sentinel.
        if (string.IsNullOrEmpty(strhtml) || strhtml == "error")
        {
            return allHref;
        }

        // The base URI does not change per match, so parse it once. Without
        // a valid absolute base no link can be resolved.
        Uri baseUri;
        if (!Uri.TryCreate(url, UriKind.Absolute, out baseUri))
        {
            return allHref;
        }

        foreach (Match m in AllHrefAnchorRegex.Matches(strhtml))
        {
            string href = m.Groups["url"].Value;

            if (href.StartsWith("http", StringComparison.Ordinal))
            {
                // Already absolute: keep the value exactly as written in the page.
                allHref.Add(href);
                continue;
            }

            // Skip a malformed href instead of letting one bad link abort
            // the whole scan and lose every link after it.
            Uri resolved;
            if (Uri.TryCreate(baseUri, href, out resolved))
            {
                allHref.Add(resolved.ToString());
            }
        }
    }
    catch (Exception ex)
    {
        // Best effort: return whatever was collected so far, but leave a
        // trace so failures are not completely invisible.
        System.Diagnostics.Trace.TraceWarning("GetAllHref failed for '" + url + "': " + ex.Message);
    }
    return allHref;
}

#endregion
复制代码
#region De-duplicate data

/// <summary>
/// Removes duplicate strings from a list, keeping the first occurrence of
/// each value and preserving the original order.
/// </summary>
/// <param name="list">
/// The strings to de-duplicate. May be null and may contain null entries;
/// the list itself is not modified.
/// </param>
/// <returns>
/// A new list holding each distinct value once (compared with the default,
/// case-sensitive string equality). Empty when <paramref name="list"/> is
/// null or empty.
/// </returns>
public static List<string> getUnqueList(List<string> list)
{
    List<string> unique = new List<string>();
    if (list == null)
    {
        return unique;
    }

    // HashSet.Add returns false for a value already present, so a single
    // call both tests and records membership; unlike Hashtable keys, a
    // HashSet also accepts null entries.
    HashSet<string> seen = new HashSet<string>();
    foreach (string s in list)
    {
        if (seen.Add(s))
        {
            unique.Add(s);
        }
    }
    return unique;
}

#endregion
复制代码
|
|