|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Crawl all links on a page
// Matches <a ... href=...>...</a> elements. The "url" group captures the href value
// (optionally quoted) and the "text" group captures the anchor's inner markup.
// Built once and reused, instead of being re-parsed on every call.
private static readonly Regex AnchorHrefRegex = new Regex(@"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");

/// <summary>
/// Fetches the page at <paramref name="url"/> via <c>soso.getHtml</c> and collects the
/// href of every anchor element found in the returned markup.
/// </summary>
/// <param name="url">
/// Address of the page to scan. It is also the base against which relative hrefs are resolved.
/// </param>
/// <returns>
/// The hrefs in document order, duplicates included. An href that starts with "http" is
/// returned verbatim; any other href is returned resolved against <paramref name="url"/>.
/// Hrefs that cannot be combined with the base address are skipped. The list is empty when
/// <c>soso.getHtml</c> returns the string "error", and holds whatever was collected so far
/// when an exception interrupts the scan.
/// </returns>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();
    try
    {
        string strhtml = soso.getHtml(url, "", true);
        if (strhtml != "error")
        {
            // Loop-invariant: parse the base address once rather than once per anchor.
            Uri baseUri = new Uri(url);
            foreach (Match m in AnchorHrefRegex.Matches(strhtml))
            {
                string href = m.Groups["url"].Value;

                // A single malformed href must not abort the whole scan, so skip it
                // instead of letting the Uri constructor throw.
                Uri resolved;
                if (!Uri.TryCreate(baseUri, href, out resolved))
                {
                    continue;
                }

                // Ordinal comparison: this is a protocol prefix check, not linguistic text.
                allHref.Add(href.StartsWith("http", StringComparison.Ordinal) ? href : resolved.ToString());
            }
        }
    }
    catch (Exception ex)
    {
        // Still best-effort (callers get the links gathered so far), but the failure
        // is recorded instead of being silently discarded.
        System.Diagnostics.Trace.TraceWarning("GetAllHref failed for {0}: {1}", url, ex);
    }
    return allHref;
}
#endregion
复制代码
#region De-duplicate data
/// <summary>
/// Removes duplicate strings from a list, keeping the first occurrence of each value
/// in its original position.
/// </summary>
/// <param name="list">
/// The strings to de-duplicate. May be null and may contain null entries; the list
/// itself is not modified.
/// </param>
/// <returns>
/// A new list holding each distinct value once (default string equality, i.e. ordinal
/// and case-sensitive), in first-seen order. Empty when <paramref name="list"/> is null.
/// </returns>
public static List<string> getUnqueList(List<string> list)
{
    List<string> unique = new List<string>();
    if (list == null)
    {
        return unique;
    }

    // HashSet<string> accepts a null element (Hashtable keys do not) and Add() reports
    // whether the value was new, so a single lookup per item is enough.
    HashSet<string> seen = new HashSet<string>();
    foreach (string s in list)
    {
        if (seen.Add(s))
        {
            unique.Add(s);
        }
    }
    return unique;
}
#endregion
复制代码
|
|