|
|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Collect all links on a page
/// <summary>
/// Pattern for an HTML anchor element. The named group "url" captures the
/// (optionally quoted) href value and "text" captures the anchor's inner markup.
/// Built once and reused so it is not re-parsed on every call.
/// </summary>
private static readonly Regex AnchorHrefRegex = new Regex(
    @"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");

/// <summary>
/// Downloads the page at <paramref name="url"/> and returns the href of every
/// anchor element found in it, with relative links resolved against
/// <paramref name="url"/>.
/// </summary>
/// <param name="url">Absolute address of the page to scan; also the base for relative links.</param>
/// <returns>
/// The links in document order (duplicates are kept). The list is empty when the
/// page could not be fetched or parsed; this method is best-effort and does not throw.
/// </returns>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();
    try
    {
        // NOTE(review): soso.getHtml is defined elsewhere; judging by the check
        // below it signals failure with the literal string "error" - confirm.
        string strhtml = soso.getHtml(url, "", true);
        if (strhtml == null || strhtml == "error")
        {
            return allHref;
        }

        // The base URI does not change per match, so build it once.
        Uri baseUri = new Uri(url);

        foreach (Match m in AnchorHrefRegex.Matches(strhtml))
        {
            string href = m.Groups["url"].Value;

            // Absolute http(s) links are kept exactly as written. Requiring the
            // "://" keeps relative files such as "httpd.html" from being
            // mistaken for absolute URLs.
            if (href.StartsWith("http://", StringComparison.Ordinal) ||
                href.StartsWith("https://", StringComparison.Ordinal))
            {
                allHref.Add(href);
                continue;
            }

            // A single malformed href must not abort the scan of the whole
            // page, so skip it instead of letting the Uri constructor throw.
            Uri resolved;
            if (Uri.TryCreate(baseUri, href, out resolved))
            {
                allHref.Add(resolved.ToString());
            }
        }
    }
    catch (Exception ex)
    {
        // Still best-effort (callers get whatever was collected so far), but
        // leave a trace instead of discarding the failure silently.
        System.Diagnostics.Trace.TraceWarning(
            "GetAllHref failed for '" + url + "': " + ex.Message);
    }
    return allHref;
}
#endregion
复制代码
#region Deduplication
/// <summary>
/// Removes duplicate strings from a list while keeping the first occurrence
/// of each value in its original relative order.
/// </summary>
/// <param name="list">The strings to deduplicate. It is not modified. May be null.</param>
/// <returns>
/// A new list holding each distinct string once (compared ordinally and
/// case-sensitively; a null element counts as one distinct value).
/// An empty list is returned when <paramref name="list"/> is null.
/// </returns>
public static List<string> getUnqueList(List<string> list)
{
    List<string> unique = new List<string>();
    if (list == null)
    {
        return unique;
    }

    // HashSet<string> uses the same ordinal equality the Hashtable did, but
    // also accepts null elements and does not store every string twice.
    HashSet<string> seen = new HashSet<string>();
    foreach (string s in list)
    {
        // Add returns false when the value is already present.
        if (seen.Add(s))
        {
            unique.Add(s);
        }
    }
    return unique;
}
#endregion
复制代码
|
|