|
|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Crawl all links on a page

/// <summary>
/// Matches an HTML anchor element and captures its <c>href</c> target in the
/// <c>url</c> group and its inner markup in the <c>text</c> group.
/// Cached once because the pattern never changes between calls.
/// </summary>
private static readonly Regex AnchorHrefRegex = new Regex(
    @"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");

/// <summary>
/// Fetches the page at <paramref name="url"/> and returns the target of every
/// anchor element found in it, in document order (duplicates are kept).
/// </summary>
/// <param name="url">Absolute URL of the page to scan; also used as the base for relative links.</param>
/// <returns>
/// One entry per usable link. Targets that start with "http" are returned exactly
/// as written in the page; all others are resolved against <paramref name="url"/>.
/// The list is empty when <paramref name="url"/> is not a valid absolute URI,
/// when the download yields nothing or the "error" sentinel, or when the page has no links.
/// This method never throws: unexpected failures are traced and the links
/// collected so far are returned.
/// </returns>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();

    // The base URI is loop-invariant; without a valid one no link can be resolved.
    Uri baseUri;
    if (!Uri.TryCreate(url, UriKind.Absolute, out baseUri))
    {
        return allHref;
    }

    try
    {
        // NOTE(review): soso.getHtml is defined elsewhere; it presumably returns the
        // page markup. The only thing visible here is that "error" is treated as failure.
        string html = soso.getHtml(url, "", true);
        if (string.IsNullOrEmpty(html) || html == "error")
        {
            return allHref;
        }

        foreach (Match match in AnchorHrefRegex.Matches(html))
        {
            string href = match.Groups["url"].Value;

            // Skip a link that cannot be parsed instead of abandoning the rest of the page.
            Uri resolved;
            if (!Uri.TryCreate(baseUri, href, out resolved))
            {
                continue;
            }

            // Ordinal comparison: this is a protocol prefix check, not linguistic text.
            bool alreadyAbsolute = href.StartsWith("http", StringComparison.Ordinal);
            allHref.Add(alreadyAbsolute ? href : resolved.ToString());
        }
    }
    catch (Exception ex)
    {
        // Best-effort crawl: keep whatever was collected, but do not hide the failure.
        System.Diagnostics.Trace.TraceWarning("GetAllHref failed for '{0}': {1}", url, ex);
    }

    return allHref;
}

#endregion
复制代码
/ h* Q; J3 x9 q9 e
* q! D" l' n Z2 d
+ w3 H% k" u7 y5 K. I& c
! [1 P+ F2 d0 @: c2 |* f9 I0 A
#region Remove duplicate entries

/// <summary>
/// Returns a new list containing the distinct strings of <paramref name="list"/>,
/// keeping the first occurrence of each value and preserving the original order.
/// </summary>
/// <param name="list">Strings to de-duplicate; may be null and may contain null entries.</param>
/// <returns>
/// A new list without duplicates (the input is not modified). Strings are compared
/// with the default, case-sensitive string equality. A null
/// <paramref name="list"/> yields an empty list.
/// </returns>
public static List<string> getUnqueList(List<string> list)
{
    List<string> unique = new List<string>();
    if (list == null)
    {
        return unique;
    }

    // HashSet accepts a null element, unlike Hashtable whose keys must be non-null.
    HashSet<string> seen = new HashSet<string>();
    foreach (string item in list)
    {
        // Add returns false when the value is already present, so one lookup is enough.
        if (seen.Add(item))
        {
            unique.Add(item);
        }
    }

    return unique;
}

#endregion
复制代码
# l* V4 Z7 t7 r3 t: b* P
3 {7 ? u) n" G |
|