|
|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Collect all links on a page
// Matches <a ... href=...>...</a>. The "url" group captures the raw href value and the
// "text" group captures the anchor's inner markup. Built once and reused by every call.
private static readonly Regex AnchorRegex = new Regex(@"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");

/// <summary>
/// Fetches the page at <paramref name="url"/> and returns the target of every anchor tag
/// found in it, with relative links resolved against <paramref name="url"/>.
/// </summary>
/// <param name="url">Absolute address of the page to scan.</param>
/// <returns>
/// The links in document order (duplicates are kept). The list is empty when the page
/// could not be fetched, when <paramref name="url"/> is not an absolute URI, or when no
/// anchors were found. Never null.
/// </returns>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();
    try
    {
        string strhtml = soso.getHtml(url, "", true);

        // "error" is compared against here as a failure marker; presumably it is what
        // soso.getHtml returns when the download fails (confirm against that helper).
        if (string.IsNullOrEmpty(strhtml) || strhtml == "error")
        {
            return allHref;
        }

        // The base URI does not change per link, so parse it once.
        Uri baseUri;
        if (!Uri.TryCreate(url, UriKind.Absolute, out baseUri))
        {
            return allHref;
        }

        foreach (Match m in AnchorRegex.Matches(strhtml))
        {
            string href = m.Groups["url"].Value;

            // A single malformed href must not discard the remaining links: skip it.
            Uri resolved;
            if (!Uri.TryCreate(baseUri, href, out resolved))
            {
                continue;
            }

            // Links that are already absolute http(s) URLs are kept exactly as written;
            // everything else is reported in its resolved form. Checking for the full
            // "http://" / "https://" prefix avoids mistaking a relative link such as
            // "http-guide.html" for an absolute one.
            bool isAbsoluteHttp =
                href.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
                href.StartsWith("https://", StringComparison.OrdinalIgnoreCase);

            allHref.Add(isAbsoluteHttp ? href : resolved.ToString());
        }
    }
    catch (Exception ex)
    {
        // Best-effort contract: callers always get a list back (possibly partial),
        // but the failure is traced instead of disappearing silently.
        System.Diagnostics.Trace.TraceWarning("GetAllHref failed for '{0}': {1}", url, ex.Message);
    }

    return allHref;
}
#endregion
复制代码

#region De-duplicate data
/// <summary>
/// Removes duplicate entries from a list of strings while keeping the first occurrence
/// of each value in its original position.
/// </summary>
/// <param name="list">The strings to de-duplicate; may contain null elements.</param>
/// <returns>
/// A new list holding each distinct value once, in first-seen order. The input list is
/// not modified. An empty list is returned when <paramref name="list"/> is null.
/// </returns>
public static List<string> getUnqueList(List<string> list)
{
    List<string> unique = new List<string>();
    if (list == null)
    {
        return unique;
    }

    // HashSet<string>.Add returns false for a value that is already present, so one call
    // both tests and records membership. Unlike Hashtable keys, a null element is allowed.
    HashSet<string> seen = new HashSet<string>();
    foreach (string s in list)
    {
        if (seen.Add(s))
        {
            unique.Add(s);
        }
    }

    return unique;
}
#endregion
复制代码
|
|