|
|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Collect all links on a page
/// <summary>
/// Matches an anchor element and captures its href value in the "url" group
/// and its inner markup in the "text" group. Built once and reused, because
/// the pattern never changes between calls.
/// </summary>
private static readonly Regex AnchorHrefRegex = new Regex(
    @"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");

/// <summary>
/// Downloads the page at <paramref name="url"/> and returns the href of every
/// anchor element found in it. An href that starts with "http" is returned
/// verbatim; any other href is resolved against <paramref name="url"/>.
/// </summary>
/// <param name="url">Absolute address of the page to scan.</param>
/// <returns>
/// The links in document order (duplicates are kept). The list is empty when
/// <paramref name="url"/> is not an absolute URI, when the download fails, or
/// when the page contains no anchors. Never null.
/// </returns>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();

    // Relative hrefs can only be resolved against an absolute base address,
    // so validate it once up front instead of re-parsing it for every match.
    Uri baseUri;
    if (!Uri.TryCreate(url, UriKind.Absolute, out baseUri))
    {
        return allHref;
    }

    string strhtml;
    try
    {
        strhtml = soso.getHtml(url, "", true);
    }
    catch (Exception ex)
    {
        // Best effort: a failed download yields an empty result, but the
        // failure is traced instead of being silently discarded.
        System.Diagnostics.Trace.TraceWarning(
            "GetAllHref: failed to download '{0}': {1}", url, ex.Message);
        return allHref;
    }

    // NOTE(review): "error" is presumably the failure sentinel of soso.getHtml - confirm.
    if (string.IsNullOrEmpty(strhtml) || strhtml == "error")
    {
        return allHref;
    }

    foreach (Match m in AnchorHrefRegex.Matches(strhtml))
    {
        string href = m.Groups["url"].Value;

        if (href.StartsWith("http", StringComparison.Ordinal))
        {
            allHref.Add(href);
            continue;
        }

        // Skip an href that cannot be resolved rather than letting a single
        // malformed link abort the scan and lose every link after it.
        Uri resolved;
        if (Uri.TryCreate(baseUri, href, out resolved))
        {
            allHref.Add(resolved.ToString());
        }
    }

    return allHref;
}
#endregion
复制代码
#region De-duplicate data
/// <summary>
/// Returns a new list holding the distinct strings of <paramref name="list"/>,
/// kept in the order of their first occurrence. The input list is not modified.
/// </summary>
/// <param name="list">
/// The strings to de-duplicate. May be null (treated as empty) and may contain
/// null entries (kept once, like any other value).
/// </param>
/// <returns>A new list without duplicates; never null.</returns>
public static List<string> getUnqueList(List<string> list)
{
    List<string> unique = new List<string>();
    if (list == null)
    {
        return unique;
    }

    // HashSet<string> uses the same ordinal string equality as the previous
    // Hashtable, but accepts a null entry instead of throwing on it.
    HashSet<string> seen = new HashSet<string>();
    foreach (string s in list)
    {
        // Add returns false when the value is already present.
        if (seen.Add(s))
        {
            unique.Add(s);
        }
    }

    return unique;
}
#endregion
复制代码
|
|