|
|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Collect all links on a page
/// <summary>
/// Fetches the page at <paramref name="url"/> through <c>soso.getHtml</c> and returns
/// the <c>href</c> target of every anchor element matched in the returned HTML.
/// </summary>
/// <param name="url">
/// Absolute address of the page to scan. It is also the base against which
/// relative links are resolved.
/// </param>
/// <returns>
/// One entry per matched anchor, in document order; duplicates are kept.
/// Links whose text starts with "http" are returned exactly as written, all
/// other links are resolved against <paramref name="url"/>. Links that cannot
/// be parsed as a URI are skipped. The list is empty when the page could not be
/// fetched or <paramref name="url"/> is not a valid absolute URI.
/// </returns>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();
    try
    {
        string strhtml = soso.getHtml(url, "", true);

        // "error" is skipped as a sentinel value — presumably what soso.getHtml
        // returns when the download fails (confirm against that helper).
        if (string.IsNullOrEmpty(strhtml) || strhtml == "error")
        {
            return allHref;
        }

        // The base URI does not change between links, so build it once.
        Uri baseUri;
        if (!Uri.TryCreate(url, UriKind.Absolute, out baseUri))
        {
            return allHref;
        }

        Regex reg = new Regex(@"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");
        foreach (Match m in reg.Matches(strhtml))
        {
            string href = m.Groups["url"].Value;

            // TryCreate instead of the throwing constructor: a single malformed
            // href must not discard every link that follows it on the page.
            Uri resolved;
            if (!Uri.TryCreate(baseUri, href, out resolved))
            {
                continue;
            }

            // Ordinal comparison: this is a protocol prefix, not linguistic text.
            string fullUrl = href.StartsWith("http", StringComparison.Ordinal)
                ? href
                : resolved.ToString();
            allHref.Add(fullUrl);
        }
    }
    catch (Exception ex)
    {
        // Deliberately best-effort: callers still receive whatever was collected,
        // but the failure is no longer invisible.
        System.Diagnostics.Trace.TraceWarning("GetAllHref failed for '{0}': {1}", url, ex.Message);
    }
    return allHref;
}
#endregion
复制代码
. z, l8 |' B$ x* W
' ]& @; u8 l# C, a; f# O
4 R" l* K4 Y& M0 ~& G9 F2 L
- G$ c$ |, T0 p4 C5 r' r. F' w. T7 t
#region De-duplicate data
/// <summary>
/// Removes duplicate strings from a list, keeping the first occurrence of each
/// value and preserving the original order.
/// </summary>
/// <param name="list">
/// Source list. It is not modified. May be <c>null</c> and may contain
/// <c>null</c> entries (a <c>null</c> entry is kept once, like any other value).
/// </param>
/// <returns>
/// A new list holding each distinct value once, compared case-sensitively.
/// Empty when <paramref name="list"/> is <c>null</c> or empty.
/// </returns>
public static List<string> getUnqueList(List<string> list)
{
    List<string> unique = new List<string>();
    if (list == null)
    {
        return unique;
    }

    // HashSet<string> accepts null entries, unlike Hashtable keys, and its
    // Add() reports whether the value was new — one lookup per element.
    HashSet<string> seen = new HashSet<string>();
    foreach (string s in list)
    {
        if (seen.Add(s))
        {
            unique.Add(s);
        }
    }
    return unique;
}
#endregion
复制代码
2 u3 D1 y8 l; T3 _+ C+ O
( q2 V$ f2 R& F. Y$ i2 ?. f3 x, w+ M/ _ |
|