|
|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Collect all links on a page
// Matches <a ... href=...>text</a>: the "url" group captures the href value and the
// "text" group the anchor's inner markup. Built once and reused, because constructing
// a Regex instance re-parses the pattern on every call.
private static readonly Regex AnchorHrefRegex = new Regex(@"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");

/// <summary>
/// Downloads the page at <paramref name="url"/> and returns the target of every
/// anchor (&lt;a href=...&gt;) found in its HTML.
/// </summary>
/// <param name="url">Absolute address of the page to scan; also used as the base for resolving relative links.</param>
/// <returns>
/// The links in document order. Hrefs that start with "http" are returned verbatim;
/// all others are resolved against <paramref name="url"/>. Duplicates are not removed.
/// The list is empty when the page cannot be fetched, when <paramref name="url"/> is
/// not an absolute URI, or when no anchors are found.
/// </returns>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();
    try
    {
        string strhtml = soso.getHtml(url, "", true);

        // The literal "error" is treated as a failure marker from getHtml
        // (presumably its error sentinel - confirm against soso.getHtml).
        if (string.IsNullOrEmpty(strhtml) || strhtml == "error")
        {
            return allHref;
        }

        // Parse the base address once instead of once per match.
        Uri baseUri;
        if (!Uri.TryCreate(url, UriKind.Absolute, out baseUri))
        {
            return allHref;
        }

        foreach (Match m in AnchorHrefRegex.Matches(strhtml))
        {
            string href = m.Groups["url"].Value;

            if (href.StartsWith("http", StringComparison.Ordinal))
            {
                allHref.Add(href);
                continue;
            }

            // TryCreate instead of the throwing constructor: one malformed href
            // must not abort the scan and lose every link that follows it.
            Uri resolved;
            if (Uri.TryCreate(baseUri, href, out resolved))
            {
                allHref.Add(resolved.ToString());
            }
        }
    }
    catch (Exception ex)
    {
        // Deliberately best-effort: callers get whatever was collected so far.
        // The failure is still surfaced to an attached debugger/listener.
        System.Diagnostics.Debug.WriteLine(ex);
    }
    return allHref;
}
#endregion
复制代码
#region De-duplicate data
/// <summary>
/// Removes duplicate strings from a list while keeping the first occurrence of
/// each value in its original position.
/// </summary>
/// <param name="list">The strings to de-duplicate. May be null and may contain null entries.</param>
/// <returns>
/// A new list holding each distinct value once, in order of first appearance.
/// Comparison is ordinal and case-sensitive. Returns an empty list when
/// <paramref name="list"/> is null. The input list is not modified.
/// </returns>
public static List<string> getUnqueList(List<string> list)
{
    List<string> unique = new List<string>();
    if (list == null)
    {
        return unique;
    }

    // HashSet accepts a null element (Hashtable throws on a null key), and
    // Add() reports whether the value was new, so one lookup per item suffices.
    HashSet<string> seen = new HashSet<string>();
    foreach (string s in list)
    {
        if (seen.Add(s))
        {
            unique.Add(s);
        }
    }
    return unique;
}
#endregion
复制代码
|
|