以下是代码片段:
using System; using System.Collections.Generic; using System.Text; using System.Net; using System.Threading; using CJData; using System.Text.RegularExpressions; using NLog;
namespace CJ
{
    /// <summary>
    /// Callback used to deliver a log message to a listener.
    /// </summary>
    /// <param name="log">The log text.</param>
    public delegate void WriteLogCallBack(String log);

    /// <summary>
    /// Crawler worker. On a dedicated thread it repeatedly takes a URL from
    /// <c>Url.SelectOne()</c>, downloads it, and queues the downloaded page on the
    /// thread pool, where it is either stored as a <c>CjPage</c> or scanned for
    /// further URLs according to the matching <c>Rule</c> records.
    /// </summary>
    public class CaiJi
    {
        /// <summary>Consecutive idle/failed rounds after which the worker loop ends.</summary>
        private const int MaxIdleRounds = 100;

        /// <summary>Pause, in milliseconds, after an idle or failed round.</summary>
        private const int PauseMilliseconds = 500;

        private WebClient _wc;

        /// <summary>
        /// The <see cref="WebClient"/> used to download pages; created on first access.
        /// </summary>
        public WebClient Wc
        {
            get
            {
                if (_wc == null)
                {
                    _wc = new WebClient();
                }
                return _wc;
            }
        }

        private Thread thread;

        /// <summary>Display name of this worker; prefixed to every log message.</summary>
        public String Name = "";

        /// <summary>Raised for every log message the worker produces.</summary>
        public event WriteLogCallBack OnWriteLog;

        /// <summary>
        /// Starts the worker thread. Does nothing while a worker thread is still running.
        /// </summary>
        public void Start()
        {
            // The loop in Work() can end by itself (idle limit reached), leaving a dead
            // thread object behind; only a thread that is still alive blocks a restart.
            if (thread != null && thread.IsAlive)
            {
                return;
            }

            thread = new Thread(new ThreadStart(Work));
            thread.Start();
        }

        /// <summary>
        /// Stops the worker by aborting its thread.
        /// </summary>
        /// <remarks>
        /// NOTE(review): <c>Thread.Abort</c> is only supported on .NET Framework; if this
        /// code is ever moved to .NET Core / .NET 5+, replace it with a cooperative stop flag.
        /// </remarks>
        public void Stop()
        {
            if (thread != null)
            {
                thread.Abort();
            }
            thread = null;
        }

        /// <summary>
        /// Raises <see cref="OnWriteLog"/> if anyone is subscribed.
        /// </summary>
        /// <param name="message">The log text to deliver.</param>
        private void WriteLog(String message)
        {
            // Copy to a local so a concurrent unsubscribe cannot null the delegate
            // between the check and the call.
            WriteLogCallBack handler = OnWriteLog;
            if (handler != null)
            {
                handler(message);
            }
        }

        /// <summary>
        /// Worker loop: fetches URLs until <see cref="MaxIdleRounds"/> consecutive rounds
        /// were idle or failed, or until the thread is aborted via <see cref="Stop"/>.
        /// </summary>
        private void Work()
        {
            int times = 0;
            while (times < MaxIdleRounds)
            {
                // presumably returns the next pending URL, or null when none is queued — verify in CJData
                Url url = Url.SelectOne();
                try
                {
                    if (url != null)
                    {
                        String page = Wc.DownloadString(url.UrlAddress);
                        if (!String.IsNullOrEmpty(page))
                        {
                            WriteLog(Name + " 成功抓取:" + url.UrlAddress);
                            times = 0;
                            // Parsing happens on the thread pool so the download loop is not held up.
                            ThreadPool.QueueUserWorkItem(new WaitCallback(ParsePage), new Object[] { url, page });
                        }
                    }
                    else
                    {
                        // Nothing to fetch: count an idle round and pause for half a second.
                        times++;
                        Thread.Sleep(PauseMilliseconds);
                    }
                }
                catch (ThreadAbortException)
                {
                    WriteLog(Name + " 外部终止");
                    // The runtime re-raises ThreadAbortException at the end of this catch
                    // block, so the thread ends here either way.
                    break;
                }
                catch (Exception e)
                {
                    times++;
                    // url can still be null here if the idle-branch sleep itself failed.
                    String address = (url == null) ? "" : url.UrlAddress;
                    WriteLog(Name + " 赚取" + address + "出错,休息半秒。" + e.Message);
                    // NOTE(review): Trace is not declared in the visible part of this file — confirm which type it resolves to.
                    Trace.WriteLine(address);
                    // Failed round: pause for half a second before retrying.
                    Thread.Sleep(PauseMilliseconds);
                }
            }
            WriteLog(Name + " 完成!");
        }

        /// <summary>
        /// Thread-pool callback that processes one downloaded page.
        /// </summary>
        /// <param name="state">
        /// An <c>Object[]</c> of two items: the source <c>Url</c> and the page text.
        /// </param>
        private void ParsePage(Object state)
        {
            Object[] objs = (Object[])state;
            Url url = (Url)objs[0];
            String page = (String)objs[1];

            try
            {
                IList<Rule> rs = Rule.SelectAll(Rule._.FromTypeID, url.TypeID);
                if (rs == null || rs.Count < 1)
                {
                    // No rule starts from this URL type: store the page itself.
                    CjPage cp = new CjPage();
                    cp.CjTime = DateTime.Now;
                    cp.Content = page;
                    cp.Url = url.UrlAddress;
                    cp.TypeID = url.TypeID;
                    cp.Insert();
                }
                else
                {
                    foreach (Rule r in rs)
                    {
                        ParseUrl(url, r, page);
                    }
                }
            }
            catch (Exception e)
            {
                // An exception escaping a thread-pool work item would take the whole
                // process down, so report it through the log instead.
                WriteLog(Name + " 解析" + url.UrlAddress + "出错。" + e.Message);
            }
        }

        /// <summary>
        /// Applies one rule's pattern to the page and inserts a new <c>Url</c> record for
        /// every match, using capture group 1 as the link.
        /// </summary>
        /// <param name="u">The URL the page was downloaded from; base for relative links.</param>
        /// <param name="r">The rule supplying the pattern and the type of the new URLs.</param>
        /// <param name="page">The downloaded page text.</param>
        private void ParseUrl(Url u, Rule r, String page)
        {
            // The static Regex.Matches goes through the framework's regex cache, so the
            // pattern is not rebuilt (or recompiled to IL) for every page.
            MatchCollection ms = Regex.Matches(page, r.Pattern, RegexOptions.IgnoreCase);
            foreach (Match m in ms)
            {
                String link = m.Groups[1].Value;
                if (String.IsNullOrEmpty(link))
                {
                    // Group 1 missing or empty: there is no link to record.
                    continue;
                }

                Url url = new Url();
                url.TypeID = r.ToTypeID;
                url.UrlAddress = ResolveUrl(u.UrlAddress, link);
                url.Insert();
            }
        }

        /// <summary>
        /// Turns a link found on a page into an absolute address.
        /// </summary>
        /// <param name="baseAddress">Address of the page the link was found on.</param>
        /// <param name="link">The link text; absolute, root-relative ("/x") or relative.</param>
        /// <returns>
        /// <paramref name="link"/> unchanged when it starts with http:// or https://;
        /// otherwise the link combined with <paramref name="baseAddress"/>. Other
        /// schemes and "//host" links are not recognised and are treated as relative.
        /// </returns>
        private static String ResolveUrl(String baseAddress, String link)
        {
            if (String.IsNullOrEmpty(link))
            {
                return baseAddress;
            }

            if (link.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                || link.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
            {
                return link;
            }

            // A query string or fragment on the base must not take part in path handling.
            int cut = baseAddress.IndexOfAny(new char[] { '?', '#' });
            if (cut >= 0)
            {
                baseAddress = baseAddress.Substring(0, cut);
            }

            // Locate the first '/' after the host; -1 means the base is host-only.
            int schemeEnd = baseAddress.IndexOf("://", StringComparison.Ordinal);
            int hostStart = (schemeEnd < 0) ? 0 : schemeEnd + 3;
            int pathStart = (hostStart <= baseAddress.Length) ? baseAddress.IndexOf('/', hostStart) : -1;

            if (link[0] == '/')
            {
                // Root-relative: keep scheme and host only.
                String root = (pathStart < 0) ? baseAddress : baseAddress.Substring(0, pathStart);
                return root + link;
            }

            if (pathStart < 0)
            {
                // Host-only base such as "http://example.com": the dot belongs to the
                // host name, not to a file name, so simply append below the root.
                return baseAddress + "/" + link;
            }

            if (baseAddress.EndsWith("/", StringComparison.Ordinal))
            {
                return baseAddress + link;
            }

            int lastSlash = baseAddress.LastIndexOf('/');
            if (lastSlash < baseAddress.LastIndexOf('.'))
            {
                // Last segment looks like a file ("page.html"): resolve against its folder.
                return baseAddress.Substring(0, lastSlash + 1) + link;
            }

            // Last segment has no extension: treat it as a folder.
            return baseAddress + "/" + link;
        }
    }
}