首页 新生命讲座 ASP学习 Net编程 我的垃圾工具 我的宝贝 系统编程
编译原理 其它文章
 -> 首页 -> Net编程
网页数据采集器
作者:   来源:   发布时间:2006-11-10 11:24:34   共有1449位读者阅读过此文


Form1.cs:

以下是代码片段:

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;

using System.Threading;
using System.Data.OleDb;
using System.Data.SqlClient;

namespace ZLWeb
{
    public partial class FrmMain : Form
    {

        // Worker threads; the array is allocated in button1_Click with the thread count typed by the user,
        // and thread[i] executes d[i].Run (see Run below).
        Thread[] thread; //= new Thread[10]; // 10 threads working together
        // Scraper workers, index-aligned with the thread array.
        DD[] d;// = new DD[10];

        // Single Jet/Access connection to ZL.mdb in the application start-up folder.
        // It is opened in the constructor and handed to every worker through DD.f.Conn.
        public OleDbConnection Conn = new OleDbConnection("Provider=Microsoft.Jet.OLEDB.4.0; Data Source=" + Application.StartupPath + "\\ZL.mdb");
        // Alternative SQL Server connection, kept disabled.
        //public SqlConnection Conn = new SqlConnection("Data Source=.;Initial Catalog=nnhy;Persist Security Info=True;User ID=nnhy;Password=nnhy");

        public FrmMain()
        {
            InitializeComponent();
            Conn.Open();
        }
        /*
        ~FrmMain()
        {
            Conn.Close();
        }
        */
        private void button1_Click(object sender, EventArgs e)
        {
            int i = 0;
            int s = int.Parse(textBox1.Text);
            int ee = int.Parse(textBox2.Text);
            int n = int.Parse(textBox3.Text);

            if (button1.Text == "开始")
            {
                thread = new Thread[n];
                d = new DD[n];
                txtLog.Clear();
                //DD.Comm.Connection = Conn;
                DD.f = this;
                for (i = 0; i < n; i++)
                {
                    Run(i, s + i, ee, n);
                }
                button1.Text = "停止";
            }
            else
            {
                try
                {
                    for (i = 0; i < n; i++)
                        thread[i].Abort();
                }
                catch { }
                button1.Text = "开始";
            }
        }

        public void Run(int i,int s,int e,int n)
        {
            d[i] = new DD();
            d[i].ID = i + 1;
            d[i].Url = txtUrl.Text;
            d[i].UrlID = s;
            d[i].EndUrlID = e;
            d[i].Step = n; // 线程数
            //d[i].Run();
            thread[i] = new Thread(new ThreadStart(d[i].Run));
            thread[i].Start();
        }

        delegate void SetTextCallBack(string txt);
        public int msgCount = 0;

        public void SetText(string str)
        {
            if (txtLog.InvokeRequired)
            {
                SetTextCallBack d = new SetTextCallBack(SetText);
                txtLog.Invoke(d, new object[] { str });
            }
            else
            {
                try
                {
                    if (++msgCount > 100)
                    {
                        msgCount = 1;
                        txtLog.Clear();
                    }
                    txtLog.Text += "\r\n" + str;
                    // 让光标定位到文本框末尾
                    txtLog.Select(txtLog.TextLength, 0);

                    //然后移动滚动条,使输入点(text entry point)(即光标所在的位置)显示出来
                    //这样也可以达到滚动到最下方的目的
                    txtLog.ScrollToCaret();
                }
                catch { }
            }
        }

        private void notifyIcon1_Click(object sender, EventArgs e)
        {
            this.Visible = !this.Visible;
        }
    }
}

 

DD.cs:

以下是代码片段:

using System;
using System.Collections.Generic;
using System.Text;

using System.Net;
using System.Text.RegularExpressions;
using System.Windows.Forms;
using System.Data;
using System.Data.OleDb;
using System.Data.SqlClient;

using System.Threading;

namespace ZLWeb
{
    class DD
    {
        // Worker number; FrmMain.Run sets it to i + 1. Run() refuses to start while it is below 1.
        // It prefixes the per-page log line and is stored with each record as tid.
        public int ID = 0;
        // URL template; GetUrlID replaces the literal "{#}" with the current page id formatted as "0000".
        public string Url = "";
        // Not referenced anywhere in the visible code.
        public string State = "";

        // HTTP client used by GetWeb; one per worker instance.
        private WebClient client = new WebClient();
        public int UrlID = 0;// start ID; advanced by Step on every GetUrlID call
        public int EndUrlID = 0; // end ID (inclusive: the loop continues while UrlID <= EndUrlID)
        public int Step = 1; // stride between page ids; FrmMain.Run sets it to the thread count
        // Concrete URL for the current UrlID, produced by GetUrlID.
        private string vUrl = "";
        // Body of the most recent download; GetWeb resets it to "" before each attempt.
        private string html;

        // Main form shared by all workers: supplies the database connection (f.Conn) and the log sink (f.SetText).
        static public FrmMain f;

        // Command used for inserts; Run() binds it to f.Conn.
        public OleDbCommand Comm = new OleDbCommand();
        //public SqlCommand Comm = new SqlCommand();

        // Earlier approach that extracted the page id from the URL with a regex; disabled.
        //private Regex rx = new Regex(@"guangxi(\d{4}).html", RegexOptions.Compiled | RegexOptions.IgnoreCase);

        public void Run()
        {
            if (ID < 1 || Url == "") return;

            Comm.Connection = f.Conn;
            //f.Conn.Close();
            //f.Conn.Open();
            /*
             if (!rx.IsMatch(Url)) return;
             Match m = rx.Match(Url);
             if (!m.Success) return;
             string str = m.Groups[1].Value;
             for (int k = 0; k < str.Length; k++)
             {
                 if (!Char.IsDigit(str, k)) return;
             }
             UrlID = Int32.Parse(str);

             str = "guangxi" + UrlID.ToString("0000") + ".html";
             vUrl = rx.Replace(Url, str);
              * */
            //if (!GetWeb()) return;
            UrlID -= Step;
            while (GetUrlID())
            {
                try
                {
                    WriteLog(ID.ToString("00") + " " + vUrl);
                    GetWeb();
                    ParseHTML();
                }
                catch (ThreadAbortException ex)
                {
                    WriteLog("有人让我马上结束 " + ex.ToString());
                }
                catch (Exception e)
                {
                    WriteLog(e.ToString());
                }
            }
            WriteLog("完成任务,退出线程");
        }

        private bool GetUrlID()
        {
            //UrlID++;
            UrlID += Step;
            vUrl = Url.Replace("{#}", UrlID.ToString("0000"));
            return (UrlID <= EndUrlID);
            /*
            while (ID.ToString()[0] != UrlID.ToString()[UrlID.ToString().Length - 1])
            {
                UrlID++;
                if (UrlID > 38) return false;
                if (UrlID > 8922) return false;
            }
            vUrl = rx.Replace(Url, "guangxi" + UrlID.ToString("0000") + ".html");
            vUrl = Url.Replace("{#}", UrlID.ToString("0000"));
            return true;
             */
        }

        private void GetWeb()
        {
            html = "";
            try
            {
                html = client.DownloadString(vUrl);
            }
            catch { }
        }

        static public void WriteLog(string log)
        {
            //FrmMain f = null;
            if (null == f)
            {
                foreach (Form frm in Application.OpenForms)
                {
                    if (frm is FrmMain)
                    {
                        f = frm as FrmMain;
                        break;
                    }
                }
            }
            if (f != null)
            {
                //f.SetText(ID.ToString("00") + " " + log);
                f.SetText(log);
            }
        }

        public void ParseHTML()
        {
            string str = "";
            string[] s;
            Regex r = new Regex(@"<div align=left>([\w\W]*?)</div>");
            MatchCollection ms = r.Matches(html);
            GSData g = new GSData();
            foreach (Match m in ms)
            {
                //WriteLog(m.Groups[1].Value);
                try
                {
                    str = m.Groups[1].Value;
                    str = str.Replace("\t", "").Replace("\n", "");
                    s = str.Split(new string[] { "<br>" }, StringSplitOptions.None);
                    g.gs = s[0].Substring("<b>·".Length, s[0].IndexOf("</b>") - "<b>·".Length).Trim();
                    g.fr = s[1].Substring("法人:".Length).Trim();
                    g.zy = s[2].Substring("主营:".Length).Trim();
                    g.tel = s[3].Substring("电话:".Length).Trim();
                    g.address = s[4].Substring("地址:".Length).Trim();
                    g.etype = s[5].Substring("经济类型:".Length).Trim();
                    g.v = s[6].Substring("生产产值:".Length).Trim();
                    g.num = s[7].Substring("人员数量:".Length).Trim();
                    g.y = s[8].Substring("开业年份:".Length).Trim();
                    g.urlid = UrlID;
                    g.tid = ID;

                    //WriteLog(g.gs);
                    ReadyToWrite(g);
                    //WriteLog(g.gs);
                }
                catch { }
            }
        }

        private void ReadyToWrite(GSData g)
        {
            //lock (this)
            {
                WriteToDB(g);
            }
        }

        public void WriteToDB(GSData g)
        {
            string sql;
            lock (this)
            {
                try
                {
                    sql = "Insert Into NL_GS(公司, 法人, 主营, 电话, 地址, 经济类型, 生产产值, 人员数量, 开业年份, UrlID, tid) values(";
                    sql += "'" + g.gs.Replace("\'", "").Replace("\"", "") + "',";
                    sql += "'" + g.fr.Replace("\'", "").Replace("\"", "") + "',";
                    sql += "'" + g.zy.Replace("\'", "").Replace("\"", "") + "',";
                    sql += "'" + g.tel.Replace("\'", "").Replace("\"", "") + "',";
                    sql += "'" + g.address.Replace("\'", "").Replace("\"", "") + "',";
                    sql += "'" + g.etype.Replace("\'", "").Replace("\"", "") + "',";
                    sql += "'" + g.v.Replace("\'", "").Replace("\"", "") + "',";
                    sql += "'" + g.num.Replace("\'", "").Replace("\"", "") + "',";
                    sql += "'" + g.y.Replace("\'", "").Replace("\"", "") + "',";
                    sql += "" + g.urlid.ToString() + ",";
                    sql += "" + g.tid.ToString() + ")";
                    Comm.CommandText = sql;
                    Comm.ExecuteNonQuery();
                }
                catch (Exception e)
                {
                    WriteLog(e.ToString());
                    //Comm.Dispose();
                    //Comm = new SqlCommand();
                    //Comm.Connection = f.Conn;
                    //f.Conn.Close();
                    //f.Conn.Open();
                }
            }
        }
    }

    class GSData
    {
        public string gs = "";
        public string fr = "";
        public string zy = "";
        public string tel = "";
        public string address = "";
        public string etype = "";
        public string v = "";
        public string num = "";
        public string y = "";
        public int urlid = 0;
        public int tid = 0;
    }
}

项目包和数据库

ZL_20061110105809.rar


评论
dionysus 2008-6-18 11:16:37
可否加些注释啊!看不明白!

发表评论
网名:
评论:
    
新生命 CMS1.0 Build0920 版权所有 All Copyrights @2006 桂ICP备06011573号
站长:大石头 信箱:gxuhy@21cn.com QQ:99363590