最新文章專(zhuān)題視頻專(zhuān)題問(wèn)答1問(wèn)答10問(wèn)答100問(wèn)答1000問(wèn)答2000關(guān)鍵字專(zhuān)題1關(guān)鍵字專(zhuān)題50關(guān)鍵字專(zhuān)題500關(guān)鍵字專(zhuān)題1500TAG最新視頻文章推薦1 推薦3 推薦5 推薦7 推薦9 推薦11 推薦13 推薦15 推薦17 推薦19 推薦21 推薦23 推薦25 推薦27 推薦29 推薦31 推薦33 推薦35 推薦37視頻文章20視頻文章30視頻文章40視頻文章50視頻文章60 視頻文章70視頻文章80視頻文章90視頻文章100視頻文章120視頻文章140 視頻2關(guān)鍵字專(zhuān)題關(guān)鍵字專(zhuān)題tag2tag3文章專(zhuān)題文章專(zhuān)題2文章索引1文章索引2文章索引3文章索引4文章索引5123456789101112131415文章專(zhuān)題3
問(wèn)答文章1 問(wèn)答文章501 問(wèn)答文章1001 問(wèn)答文章1501 問(wèn)答文章2001 問(wèn)答文章2501 問(wèn)答文章3001 問(wèn)答文章3501 問(wèn)答文章4001 問(wèn)答文章4501 問(wèn)答文章5001 問(wèn)答文章5501 問(wèn)答文章6001 問(wèn)答文章6501 問(wèn)答文章7001 問(wèn)答文章7501 問(wèn)答文章8001 問(wèn)答文章8501 問(wèn)答文章9001 問(wèn)答文章9501
當前位置: 首頁 - 科技 - 知識百科 - 正文

asp.net(c#)做一個網頁數據采集工具

來(lái)源:懂視網(wǎng) 責(zé)編:小采 時(shí)間:2020-11-27 22:43:37
文檔

asp.net(c#)做一個(gè)網(wǎng)頁(yè)數(shù)據(jù)采集工具

asp.net(c#)做一個(gè)網(wǎng)頁(yè)數(shù)據(jù)采集工具:通過(guò)這個(gè)軟件一兩天就完成了幾千產(chǎn)品數(shù)據(jù)的錄入,可見(jiàn)很多工作不是一味用人工去做,作為一個(gè)程序員,就是要讓很多讓那些經(jīng)常做重復(fù)性的、繁瑣的工作中的人解放出來(lái)。下面只是寫(xiě)了一些核心代碼,而且采集必須要和對(duì)應(yīng)網(wǎng)站相掛鉤,作者:鄭少群 代碼如下://提取
推薦度:
導(dǎo)讀asp.net(c#)做一個(gè)網(wǎng)頁(yè)數(shù)據(jù)采集工具:通過(guò)這個(gè)軟件一兩天就完成了幾千產(chǎn)品數(shù)據(jù)的錄入,可見(jiàn)很多工作不是一味用人工去做,作為一個(gè)程序員,就是要讓很多讓那些經(jīng)常做重復(fù)性的、繁瑣的工作中的人解放出來(lái)。下面只是寫(xiě)了一些核心代碼,而且采集必須要和對(duì)應(yīng)網(wǎng)站相掛鉤,作者:鄭少群 代碼如下://提取

通過這個軟件一兩天就完成了幾千產品數據的錄入,可見很多工作不是一味用人工去做,作為一個程序員,就是要讓那些經常做重複性的、繁瑣的工作的人解放出來。下面只是寫了一些核心代碼,而且采集必須要和對應網站相掛鉤,作者:鄭少群

代碼如下:


/// <summary>
/// Downloads a product list page, extracts every <c>href</c> link found in it and writes
/// the resulting absolute URLs to <c>textBox5</c>, one per line.
/// </summary>
/// <remarks>
/// Both <c>textBox1</c> and <c>textBox2</c> must be non-empty. Per the validation message
/// they hold the page URL and the site domain; <c>textBox2</c> is used as the prefix for
/// root-relative links.
/// </remarks>
/// <param name="sender">The control that raised the click event.</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    string pageUrl = textBox1.Text.Trim();
    string domain = textBox2.Text.Trim();

    if (pageUrl == "" || domain == "")
    {
        MessageBox.Show("網(wǎng)址和域名不能為空!", "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
        return;
    }

    try
    {
        // Fetch the page the user entered. The original validated textBox1 but then
        // ignored it and always downloaded a hard-coded address.
        string Html = inc.GetHtml(pageUrl);

        // Extract every quoted href attribute (deduplicated and sorted by the helper).
        ArrayList al = inc.GetMatchesStr(Html, @"href\s*=\s*(?:[\'\""\s](?<1>[^\""\']*)[\'\""])");

        StringBuilder sb = new StringBuilder();
        foreach (object var in al)
        {
            // Each entry is the whole match, e.g. href="/p/1.html": drop the quotes first.
            string a = var.ToString().Replace("\"", "").Replace("'", "");

            // Strip the leading attribute name. Whitespace around '=' is allowed because
            // the extraction pattern above accepts it as well.
            a = Regex.Replace(a, @"^\s*href\s*=\s*", "", RegexOptions.IgnoreCase).Trim();

            // Root-relative link: prefix it with the configured domain.
            if (a.StartsWith("/", StringComparison.Ordinal))
                a = domain + a;

            // Only add a scheme when the link has neither http:// nor https://.
            bool hasScheme = a.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                || a.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
            if (!hasScheme)
                a = "http://" + a;

            sb.Append(a).Append("\r\n");
        }

        // Output the extracted URLs to the text box, one link per line.
        textBox5.Text = sb.ToString();

        MessageBox.Show("共提取" + al.Count.ToString() + "個(gè)鏈接", "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
    }
    catch (Exception err)
    {
        // Report the failure to the user instead of crashing the UI.
        MessageBox.Show("提取出錯(cuò)!原因:" + err.Message, "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
    }
}


/// <summary>
/// Background job: downloads every product page listed in <c>textBox5</c>, cuts the
/// product name, model id, introduction and image address out of the HTML with plain
/// string operations, downloads the image into the local <c>Images</c> folder and saves
/// the collected rows to the <c>Tb_Product</c> table through OLE DB (the original author
/// describes the target as a local Access database).
/// </summary>
/// <remarks>
/// The HTML markers used below are specific to the scraped site and have to be adapted
/// for any other site.
/// NOTE(review): this handler reads and writes UI members (textBox5, textBox2,
/// toolStripStatusLabel1) and calls DataBind/ShowError from the worker thread; that
/// probably belongs in the ProgressChanged / RunWorkerCompleted handlers - confirm.
/// </remarks>
/// <param name="sender">The <see cref="BackgroundWorker"/> running this job.</param>
/// <param name="e">Work arguments (unused).</param>
private void backgroundWorker1_DoWork(object sender, DoWorkEventArgs e)
{
    // Start from an empty product table.
    Database.ExecuteNonQuery("delete from Tb_Product");
    DataTable dt2 = new DataTable();
    OleDbConnection conn = new OleDbConnection(Database.ConnectionStrings);
    OleDbDataAdapter da = new OleDbDataAdapter("select * from Tb_Product", conn);
    // Not referenced again, but must stay: the command builder attaches itself to the
    // adapter and generates the INSERT command that da.Update() needs.
    OleDbCommandBuilder cb = new OleDbCommandBuilder(da);
    da.Fill(dt2); // loads the table schema so NewRow() has the right columns
    dt2.Rows.Clear();

    BackgroundWorker worker = (BackgroundWorker)sender; // used for progress reporting

    string[] Urls = textBox5.Text.Trim().ToLower().Replace("\r\n", ",").Split(',');
    StringBuilder ErrorStr = new StringBuilder();
    string html = "", ImageDir = AppDomain.CurrentDomain.BaseDirectory + "Images\\";

    // Process one URL per iteration.
    for (int i = 0; i < Urls.Length; i++)
    {
        // Stop as soon as cancellation is requested; rows gathered so far are still saved.
        if (worker.CancellationPending)
            break;

        try
        {
            // Skip blank lines. The original returned here, which abandoned every row
            // collected so far after the table had already been emptied.
            if (Urls[i] == "")
                continue;

            html = inc.GetHtml(Urls[i]); // HTML source of this product page
            DataRow NewRow = dt2.NewRow();

            // Product name: the text between the title tags.
            string ProductName = html.Substring(html.IndexOf("<title>") + 7);
            ProductName = ProductName.Remove(ProductName.IndexOf("</title>")).Trim();
            NewRow["ProductName"] = ProductName;

            // Model id: whatever follows "Model:" inside the product name.
            NewRow["ModelId"] = ProductName.Substring(ProductName.IndexOf("Model:") + 6).Trim();

            // Introduction: from shortly after "Product Details" through the closing table tag.
            string Introduce = html.Substring(html.IndexOf("Product Details") + 26);
            Introduce = Introduce.Remove(Introduce.IndexOf("</table>") + 8).Trim();
            NewRow["Introduce"] = Introduce;

            // Image: the src attribute of the first centered img tag, prefixed with the domain.
            string ProductImage = html.Substring(html.IndexOf("align=center><img") + 17);
            ProductImage = textBox2.Text.Trim() + ProductImage.Substring(ProductImage.IndexOf("src=\"") + 5);
            ProductImage = ProductImage.Remove(ProductImage.IndexOf("\""));
            string localImagePath = ImageDir + ProductImage.Substring(ProductImage.LastIndexOf("/") + 1);
            try
            {
                inc.DownFile(ProductImage, localImagePath);
            }
            catch (Exception)
            {
                // A failed image download is recorded but does not discard the product row.
                ErrorStr.Append("下載圖片失敗,圖片地址:" + localImagePath + "\r\n");
            }

            dt2.Rows.Add(NewRow);

            worker.ReportProgress((i + 1) * 100 / Urls.Length, i);
            toolStripStatusLabel1.Text = "處理進(jìn)度:" + (i + 1).ToString() + "/" + Urls.Length.ToString(); // progress text
        }
        catch (Exception err)
        {
            // Record the failure for this URL and carry on with the next one.
            ErrorStr.Append("采集錯(cuò)誤:" + err.Message + ";網(wǎng)址:" + Urls[i] + "\r\n");
        }
    }

    da.Update(dt2);
    DataBind(dt2);
    ShowError(ErrorStr.ToString());
}

/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its body as text.
/// </summary>
/// <remarks>
/// The body is decoded with the character set announced by the response. When the
/// response announces none, or one this runtime does not recognise, UTF-8 is used
/// instead of failing.
/// </remarks>
/// <param name="url">Address of the page to download.</param>
/// <returns>The complete response body.</returns>
public static string GetHtml(string url)
{
    WebRequest request = WebRequest.Create(url);

    // Dispose response, stream and reader even when reading throws, so the
    // underlying connection is always released.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream body = response.GetResponseStream())
    {
        Encoding encoding = Encoding.UTF8;
        string charset = response.CharacterSet;
        if (!string.IsNullOrEmpty(charset))
        {
            try
            {
                encoding = Encoding.GetEncoding(charset);
            }
            catch (ArgumentException)
            {
                // Unknown or malformed charset name: keep the UTF-8 fallback.
            }
        }

        using (StreamReader sr = new StreamReader(body, encoding))
        {
            return sr.ReadToEnd();
        }
    }
}


/// <summary>
/// Returns every distinct substring of <paramref name="htmlCode"/> that matches
/// <paramref name="strRegex"/>, sorted in the list's default order.
/// </summary>
/// <param name="htmlCode">Text to search (typically the HTML source of a page).</param>
/// <param name="strRegex">Regular expression, applied with IgnoreCase and Multiline.</param>
/// <returns>An <see cref="ArrayList"/> of strings holding the unique full-match texts.</returns>
public static ArrayList GetMatchesStr(string htmlCode, string strRegex)
{
    ArrayList unique = new ArrayList();
    Regex pattern = new Regex(strRegex, RegexOptions.IgnoreCase | RegexOptions.Multiline);

    foreach (Match hit in pattern.Matches(htmlCode))
    {
        string text = hit.Value;

        // Keep only the first occurrence of each matched text.
        if (!unique.Contains(text))
        {
            unique.Add(text);
        }
    }

    unique.Sort();
    return unique;
}

/// <summary>
/// Downloads the resource at <paramref name="Url"/> and writes it to the file
/// <paramref name="Path"/>.
/// </summary>
/// <remarks>
/// An existing file is replaced. The original opened it with OpenOrCreate, which does
/// not truncate, so overwriting a longer file left stale bytes after the new content.
/// </remarks>
/// <param name="Url">Address of the file to download.</param>
/// <param name="Path">Local path of the file to create or overwrite.</param>
public static void DownFile(string Url, string Path)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);

    // Dispose the response and both streams even when the copy fails midway.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream stream = response.GetResponseStream())
    using (FileStream fs = new FileStream(Path, FileMode.Create, FileAccess.Write))
    {
        byte[] buffer = new byte[1024];
        int n;

        // Read returns 0 once the response body is exhausted.
        while ((n = stream.Read(buffer, 0, buffer.Length)) > 0)
        {
            fs.Write(buffer, 0, n);
        }
    }
}

聲明:本網頁內容旨在傳播知識,若有侵權等問題請及時與本網聯繫,我們將在第一時間刪除處理。TEL:177 7030 7066 E-MAIL:11247931@qq.com

文檔

asp.net(c#)做一個(gè)網(wǎng)頁(yè)數(shù)據(jù)采集工具

asp.net(c#)做一個(gè)網(wǎng)頁(yè)數(shù)據(jù)采集工具:通過(guò)這個(gè)軟件一兩天就完成了幾千產(chǎn)品數(shù)據(jù)的錄入,可見(jiàn)很多工作不是一味用人工去做,作為一個(gè)程序員,就是要讓很多讓那些經(jīng)常做重復(fù)性的、繁瑣的工作中的人解放出來(lái)。下面只是寫(xiě)了一些核心代碼,而且采集必須要和對(duì)應(yīng)網(wǎng)站相掛鉤,作者:鄭少群 代碼如下://提取
推薦度:
標(biāo)簽: 做一個(gè) c# ASP.NET
  • 熱門(mén)焦點(diǎn)

最新推薦

猜你喜歡

熱門(mén)推薦

專(zhuān)題
Top