我来做百科(第二十天) B

Tag 系统完成了，再修复一些添加词条、修改内容的问题，就可以做数据采集了。

网上关于数据采集的资料很多，再结合自己的需要，写了以下代码：

我来做百科(第二十天) B

/// <summary>
/// Click handler that walks four list pages, downloads every "/view/NNN.htm" page they
/// link to, stores each one through <c>Lemma.AddLemma</c>, and streams a progress line
/// per entry to the response.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
/// <remarks>
/// <c>strurl</c> and <c>u</c> are defined outside this block; presumably the list-page URL
/// prefix and the site root that relative links are appended to — confirm against the
/// rest of the class.
/// </remarks>
protected void Button1_Click(object sender, EventArgs e)
{
    Lemma lemma = new Lemma();
    Response.Write("采集结果： ");
    Response.Flush();

    // Four list pages: the URL suffix is i * 10, i.e. 0, 10, 20 and 30.
    for (int i = 0; i <= 3; i++)
    {
        string sUrl = strurl + (i * 10).ToString();
        Response.Write("采集url：" + sUrl + " ");
        Response.Flush();

        foreach (string temp in GetHtmls(@"/view/\d+\.htm", GetUrlHtml(sUrl)))
        {
            string url = u + temp;
            string sHtml = GetUrlHtml(url);
            string sLemma = GetLemma(sHtml);
            string sDetail = GetDetail(sHtml);
            string sTag = GetTag(sHtml);

            // The handler treats a positive return value as the new entry id and
            // anything else as an error code.
            int idLemma = lemma.AddLemma(sLemma, sDetail, "cloud", 0, string.Empty, url, sTag);

            StringBuilder sb = new StringBuilder();
            sb.Append("id：").Append(idLemma).Append(" 词条：").Append(sLemma).Append(" ");
            // The href attribute is closed with exactly one quote so the anchor is valid HTML.
            sb.Append("Tag：").Append(sTag).Append(" 连接：<a href='").Append(url).Append("' target='_blank'>").Append(url).Append("</a> ");

            if (idLemma > 0)
            {
                sb.Append("成功！").Append(" <a href='../index/show.aspx?id=").Append(idLemma).Append("' target='_blank'>查看</a>");
            }
            else
            {
                sb.Append("失败！错误代码：").Append(idLemma);
            }

            sb.Append(" ");

            // Flush after every entry so progress shows up while the crawl is still running.
            Response.Write(sb.ToString());
            Response.Flush();
        }
    }
}

我来做百科(第二十天) B

/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes the bytes with
/// <see cref="Encoding.Default"/>.
/// </summary>
/// <param name="url">Address to download; the same value is sent as the Referer header.</param>
/// <returns>
/// The decoded response body, or an empty string when the download fails for any reason.
/// </returns>
public static string GetUrlHtml(string url)
{
    string output = "";
    Encoding encode = Encoding.Default;

    // WebClient is IDisposable; the using block releases it even when the download throws.
    using (WebClient webclient = new WebClient())
    {
        try
        {
            webclient.Headers.Add("Referer", url);
            byte[] buff = webclient.DownloadData(url);
            output = encode.GetString(buff);
        }
        catch (Exception ex)
        {
            // Deliberately best-effort: callers rely on "" for a failed download,
            // so the failure is traced instead of being rethrown.
            System.Diagnostics.Trace.TraceWarning("GetUrlHtml failed for {0}: {1}", url, ex.Message);
        }
    }

    return output;
}

我来做百科(第二十天) B

/// <summary>
/// Returns the text between the first match of <paramref name="begin"/> and the nearest
/// following match of <paramref name="end"/> in <paramref name="content"/>.
/// </summary>
/// <param name="begin">Regular-expression fragment that marks the start (not a literal).</param>
/// <param name="end">Regular-expression fragment that marks the end (not a literal).</param>
/// <param name="content">Text to search.</param>
/// <returns>The captured text, or an empty string when nothing matches.</returns>
public static string GetHtml(string begin, string end, string content)
{
    // [\s\S]*? lazily spans any characters including line breaks. It avoids a nested
    // quantifier such as ((.*?\n?)*?), which backtracks exponentially when the end
    // marker is missing from the content.
    return GetHtml(begin + "([\\s\\S]*?)" + end, content);
}

我来做百科(第二十天) B

/// <summary>
/// Runs <paramref name="pattern"/> against <paramref name="content"/> and returns the
/// text captured by group 1 of the first match.
/// </summary>
/// <param name="pattern">Regular expression; its first capture group is the result.</param>
/// <param name="content">Text to search; null or empty yields an empty result.</param>
/// <returns>Group 1 of the first match, or an empty string when there is no match.</returns>
public static string GetHtml(string pattern, string content)
{
    // Nothing can be captured from null or empty text.
    if (string.IsNullOrEmpty(content))
    {
        return string.Empty;
    }

    Match match = Regex.Match(content, pattern);

    // Match.Success is the documented way to test for a hit.
    return match.Success ? match.Groups[1].Value : string.Empty;
}

我来做百科(第二十天) B

/// <summary>
/// Returns every span of <paramref name="content"/> that starts with a match of
/// <paramref name="begin"/> and runs to the nearest following match of <paramref name="end"/>.
/// </summary>
/// <param name="begin">Regular-expression fragment that marks the start (not a literal).</param>
/// <param name="end">Regular-expression fragment that marks the end (not a literal).</param>
/// <param name="content">Text to search.</param>
/// <returns>The full text of each match, begin and end markers included.</returns>
public static StringCollection GetHtmls(string begin, string end, string content)
{
    // [\s\S]*? lazily spans any characters including line breaks. It avoids a nested
    // quantifier such as ((.*?\n?)*?), which backtracks exponentially when the end
    // marker is missing from the content.
    return GetHtmls(begin + "([\\s\\S]*?)" + end, content);
}

我来做百科(第二十天) B

/// <summary>
/// Collects the full text of every match of <paramref name="pattern"/> in
/// <paramref name="content"/>, in order of appearance.
/// </summary>
/// <param name="pattern">Regular expression to search for.</param>
/// <param name="content">Text to search; null yields an empty collection.</param>
/// <returns>One entry per match (the whole match, not a capture group); never null.</returns>
public static StringCollection GetHtmls(string pattern, string content)
{
    StringCollection list = new StringCollection();

    if (content == null)
    {
        return list;
    }

    // Every item enumerated from a MatchCollection is a successful match,
    // so no per-item success check is needed.
    foreach (Match match in Regex.Matches(content, pattern))
    {
        list.Add(match.Value);
    }

    return list;
}

我来做百科(第二十天) B

/// <summary>
/// Replaces every match of a regular expression in the input text.
/// </summary>
/// <param name="input">Text to process.</param>
/// <param name="pattern">Regular expression, applied with IgnoreCase and Multiline.</param>
/// <param name="replacement">Replacement string (regex substitution syntax applies).</param>
/// <returns>The processed text, or an empty string when the input is null or empty.</returns>
public static string ReplaceText(string input, string pattern, string replacement)
{
    if (string.IsNullOrEmpty(input))
    {
        return string.Empty;
    }

    // Multiline only changes how ^ and $ anchor; '.' still does not match a line break.
    // The static overload goes through the framework's regex cache.
    return Regex.Replace(input, pattern, replacement, RegexOptions.IgnoreCase | RegexOptions.Multiline);
}

我来做百科(第二十天) B

/// <summary>
/// Removes every element with the given tag name, including everything between its
/// opening and closing tags.
/// </summary>
/// <param name="input">Markup to clean.</param>
/// <param name="tag">Tag name; it is inserted into the pattern as-is, not escaped.</param>
/// <returns>The markup with the matching elements removed.</returns>
public static string ClearWholeTag(string input, string tag)
{
    // (?![a-zA-Z0-9]) ends the tag name, so "b" does not also match <br> or <body>.
    // [\s\S]*? is used for the element body because '.' stops at line breaks and
    // would leave multi-line elements in place.
    return ReplaceText(input, @"<" + tag + @"(?![a-zA-Z0-9])[^>]*?>[\s\S]*?</" + tag + ">", "");
}

我来做百科(第二十天) B

/// <summary>
/// Removes the opening and closing tags with the given name but keeps the content
/// between them.
/// </summary>
/// <param name="input">Markup to clean.</param>
/// <param name="tag">Tag name; it is inserted into the pattern as-is, not escaped.</param>
/// <returns>The markup with the matching tags removed.</returns>
public static string ClearTag(string input, string tag)
{
    // (?![a-zA-Z0-9]) ends the tag name, so "a" does not also strip <abbr> or <address>.
    return ReplaceText(input, @"<\/?" + tag + @"(?![a-zA-Z0-9])[^>]*>", "");
}

我来做百科(第二十天) B

/// <summary>
/// Removes every opening and closing tag from the input and keeps the text between them.
/// </summary>
/// <param name="input">Markup to clean.</param>
/// <returns>The input with all tags stripped; empty when the input is null or empty.</returns>
public static string ClearAllTag(string input)
{
    // Matches "<" or "</" followed by a letter-only name and anything up to the next ">".
    // Constructs that do not start with a letter (comments, doctype) are left alone.
    return ReplaceText(input, @"<\/?[a-zA-Z]+[^>]*>", "");
}

我来做百科(第二十天) B

数据采集就是爽，先来三百多条吧，哈哈。

我来做百科(第二十天) B

继续阅读

基金恒市值定投源码

#为什么台风“卡努”的预报路径成迷#关于台风预报路径“卡努”成迷小编分析可能有以下几个原因：1.台风路径难以准确预测：台

python爬虫第1章 urllib库（一） urllib库概述python爬虫第1章 urllib库（一） urllib库概述

【DrissionPage】DrissionPage是一个基于python的网页自动化工具。它既能控制浏览器，也能收发数

第二章 K8s ingress控制器

华汇项目后评价系统助力政企项目“后评价”时代

【示波器】基于FPGA的数字示波器设计实现

电池巡检单元通用型4节电池电压和温度监测:☞4节电池的电压监测、4路电池的温度监测、1路继电器输出、CAN-BUS级联通

利用PLC-Recorder的录波和虚拟变量功能，实现PLC采集数据的后处理或复杂计算1、用Ana打开待分析的数据文件，获得原始的数据和曲线2、添加虚拟变量3、退出配置窗口即可得到总流量曲线4、小结

数据的采集

自动快捷使用数据采集器采集某网站数据---后裔采集器

专业PLC数据采集软件PLC-Recorder通过ADS通讯进行倍福TwinCAT2和TwubCAT3数据采集的介绍一、通道配置二、变量配置三、通过左侧的按钮进行启停控制四、调用离线分析软件，进行后续数据分析五、小结

数据采集过程介绍简介数据的总体采集过程如下：

工厂流水线数据采集方案工厂流水线数据采集方案2、数据采集层

2020年9月，星闪联盟正式成立。从正式启动标准化工作到首次商用仅用了两年多的时间，成为史上发展最快的近距离无线技术。华

flume实时写数据到HA模式下的hdfs