//由於我所抓取的網頁有多個困難:1、以上三種方式都無法擷取到源碼,無論設定何種請求標頭都沒用;2、單獨只用 WebBrowser 無法進行翻頁操作。所以用 WebBrowser 與 IE 結合來抓取。
//本項目流程——先打開 IE,再用 MSHTML 裡的方法操作 IE 表單進行翻頁,將清單中的網址在 WebBrowser 中一一打開,這樣才擷取得到源碼。
這個項目的意義在于,無論網站是何種方式加載,都可以抓取到内容。
項目源碼在文章最後。
// Region names written into the site's query form; province[i] pairs with provinceCode[i].
string[] province = { "北京市", "天津市", "河北省", "山西省", "内蒙古", "遼甯省", "吉林省", "黑龍江省", "上海市", "江蘇省", "浙江省", "安徽省", "福建省", "江西省", "山東省", "河南省", "湖北省", "湖南省", "廣東省", "廣西壯族", "海南省", "重慶市", "四川省", "貴州省", "雲南省", "西藏", "陝西省", "甘肅省", "青海省", "甯夏回族", "新疆維吾爾", "新疆建設兵團" };
// Numeric region codes, index-aligned with province. Hainan (index 20) is 46;
// it previously repeated Guangxi's 45, so Hainan queries were sent with the wrong code.
int[] provinceCode = { 11, 12, 13, 14, 15, 21, 22, 23, 31, 32, 33, 34, 35, 36, 37, 41, 42, 43, 44, 45, 46, 50, 51, 52, 53, 54, 61, 62, 63, 64, 65, 66 };
//private Thread Thread_land;
/// <summary>
/// Click handler for button1: starts <see cref="appStart"/> through an asynchronous
/// delegate invocation and returns immediately without waiting for it to finish.
/// </summary>
private void button1_Click(object sender, EventArgs e)
{
    Action crawl = appStart;
    crawl.BeginInvoke(null, null);
}
/// <summary>
/// Most recent crawl position, formatted "province,provinceCode,year,page".
/// appStart loads it from the log table when it starts and stores it again when it stops.
/// </summary>
string TaskProgress = "";

/// <summary>
/// Crawl worker. For every open IE window showing the landchina query page it walks
/// province x year, fills the query form by injecting a script, clicks the query button,
/// then pages through the result list and hands each list page to analysisSource.
/// Errors and the final progress are written to the CJLog table.
/// </summary>
private void appStart()
{
    try
    {
        Action at = new Action(() => { textBox1.Text = "采集開始;\r\n"; labCount.Text = "0"; timer1.Enabled = true; labStartTime.Text = DateTime.Now.ToString(); });
        this.Invoke(at);

        // Defaults for a fresh crawl. Paging starts at 2 because page 1 is analysed
        // right after the query button is clicked.
        int proIndex = 0;
        int yearIndex = 2011;
        int pageIndex = 2;

        // Resume from the stored progress only when every field is usable; otherwise
        // start from the beginning instead of failing on a bad index or number.
        LandDB.BLL.CJLog cjlogBLL = new LandDB.BLL.CJLog();
        TaskProgress = cjlogBLL.GetTaskProgress();
        if (!string.IsNullOrEmpty(TaskProgress))
        {
            string[] s = TaskProgress.Split(',');
            int savedProvince = Array.IndexOf(province, s[0]);
            int savedYear;
            int savedPage;
            if (s.Length >= 4
                && savedProvince >= 0
                && TryParseProgressYear(s[2], out savedYear)
                && int.TryParse(s[3], out savedPage))
            {
                proIndex = savedProvince;
                yearIndex = savedYear;
                pageIndex = savedPage;
            }
        }

        // Enumerate all shell windows and act only on the landchina query page.
        SHDocVw.ShellWindows IETabs = new SHDocVw.ShellWindows();
        foreach (SHDocVw.InternetExplorer ieTab in IETabs)
        {
            string location = ieTab.LocationURL;
            if (location == null || !location.Contains("www.landchina.com/default.aspx"))
            {
                continue;
            }

            // '<' (not '<='): province[province.Length] does not exist.
            for (int i = proIndex; i < province.Length; i++)
            {
                for (int y = yearIndex; y <= DateTime.Now.Year; y++)
                {
                    // Fill in the query form through an injected script.
                    mshtml.HTMLDocument doc = ieTab.Document as mshtml.HTMLDocument;
                    mshtml.IHTMLScriptElement script = doc.createElement("script") as mshtml.IHTMLScriptElement;
                    script.text = string.Format("document.getElementById('TAB_queryTblEnumItem_227').value='{0}';", province[i]);
                    script.text += string.Format("document.getElementById('TAB_queryTblEnumItem_227_v').value={0};", provinceCode[i]);
                    script.text += "document.getElementById('TAB_QueryConditionItem227').checked = true;";
                    script.text += "document.getElementById('TAB_QueryConditionItem268').checked = true;";
                    DateTime dtbegin = new DateTime(y, 1, 1);
                    DateTime dtend = new DateTime(y, 12, 31);
                    // NOTE(review): the dates are formatted with the current culture's default
                    // DateTime format - confirm this is the format the site expects.
                    script.text += string.Format("document.getElementById('TAB_queryDateItem_268_1').value='{0}';", dtbegin);
                    script.text += string.Format("document.getElementById('TAB_queryDateItem_268_2').value='{0}';", dtend);
                    mshtml.HTMLBody body = doc.body as mshtml.HTMLBody;
                    body.appendChild((mshtml.IHTMLDOMNode)script);

                    // Click the query button.
                    mshtml.IHTMLDocument2 doc2 = (mshtml.IHTMLDocument2)ieTab.Document;
                    mshtml.IHTMLElementCollection inputs;
                    inputs = (mshtml.IHTMLElementCollection)doc2.all.tags("INPUT");
                    mshtml.IHTMLElement element_post = (mshtml.IHTMLElement)inputs.item("TAB_QueryButtonControl", 0);
                    element_post.click();

                    // NOTE(review): nothing waits for the page to reload after the click
                    // (or after GoPage below) - confirm the document is current at this point.
                    doc = ieTab.Document as mshtml.HTMLDocument;
                    analysisSource(doc.body.innerHTML);

                    Regex re = new Regex("共([0-9]{1,5})頁([\\s\\ ]*共[0-9]{1,20})條記錄", RegexOptions.Multiline);
                    Match ma = re.Match(doc.body.innerHTML);
                    string pages = ma.Groups[1].ToString();

                    // A result without a pager has no further pages; this must not abort
                    // the whole crawl with a FormatException.
                    int pageCount;
                    if (!int.TryParse(pages, out pageCount))
                    {
                        pageCount = 0;
                    }

                    for (int j = pageIndex; j <= pageCount; j++)
                    {
                        // Store the year itself so the value can be parsed back on resume.
                        TaskProgress = province[i] + "," + provinceCode[i] + "," + y + "," + j;
                        script = doc.createElement("script") as mshtml.IHTMLScriptElement;
                        script.text = string.Format("QueryAction.GoPage('TAB',{0})", j);
                        body = doc.body as mshtml.HTMLBody;
                        body.appendChild((mshtml.IHTMLDOMNode)script);
                        analysisSource(doc.body.innerHTML);
                        Action at1 = new Action(() => { textBox1.Text = ""; });
                        this.Invoke(at1);
                    }
                    // Only the first resumed year continues from the saved page.
                    pageIndex = 2;
                }
                // Only the first resumed province continues from the saved year.
                yearIndex = 2011;
            }
        }
    }
    catch (Exception ex)
    {
        Action at = new Action(() => { textBox1.Text += ex.Message.ToString() + "\r\n"; });
        this.Invoke(at);
        // Record the error in the database.
        LandDB.BLL.CJLog bll = new LandDB.BLL.CJLog();
        LandDB.Model.CJLog model = new LandDB.Model.CJLog();
        model.FSourceContent = "appStart:" + ex.Message.ToString();
        model.Furl = GetCurrentBrowserUrl();
        bll.Add(model);
    }
    finally
    {
        Action at = new Action(() => { textBox1.Text += "采集停止;\r\n"; timer1.Enabled = false; });
        this.Invoke(at);
        // Record the progress in the database.
        LandDB.BLL.CJLog bll = new LandDB.BLL.CJLog();
        LandDB.Model.CJLog model = new LandDB.Model.CJLog();
        model.TaskProgress = TaskProgress;
        model.FSourceContent = "appStart:";
        model.Furl = GetCurrentBrowserUrl();
        bll.Add(model);
        // The former this.EndInvoke(null) was removed: Control.EndInvoke requires the
        // IAsyncResult of a Control.BeginInvoke call and throws for null.
    }
}

/// <summary>
/// Reads the year field of a stored progress string. Accepts a plain year as well as the
/// older form where a full date had been stored in that field.
/// </summary>
private static bool TryParseProgressYear(string text, out int year)
{
    if (int.TryParse(text, out year))
    {
        return true;
    }
    DateTime legacy;
    if (DateTime.TryParse(text, out legacy))
    {
        year = legacy.Year;
        return true;
    }
    year = 0;
    return false;
}

/// <summary>
/// Returns the URL currently shown in webBrowser1, read on the UI thread, or an empty
/// string when there is no document or the read fails.
/// </summary>
private string GetCurrentBrowserUrl()
{
    try
    {
        Func<string> readUrl = () =>
        {
            var current = webBrowser1.Document;
            return (current != null && current.Url != null) ? current.Url.ToString() : "";
        };
        return this.InvokeRequired ? (string)this.Invoke(readUrl) : readUrl();
    }
    catch (Exception)
    {
        // Used only while logging; a failure here must not hide the error being logged.
        return "";
    }
}
/// <summary>
/// Extracts the detail-page links from a result-list page and opens each of them in
/// webBrowser1, pausing three seconds after each navigation.
/// </summary>
/// <param name="source">HTML of the result list; null or empty input is ignored.</param>
private void analysisSource(string source)
{
    if (string.IsNullOrEmpty(source))
    {
        return;
    }

    // Remove line breaks so a link tag always sits on one line.
    // (A former source.Replace("\\r\\n", "") discarded its result and was dropped.)
    source = Regex.Replace(source, "([\\r\\n])[\\s]+", "", RegexOptions.IgnoreCase | RegexOptions.Multiline);
    source = Regex.Replace(source, "\\n", "", RegexOptions.IgnoreCase | RegexOptions.Multiline);

    Regex reg = new Regex("<a([^\\/>href]+)href=\"/DesktopModule/BizframeExtendMdl/workList/bulWorkView.aspx?([^\"]+)\"([^\\/]*)\\/?>", RegexOptions.Multiline | RegexOptions.IgnoreCase);
    Regex hrefPattern = new Regex("href=\"([^\"]+)\"");

    foreach (Match mat in reg.Matches(source))
    {
        // Attribute values in serialized HTML carry "&amp;"; decode it so the query
        // string reaches the server intact (the previous Replace("&", "&") did nothing).
        string href = hrefPattern.Match(mat.Value).Groups[1].Value.Replace("&amp;", "&");
        string url = "http://www.landchina.com" + href;

        Action at = new Action(() =>
        {
            webBrowser1.Navigate(url);
            textBox1.Text += url + "\r\n";
        });
        this.Invoke(at);

        // Fixed pause between navigations, as before.
        Thread.Sleep(3000);
    }
}
/// <summary>
/// DocumentCompleted handler of webBrowser1: cuts the notice section out of the loaded
/// page and passes it to getParam; empty pages and errors are written to CJLog.
/// </summary>
private void webBrowser1_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
    // Resolved once so that the error path never has to dereference the document again.
    string pageUrl = "";
    try
    {
        if (webBrowser1.Document != null && webBrowser1.Document.Url != null)
        {
            pageUrl = webBrowser1.Document.Url.ToString();
        }
        else if (e != null && e.Url != null)
        {
            pageUrl = e.Url.ToString();
        }

        // A missing document/body is handled like an empty page.
        string webInfo = "";
        if (webBrowser1.Document != null && webBrowser1.Document.Body != null)
        {
            webInfo = webBrowser1.Document.Body.InnerHtml ?? "";
        }

        if (webInfo != "" && webInfo.IndexOf("占地公告") == -1)
        {
            // Remove line breaks.
            // (A former webInfo.Replace("\\r\\n", "") discarded its result and was dropped.)
            webInfo = Regex.Replace(webInfo, "([\\r\\n])[\\s]+", "", RegexOptions.IgnoreCase | RegexOptions.Multiline);
            webInfo = Regex.Replace(webInfo, "\\n", "", RegexOptions.IgnoreCase | RegexOptions.Multiline);
            webInfo = CleanWordHtml(webInfo);

            string Splitstr = @"(<TD class=[""']?cellBordy[""']? vAlign=[""']?top[""']?>).*?(<A (id=[""']?lnkOldBul[""']? class=[""']?link1[""']?>|class=[""']?link1[""']? id=[""']?lnkOldBul[""']?))";
            Match cc = Regex.Match(webInfo, Splitstr, RegexOptions.IgnoreCase | RegexOptions.Multiline);
            webInfo = cc.Groups[0].ToString();
            webInfo = webInfo.ToLower();
            if (webInfo != "")
            {
                getParam(webInfo, pageUrl);
            }
            textBox1.Text += "采集完成\r\n";
            labCount.Text = (int.Parse(labCount.Text) + 1).ToString();
        }
        else
        {
            textBox1.Text += "空頁面\r\n";
            // Record the skipped page in the database.
            LandDB.BLL.CJLog bll = new LandDB.BLL.CJLog();
            LandDB.Model.CJLog model = new LandDB.Model.CJLog();
            model.Furl = pageUrl;
            bll.Add(model);
        }
    }
    catch (Exception ex)
    {
        textBox1.Text += ex.Message.ToString() + "\r\n";
        // Record the error in the database.
        LandDB.BLL.CJLog bll = new LandDB.BLL.CJLog();
        LandDB.Model.CJLog model = new LandDB.Model.CJLog();
        model.FSourceContent = ex.Message.ToString();
        model.Furl = pageUrl;
        bll.Add(model);
    }
}
/// <summary>
/// Presumably wired to the form's FormClosed event (TODO confirm in the designer file).
/// Currently empty: no work is done here when the form closes.
/// </summary>
private void Main_FormClosed(object sender, FormClosedEventArgs e)
{
}
/// <summary>
/// Parses a notice page and stores it: a new notice is inserted together with its
/// blocks; an existing one is refreshed depending on its gtState.
/// </summary>
/// <param name="strWebData">Notice section of the page HTML.</param>
/// <param name="url">URL the page was loaded from; stored as gtUrl.</param>
private void getParam(string strWebData, string url)
{
    LandDB.BLL.blockNote bnBll = new LandDB.BLL.blockNote();
    LandDB.Model.blockNote bnModel = new LandDB.Model.blockNote();
    bnModel.gtUrl = url;
    bnModel.dataType = 1;

    // Title: text of the first span.
    Match ma = Regex.Match(strWebData, "<span[^>]*>([^<]*)</span>", RegexOptions.IgnoreCase);
    string strTitle = ma.Groups[1].ToString();
    bnModel.topic = strTitle;

    // Notice number: the first parenthesised part of the title, without the parentheses.
    ma = Regex.Match(strTitle, "[(].*?[)]");
    string noteno = ma.Groups[0].ToString().Replace("(", "").Replace(")", "");
    bnModel.noteNo = noteno;

    // Publishing unit.
    ma = Regex.Match(strWebData, @"<td[^>](?:align=[""']?right[""']?)*>([^<]*)<br>", RegexOptions.IgnoreCase);
    bnModel.pubUnit = ma.Groups[1].Value.Trim();

    // Title and number come from scraped HTML; double any single quote so they cannot
    // break out of the string literals in the WHERE clause.
    // NOTE(review): assumes the BLL concatenates this clause into SQL text - confirm.
    string where = string.Format(
        "topic='{0}' and noteNo='{1}' and state=1 ",
        strTitle.Replace("'", "''"),
        noteno.Replace("'", "''"));

    if (bnBll.Exists(where))
    {
        bnModel = bnBll.GetModel(strTitle, noteno);
        // gtState: 1 = collected, 2 = collection failed, 3 = blocks incomplete.
        if (bnModel.gtState == 2)
        {
            // Collect the notice again and update it, then its blocks.
            AddblockNote(strWebData, ref bnModel);
            bnBll.Update(bnModel);
            StoreBlocksAndFlagIncomplete(bnBll, strWebData, bnModel);
        }
        else if (bnModel.gtState == 3)
        {
            // The notice itself is fine; only the blocks are collected again.
            StoreBlocksAndFlagIncomplete(bnBll, strWebData, bnModel);
        }
    }
    else
    {
        // New notice: insert it, then its blocks.
        AddblockNote(strWebData, ref bnModel);
        bnModel.noteId = bnBll.Add(bnModel);
        StoreBlocksAndFlagIncomplete(bnBll, strWebData, bnModel);
    }
}

/// <summary>
/// Stores the blocks of a notice via AddblockInfo and sets the notice to gtstate=3 when
/// AddblockInfo reports incomplete blocks through its out flag.
/// </summary>
private void StoreBlocksAndFlagIncomplete(LandDB.BLL.blockNote bnBll, string strWebData, LandDB.Model.blockNote bnModel)
{
    bool isComp;
    AddblockInfo(strWebData, bnModel, out isComp);
    if (isComp)
    {
        bnBll.Update("gtstate=3", bnModel.noteId);
    }
}
/// <summary>
/// Fills the notice model from the page HTML: region, transfer date, transfer mode,
/// creation date, expiry date and publication date, then sets gtState
/// (1 = complete, 2 = something is missing or could not be parsed).
/// </summary>
/// <param name="strWebData">Notice section of the page HTML.</param>
/// <param name="bnModel">Notice model to fill; the reference itself is not replaced.</param>
private void AddblockNote(string strWebData, ref LandDB.Model.blockNote bnModel)
{
    // Named provinceName so it does not hide the form's province array.
    string provinceName = "";
    string city = "";
    string blockZone = "";
    // Set when a date was found but could not be parsed; forces gtState = 2 instead of
    // throwing and losing the whole notice.
    bool dateParseFailed = false;
    DateTime parsedDate;

    // Region: "province > city > zone".
    Match ma = Regex.Match(strWebData, @"<SPAN[^>](?:id=[""']?lblXzq[""']?)*>([^<]*)</SPAN>", RegexOptions.IgnoreCase);
    string partweb = ma.Groups[1].ToString();
    if (partweb != "")
    {
        // Serialized HTML writes the separator as "&gt;"; accept that and a literal '>'.
        string[] s = partweb.Replace("&gt;", "|").Replace(">", "|").Split('|');
        provinceName = s[0].Replace("行政區:", "").Trim();
        if (s.Length >= 2)
        {
            city = s[1].Trim();
            // The zone is taken only when there are exactly three parts.
            blockZone = s.Length == 3 ? s[2].Trim() : "";
        }
    }
    bnModel.province = provinceName;
    bnModel.city = city;
    bnModel.blockZone = blockZone;

    // Transfer date.
    ma = Regex.Match(strWebData, "(?:号地塊:)(.*?日)", RegexOptions.IgnoreCase);
    partweb = ma.Groups[1].Value;
    if (partweb == "")
    {
        // Not present there; take it from section six instead.
        ma = Regex.Match(strWebData, "(六、).*?[年]?(</u>)", RegexOptions.IgnoreCase);
        partweb = ma.Groups[0].ToString();
        ma = Regex.Match(partweb, "<u>.*?日", RegexOptions.IgnoreCase);
        partweb = ma.Groups[0].ToString().Replace("年", "-").Replace("月", "-").Replace("日", "").Replace("<u>", "").Replace("<U>", "").Trim();
    }
    else
    {
        partweb = partweb.Replace("<U>", "").Replace("<u>", "").Trim();
    }
    if (DateTime.TryParse(partweb, out parsedDate))
    {
        bnModel.transferDate = parsedDate;
    }
    else
    {
        dateParseFailed = true;
    }

    // Transfer mode.
    ma = Regex.Match(strWebData, "(?:以 <u>)(.*?)(?:</u> 方式出讓 <u>)", RegexOptions.IgnoreCase);
    partweb = ma.Groups[1].ToString();
    if (partweb != "")
    {
        bnModel.remiseWay = partweb;
    }

    // Creation time: today's date without a time part.
    bnModel.createTime = DateTime.Today;

    // Expiry date.
    ma = Regex.Match(strWebData, "(?:号地塊:<u>).*?(?:</u> ;)", RegexOptions.IgnoreCase);
    partweb = ma.Groups[0].ToString();
    if (partweb != "")
    {
        ma = Regex.Match(partweb, "(至 <u>).*?(日)", RegexOptions.IgnoreCase);
        partweb = ma.Groups[0].ToString().Replace("至 <U>", "").Replace("<u>", "");
        // Every CJK character becomes '-', leaving a y-M-d style string.
        partweb = Regex.Replace(partweb, "[\u4e00-\u9fa5]", "-").Trim('-');
    }
    else
    {
        // Auction notices state the deadline differently.
        ma = Regex.Match(strWebData, "(截止時間為).*?[年]?(日)");
        partweb = ma.Groups[0].ToString().Replace("年", "-").Replace("月", "-").Replace("日", "").Replace("截止時間為", "").Replace("<u>", "").Replace("<U>", "").Trim();
    }
    if (DateTime.TryParse(partweb, out parsedDate))
    {
        bnModel.expireDate = parsedDate;
    }
    else
    {
        dateParseFailed = true;
    }

    // Publication date.
    ma = Regex.Match(strWebData, @"<SPAN[^>](id=[""']?lblCreateDate[""']?)*>([^<]*)</SPAN>", RegexOptions.IgnoreCase);
    partweb = ma.Groups[0].ToString().Trim();
    if (partweb != "")
    {
        partweb = Regex.Match(partweb, "\\d{4}年\\d{1,2}月\\d{1,2}日").Value;
        if (DateTime.TryParse(partweb, out parsedDate))
        {
            bnModel.pubDate = parsedDate;
        }
        else
        {
            dateParseFailed = true;
        }
    }

    // A notice counts as complete only when every field is present and every date parsed.
    if (dateParseFailed || string.IsNullOrEmpty(bnModel.province) || string.IsNullOrEmpty(bnModel.city) || string.IsNullOrEmpty(bnModel.topic) || bnModel.pubDate == null || string.IsNullOrEmpty(bnModel.remiseWay) || string.IsNullOrEmpty(bnModel.pubUnit) || bnModel.expireDate == null || bnModel.transferDate == null || string.IsNullOrEmpty(bnModel.blockZone))
    {
        bnModel.gtState = 2;
    }
    else
    {
        bnModel.gtState = 1;
    }
}
/// <summary>
/// Extracts every land block from a notice page and inserts it, or re-saves an existing
/// row whose gtState is 2.
/// </summary>
/// <param name="strWebData">Notice section of the page HTML.</param>
/// <param name="bnModel">The notice the blocks belong to; supplies id, region, mode and unit.</param>
/// <param name="isComp">Result of Exists("gtState=2") after processing, i.e. true when
/// an incomplete block row exists.</param>
private void AddblockInfo(string strWebData, LandDB.Model.blockNote bnModel, out bool isComp)
{
    strWebData = strWebData.Replace(" ", "");
    LandDB.BLL.blockInfo bIBll = new LandDB.BLL.blockInfo();

    // Each block is rendered as its own 12px div.
    Regex divRg = new Regex(@"<DIV[\s]*style=[""'\s]*FONT-SIZE:[\s]*12px[""'\s]*>.*?</div>", RegexOptions.Multiline | RegexOptions.IgnoreCase);
    Regex cellRg = new Regex("(<td).*?[>]?(</td>)", RegexOptions.IgnoreCase);

    foreach (Match match in divRg.Matches(strWebData))
    {
        string blockTable = match.Value;
        if (blockTable == "")
        {
            continue;
        }

        LandDB.Model.blockInfo bIModel = new LandDB.Model.blockInfo();
        bIModel.noteId = bnModel.noteId;
        bIModel.province = bnModel.province;
        bIModel.city = bnModel.city;
        bIModel.blockZone = bnModel.blockZone;
        bIModel.transferMode = bnModel.remiseWay;
        bIModel.dataType = 1;
        bIModel.blockState = 2;
        bIModel.state = 1;
        bIModel.createTime = DateTime.Now;
        bIModel.pubUnit = bnModel.pubUnit;
        bIModel.firstPrice = 0;
        bIModel.donePrice = 0;
        bIModel.doneArea = 0;
        bIModel.floorPrice = 0;
        bIModel.blockArea = "";

        // The table alternates label cell / value cell. A recognised label consumes the
        // cell after it as its value.
        MatchCollection mc = cellRg.Matches(blockTable);
        for (int i = 0; i < mc.Count; i++)
        {
            string ed = StripHtmlTags(mc[i].Value);
            if (string.IsNullOrEmpty(ed))
            {
                continue;
            }
            // A label in the very last cell has no value cell; reading mc[i + 1] there
            // used to throw.
            if (i + 1 >= mc.Count)
            {
                continue;
            }

            string tdvalue = StripHtmlTags(mc[i + 1].Value);
            bool consumed = true;
            if (ed == "宗地編号:")
            {
                bIModel.blockNo = tdvalue;
            }
            else if (ed == "宗地面積:" || ed == "宗地總面積:")
            {
                bIModel.blockArea = tdvalue;
            }
            else if (ed == "起始價:")
            {
                Match first = Regex.Match(tdvalue, @"^(\-|\+)?[\d,]+(\.\d+)?");
                decimal price;
                if (first.Success && decimal.TryParse(first.ToString(), out price))
                {
                    bIModel.firstPrice = price;
                }
            }
            else if (ed == "容積率:")
            {
                bIModel.blockRate = Regex.Match(tdvalue, @"^(\-|\+)?[\d,]+(\.\d+)?").Value;
                bIModel.blockRateStr = tdvalue;
            }
            else if (ed == "土地用途:" || ed == "土地用途明細:")
            {
                bIModel.oriUseType = tdvalue;
            }
            else if (ed == "宗地坐落:")
            {
                bIModel.blockAddress = tdvalue;
            }
            else if (ed.IndexOf("挂牌截止時間") != -1)
            {
                // An unparsable date leaves expireDate unset rather than aborting the page.
                DateTime expire;
                if (DateTime.TryParse(tdvalue, out expire))
                {
                    bIModel.expireDate = expire;
                }
            }
            else if (ed.IndexOf("估價報告備案号") != -1)
            {
                bIModel.RecordNumberOfAppraisalReport = tdvalue;
            }
            else
            {
                consumed = false;
            }

            if (consumed)
            {
                i++;
            }
        }

        // Address and number come from scraped HTML; double any single quote so they
        // cannot break out of the string literals in the WHERE clause.
        // NOTE(review): assumes the BLL concatenates this clause into SQL text - confirm.
        string where = string.Format(
            "blockAddress='{0}' and blockNo='{1}' and state=1 ",
            (bIModel.blockAddress ?? "").Replace("'", "''"),
            (bIModel.blockNo ?? "").Replace("'", "''"));
        bool isEx = bIBll.Exists(where);

        if (string.IsNullOrEmpty(bnModel.province) || string.IsNullOrEmpty(bnModel.city) || string.IsNullOrEmpty(bnModel.blockZone) || string.IsNullOrEmpty(bIModel.blockArea) || string.IsNullOrEmpty(bIModel.oriUseType) || bIModel.expireDate == null || string.IsNullOrEmpty(bIModel.pubUnit) || string.IsNullOrEmpty(bIModel.blockAddress) || string.IsNullOrEmpty(bIModel.blockNo) || bIModel.firstPrice == null || string.IsNullOrEmpty(bIModel.blockRateStr) || string.IsNullOrEmpty(bIModel.RecordNumberOfAppraisalReport))
        {
            bIModel.gtState = 2;
        }
        else
        {
            bIModel.gtState = 1;
        }

        if (isEx)
        {
            // NOTE(review): this loads the stored row and writes that same row back, so
            // the freshly scraped values above are not persisted - confirm the intent.
            LandDB.Model.blockInfo stored = bIBll.GetModel(bIModel.blockAddress, bIModel.blockNo);
            // gtState: 1 = collected, 2 = collection failed.
            if (stored != null && stored.gtState == 2)
            {
                bIBll.Update(stored);
            }
        }
        else
        {
            bIBll.Add(bIModel);
        }
    }

    // NOTE(review): this condition is not restricted to the current notice - confirm
    // whether it should also filter by the notice id.
    isComp = bIBll.Exists("gtState=2");
}

/// <summary>Removes every HTML tag from a fragment and returns the remaining text.</summary>
private static string StripHtmlTags(string html)
{
    return Regex.Replace(html, @"<[^>]+>", "");
}
項目源碼位址:http://www.onethink.top/1/SoudiWinForm.zip