使用C#正規表達式抓取其他網站的資訊,本示例以抓取京東商城商品詳情為例。
1、建立JdRobber.cs程式類
/// <summary>
/// Scrapes product details (title, main picture, price) from a JD.com
/// product page and prints them to the console.
/// </summary>
public class JdRobber
{
    /// <summary>
    /// Checks whether a URL is a JD.com product link of the form
    /// <c>http://item.jd.com/{digits}.html</c>.
    /// </summary>
    /// <param name="url">The URL to check; may be null or empty.</param>
    /// <returns><c>true</c> when the URL has the expected form; otherwise <c>false</c>.</returns>
    public bool ValidationUrl(string url)
    {
        if (String.IsNullOrEmpty(url))
        {
            return false;
        }

        // The dots are escaped so they match a literal '.' only; unescaped they
        // would accept any character (for example "itemXjd.com").
        return Regex.IsMatch(url, @"^http://item\.jd\.com/\d+\.html$");
    }

    /// <summary>
    /// Downloads the product page at <paramref name="url"/>, extracts the title,
    /// the main picture address and the price, and writes them to the console.
    /// Does nothing when the URL fails <see cref="ValidationUrl"/> or when the
    /// page download yields no content.
    /// </summary>
    /// <param name="url">Product page URL.</param>
    public void GetInfo(string url)
    {
        if (!ValidationUrl(url))
        {
            return;
        }

        string htmlStr = WebHandler.GetHtmlStr(url, "Default");
        if (String.IsNullOrEmpty(htmlStr))
        {
            return;
        }

        // Product ID: the numeric part of the URL.
        string sourceWebID = WebHandler.GetRegexText(url, @"http://item\.jd\.com/(?<Object>\d+)\.html");

        // Title: the <h1> that follows the element with id="name".
        string title = WebHandler.GetRegexText(htmlStr, @"<div.*id=\""name\"".*>[\s\S]*<h1>(?<Object>.*?)</h1>");

        // Picture: first <img> inside the "spec-n1" container. The search is
        // ordinal because this is markup, not linguistic text.
        string picName = "";
        int begin = htmlStr.IndexOf("<div id=\"spec-n1\"", StringComparison.Ordinal);
        if (begin >= 0)
        {
            // Look for the closing tag only once the container has been found.
            int end = htmlStr.IndexOf("</div>", begin + 1, StringComparison.Ordinal);
            if (end > begin)
            {
                string subPicHtml = htmlStr.Substring(begin, end - begin);
                picName = WebHandler.GetRegexText(subPicHtml, @"<img.*src=\""(?<Object>.*?)\"".*/>");
            }
        }

        // Price: fetched from a separate endpoint keyed by the product ID;
        // stays 0 when the ID or the price cannot be extracted.
        decimal price = 0;
        if (sourceWebID != "")
        {
            string priceUrl = @"http://p.3.cn/prices/get?skuid=J_" + sourceWebID + "&type=1";
            string priceJson = WebHandler.GetHtmlStr(priceUrl, "Default");
            string priceText = WebHandler.GetRegexText(priceJson, @"\""p\"":\""(?<Object>\d+(\.\d{1,2})?)\""");
            price = WebHandler.GetValidPrice(priceText);
        }

        Console.WriteLine("商品名稱:{0}", title);
        Console.WriteLine("圖檔:{0}", picName);
        Console.WriteLine("價格:{0}", price);
    }
}
2、建立WebHandler.cs公共方法類
/// <summary>
/// Shared helpers for downloading pages and extracting values from text.
/// </summary>
public class WebHandler
{
    // Non-negative number with an optional fraction of one or two digits.
    private static readonly Regex PricePattern = new Regex(@"^\d+(\.\d{1,2})?$");

    /// <summary>
    /// Downloads the content at <paramref name="url"/> and returns it as text.
    /// </summary>
    /// <param name="url">Address to download; null or empty yields an empty string.</param>
    /// <param name="encoding">
    /// <c>"UTF8"</c> decodes the response as UTF-8; any other value uses
    /// <see cref="Encoding.Default"/>.
    /// </param>
    /// <returns>
    /// The response body, or an empty string when the URL is empty or the
    /// download fails for any reason (this method does not throw).
    /// </returns>
    public static string GetHtmlStr(string url, string encoding)
    {
        if (String.IsNullOrEmpty(url))
        {
            return "";
        }

        Encoding ec = encoding == "UTF8" ? Encoding.UTF8 : Encoding.Default;
        try
        {
            WebRequest request = WebRequest.Create(url);

            // using blocks release the response, stream and reader even when
            // reading fails part-way through.
            using (WebResponse response = request.GetResponse())
            using (Stream datastream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(datastream, ec))
            {
                return reader.ReadToEnd();
            }
        }
        catch (Exception ex)
        {
            // Best-effort by design: callers treat "" as "no content".
            // The failure is traced instead of being silently discarded.
            System.Diagnostics.Trace.TraceWarning("GetHtmlStr failed for '{0}': {1}", url, ex.Message);
            return "";
        }
    }

    /// <summary>
    /// Returns the text captured by the named group <c>Object</c> of the first
    /// case-insensitive match of <paramref name="pattern"/> in <paramref name="input"/>.
    /// </summary>
    /// <param name="input">Text to search.</param>
    /// <param name="pattern">Regular expression containing a group named <c>Object</c>.</param>
    /// <returns>
    /// The captured text, or an empty string when either argument is null or
    /// empty or when the pattern does not match.
    /// </returns>
    public static string GetRegexText(string input, string pattern)
    {
        if (String.IsNullOrEmpty(input) || String.IsNullOrEmpty(pattern))
        {
            return "";
        }

        Match match = Regex.Match(input, pattern, RegexOptions.IgnoreCase);
        return match.Success ? match.Groups["Object"].Value : "";
    }

    /// <summary>
    /// Converts a price string such as <c>"12"</c> or <c>"12.50"</c> to a decimal.
    /// </summary>
    /// <param name="strPrice">Price text: digits with an optional one- or two-digit fraction.</param>
    /// <returns>
    /// The parsed price, or 0 when the text is null, empty, not in the expected
    /// format, or outside the range of <see cref="decimal"/>.
    /// </returns>
    public static decimal GetValidPrice(string strPrice)
    {
        if (String.IsNullOrEmpty(strPrice) || !PricePattern.IsMatch(strPrice))
        {
            return 0;
        }

        // The input is machine-formatted with '.' as the decimal separator, so
        // it is parsed with the invariant culture rather than the thread's
        // current culture, which may read '.' as a group separator.
        decimal price;
        bool parsed = decimal.TryParse(
            strPrice,
            System.Globalization.NumberStyles.Number,
            System.Globalization.CultureInfo.InvariantCulture,
            out price);
        return parsed ? price : 0;
    }
}