天天看點

C#正規表達式抓取網站資訊

使用C#正規表達式抓取其他網站的資訊，本示例以抓取京東商城商品詳情為例。

1、建立JdRobber.cs程式類

public class JdRobber
{
    /// <summary>
    /// Checks whether a URL is a JD.com product-detail link of the form
    /// <c>http://item.jd.com/{digits}.html</c>.
    /// </summary>
    /// <param name="url">URL to check; may be null or empty.</param>
    /// <returns><c>true</c> when the URL has the product-page form; otherwise <c>false</c>.</returns>
    public bool ValidationUrl(string url)
    {
        if (String.IsNullOrEmpty(url))
        {
            return false;
        }

        // The dots are escaped so they match only a literal '.'. Unescaped, '.' matches
        // any character and would accept other hosts such as "itemXjd.com".
        return Regex.IsMatch(url, @"^http://item\.jd\.com/\d+\.html$");
    }

    /// <summary>
    /// Downloads a JD.com product page, extracts the product title, main picture URL
    /// and price, and writes them to the console. Does nothing when the URL is not a
    /// valid product link or the page could not be downloaded.
    /// </summary>
    /// <param name="url">Product-detail URL accepted by <see cref="ValidationUrl"/>.</param>
    public void GetInfo(string url)
    {
        if (!ValidationUrl(url))
        {
            return;
        }

        string htmlStr = WebHandler.GetHtmlStr(url, "Default");
        if (String.IsNullOrEmpty(htmlStr))
        {
            return;
        }

        // Product key ID: the digits in the URL path.
        string sourceWebID = WebHandler.GetRegexText(url, @"http://item\.jd\.com/(?<Object>\d+)\.html");

        // Title: content of the <h1> that follows the element with id="name".
        string title = WebHandler.GetRegexText(htmlStr, @"<div.*id=\""name\"".*>[\s\S]*<h1>(?<Object>.*?)</h1>");

        // Picture: src of the <img> inside the "spec-n1" container. The search is limited
        // to the text between the container's opening tag and the next </div>.
        string picName = "";
        int begin = htmlStr.IndexOf("<div id=\"spec-n1\"", StringComparison.Ordinal);
        if (begin >= 0)
        {
            int end = htmlStr.IndexOf("</div>", begin + 1, StringComparison.Ordinal);
            if (end > begin)
            {
                string subPicHtml = htmlStr.Substring(begin, end - begin);
                picName = WebHandler.GetRegexText(subPicHtml, @"<img.*src=\""(?<Object>.*?)\"".*/>");
            }
        }

        // Price: requested separately from the price endpoint; the value is read from
        // the "p" field of the response text.
        decimal price = 0;
        if (sourceWebID != "")
        {
            string priceUrl = @"http://p.3.cn/prices/get?skuid=J_" + sourceWebID + "&type=1";
            string priceJson = WebHandler.GetHtmlStr(priceUrl, "Default");
            string priceText = WebHandler.GetRegexText(priceJson, @"\""p\"":\""(?<Object>\d+(\.\d{1,2})?)\""");
            price = WebHandler.GetValidPrice(priceText);
        }

        Console.WriteLine("商品名稱:{0}", title);
        Console.WriteLine("圖檔:{0}", picName);
        Console.WriteLine("價格:{0}", price);
    }
}
           

2、建立WebHandler.cs公共方法類

/// <summary>
/// Shared helpers for downloading pages and extracting values from text.
/// </summary>
public class WebHandler
{
    // Non-negative number with at most two decimal places, e.g. "12" or "12.50".
    private static readonly Regex PriceRegex = new Regex(@"^\d+(\.\d{1,2})?$");

    /// <summary>
    /// Downloads the content at <paramref name="url"/> and returns it as text.
    /// </summary>
    /// <param name="url">Address to request.</param>
    /// <param name="encoding">
    /// "UTF8" decodes the response as UTF-8; any other value uses
    /// <see cref="Encoding.Default"/>.
    /// </param>
    /// <returns>
    /// The response text, or an empty string when <paramref name="url"/> is null or
    /// empty or the request fails for any reason.
    /// </returns>
    public static string GetHtmlStr(string url, string encoding)
    {
        if (String.IsNullOrEmpty(url))
        {
            return "";
        }

        Encoding ec = encoding == "UTF8" ? Encoding.UTF8 : Encoding.Default;

        try
        {
            WebRequest request = WebRequest.Create(url);

            // using blocks release the response, stream and reader even when reading fails.
            using (WebResponse response = request.GetResponse())
            using (Stream datastream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(datastream, ec))
            {
                return reader.ReadToEnd();
            }
        }
        catch (Exception ex)
        {
            // Deliberately best-effort: callers treat "" as "no content". The failure is
            // traced instead of being dropped silently.
            System.Diagnostics.Trace.TraceWarning("GetHtmlStr failed for {0}: {1}", url, ex.Message);
            return "";
        }
    }

    /// <summary>
    /// Returns the text captured by the named group <c>Object</c> in the first
    /// case-insensitive match of <paramref name="pattern"/> in <paramref name="input"/>.
    /// </summary>
    /// <param name="input">Text to search.</param>
    /// <param name="pattern">Regular expression containing a group named <c>Object</c>.</param>
    /// <returns>
    /// The captured text, or an empty string when either argument is null or empty
    /// or nothing matches.
    /// </returns>
    public static string GetRegexText(string input, string pattern)
    {
        if (String.IsNullOrEmpty(input) || String.IsNullOrEmpty(pattern))
        {
            return "";
        }

        Match match = Regex.Match(input, pattern, RegexOptions.IgnoreCase);
        return match.Success ? match.Groups["Object"].Value : "";
    }

    /// <summary>
    /// Converts a price string such as "12" or "12.50" to a decimal.
    /// </summary>
    /// <param name="strPrice">Price text using '.' as the decimal separator.</param>
    /// <returns>
    /// The parsed price, or 0 when the text is null, empty, not in the expected
    /// format, or cannot be represented as a decimal.
    /// </returns>
    public static decimal GetValidPrice(string strPrice)
    {
        if (String.IsNullOrEmpty(strPrice) || !PriceRegex.IsMatch(strPrice))
        {
            return 0;
        }

        // Parse with the invariant culture: the input always uses '.' as the decimal
        // separator, while the current culture may treat '.' as a group separator.
        // Trailing white space is allowed because '$' also matches before a final newline.
        decimal price;
        bool parsed = decimal.TryParse(
            strPrice,
            System.Globalization.NumberStyles.AllowDecimalPoint | System.Globalization.NumberStyles.AllowTrailingWhite,
            System.Globalization.CultureInfo.InvariantCulture,
            out price);
        return parsed ? price : 0;
    }
}
           

繼續閱讀