天天看點

DotnetSpider (一) 架構的了解、應用、搭建

受業務影響,決定将Downloader單獨分層,做出修改。

最近在做爬蟲,之前一直在使用 HttpWebRequest 和 WebClient ,很方便快捷,也很适合新手,但随着抓取任務的增多,以及多任務、多庫等情況的出現,使用一個優秀的爬蟲架構是十分必要的。于是開始接觸dotnetspider。

  

DotnetSpider (一) 架構的了解、應用、搭建

借鑒一下架構的設計圖,在引入dotnetspider的NuGet包後,我基本也是按照這個進行了分層

DotnetSpider (一) 架構的了解、應用、搭建

Data.Spider - 存放前台頁面(Winform、控制台)和實體爬蟲(EntitySpider)等,相當于發起請求的起點。

Spider.Downloader - 封裝請求等資訊,可實作自定義cookie等,非必須。

Spider.Processor - 處理器,繼承 IPageProcessor 實作對抓取内容的處理

Spider.Pipe - 管道,我将它了解為經過了 Processor 處理後的一個回調,将處理好的資料存儲(檔案、資料庫等)

Spider.Entity - 資料實體類,繼承 SpiderEntity

Spider.Command - 一些常用的公用指令,我這目前存放着轉資料格式類,背景執行JS類,SqlHelper(因架構自帶資料庫管道,暫時沒用)等

這樣的分層也是參考了源碼的示例

DotnetSpider (一) 架構的了解、應用、搭建

随着這幾天的嘗試,發現這個架構真的非常靈活,以凹凸租車的爬蟲為例,上代碼

  實體類:

/// <summary>
/// Entity describing a single car entry of the Atzuche search response.
/// The class-level selector is the JSONPath <c>$.data.content[*]</c>; every
/// property is filled from the JSON field named in its own
/// <c>PropertyDefine</c> expression.
/// </summary>
/// <remarks>
/// NOTE(review): the first two <c>EntityTable</c> arguments are presumably the
/// database and table names - confirm against the DotnetSpider documentation.
/// </remarks>
[EntityTable("CarWinsSpider", "AtzucheCar", EntityTable.Today)]
[EntitySelector(Expression = "$.data.content[*]", Type = SelectorType.JsonPath)]
public class AtzucheModel : SpiderEntity
{
    /// <summary>Vehicle number.</summary>
    [PropertyDefine(Expression = "$.carNo", Type = SelectorType.JsonPath)]
    public int carNo { get; set; }

    /// <summary>Brand.</summary>
    // Clean-up formatters that were left disabled in the original listing:
    //[ReplaceFormatter(NewValue = "", OldValue = "\r")]
    //[ReplaceFormatter(NewValue = "", OldValue = "\t")]
    //[ReplaceFormatter(NewValue = "", OldValue = "&nbsp;")]
    //[ReplaceFormatter(NewValue = "", OldValue = "\n")]
    //[ReplaceFormatter(NewValue = "", OldValue = "\"")]
    //[ReplaceFormatter(NewValue = "", OldValue = " ")]
    [PropertyDefine(Expression = "$.brand", Type = SelectorType.JsonPath)]
    public string brand { get; set; }

    /// <summary>Address.</summary>
    [PropertyDefine(Expression = "$.carAddr", Type = SelectorType.JsonPath)]
    public string carAddr { get; set; }

    /// <summary>Car series.</summary>
    [PropertyDefine(Expression = "$.type", Type = SelectorType.JsonPath)]
    public string type { get; set; }

    /// <summary>Engine displacement.</summary>
    [PropertyDefine(Expression = "$.sweptVolum", Type = SelectorType.JsonPath)]
    public string sweptVolum { get; set; }

    /// <summary>Picture (mapped from the <c>coverPic</c> field).</summary>
    [PropertyDefine(Expression = "$.coverPic", Type = SelectorType.JsonPath)]
    public string coverPic { get; set; }

    /// <summary>Daily rent.</summary>
    [PropertyDefine(Expression = "$.dayPrice", Type = SelectorType.JsonPath)]
    public int dayPrice { get; set; }

    /// <summary>Number of kilometres (mapped from the <c>distance</c> field).</summary>
    [PropertyDefine(Expression = "$.distance", Type = SelectorType.JsonPath)]
    public string distance { get; set; }

    /// <summary>Rating score.</summary>
    [PropertyDefine(Expression = "$.evalScore", Type = SelectorType.JsonPath)]
    public string evalScore { get; set; }

    /// <summary>Mapped from the <c>gbType</c> JSON field.</summary>
    [PropertyDefine(Expression = "$.gbType", Type = SelectorType.JsonPath)]
    public string gbType { get; set; }

    /// <summary>License plate.</summary>
    [PropertyDefine(Expression = "$.plateNum", Type = SelectorType.JsonPath)]
    public string plateNum { get; set; }

    /// <summary>Mapped from the <c>replyTag</c> JSON field.</summary>
    [PropertyDefine(Expression = "$.replyTag", Type = SelectorType.JsonPath)]
    public string replyTag { get; set; }

    /// <summary>Mapped from the <c>transCount</c> JSON field.</summary>
    [PropertyDefine(Expression = "$.transCount", Type = SelectorType.JsonPath)]
    public string transCount { get; set; }

    /// <summary>Model year.</summary>
    [PropertyDefine(Expression = "$.year", Type = SelectorType.JsonPath)]
    public int year { get; set; }

    /// <summary>Mapped from the <c>isPrivilege</c> JSON field.</summary>
    [PropertyDefine(Expression = "$.isPrivilege", Type = SelectorType.JsonPath)]
    public int isPrivilege { get; set; }

    /// <summary>Mapped from the <c>isRecommend</c> JSON field.</summary>
    [PropertyDefine(Expression = "$.isRecommend", Type = SelectorType.JsonPath)]
    public int isRecommend { get; set; }

    /// <summary>Mapped from the <c>isUpgrade</c> JSON field.</summary>
    [PropertyDefine(Expression = "$.isUpgrade", Type = SelectorType.JsonPath)]
    public int isUpgrade { get; set; }

    /// <summary>Mapped from the <c>lat</c> JSON field.</summary>
    [PropertyDefine(Expression = "$.lat", Type = SelectorType.JsonPath)]
    public string lat { get; set; }

    /// <summary>Mapped from the <c>lon</c> JSON field.</summary>
    [PropertyDefine(Expression = "$.lon", Type = SelectorType.JsonPath)]
    public string lon { get; set; }

    /// <summary>Mapped from the <c>queryId</c> JSON field.</summary>
    [PropertyDefine(Expression = "$.queryId", Type = SelectorType.JsonPath)]
    public string queryId { get; set; }

    /// <summary>Mapped from the <c>supplyCarService</c> JSON field.</summary>
    [PropertyDefine(Expression = "$.supplyCarService", Type = SelectorType.JsonPath)]
    public int supplyCarService { get; set; }

    /// <summary>Mapped from the <c>freeCarService</c> JSON field.</summary>
    [PropertyDefine(Expression = "$.freeCarService", Type = SelectorType.JsonPath)]
    public int freeCarService { get; set; }

    /// <summary>Mapped from the <c>isShenMaCar</c> JSON field.</summary>
    [PropertyDefine(Expression = "$.isShenMaCar", Type = SelectorType.JsonPath)]
    public int isShenMaCar { get; set; }

    /// <summary>Mapped from the <c>supportGetReturn</c> JSON field.</summary>
    [PropertyDefine(Expression = "$.supportGetReturn", Type = SelectorType.JsonPath)]
    public int supportGetReturn { get; set; }

    /// <summary>Mapped from the <c>confirmation</c> JSON field.</summary>
    [PropertyDefine(Expression = "$.confirmation", Type = SelectorType.JsonPath)]
    public int confirmation { get; set; }
}

起始:

  /// <summary>
  /// Main entry point of the application. Configures the target site and its
  /// request headers, queues a single GET request against the Atzuche search
  /// endpoint, runs the spider with one thread, then waits for console input.
  /// </summary>
  [STAThread]
  static void Main()
  {
      var site = new Site
      {
          CycleRetryTimes = 1,
          SleepTime = 200,
          Headers = new Dictionary<string, string>()
          {
              { "Accept", "application/json, text/javascript, */*; q=0.01" },
              { "Accept-Encoding", "gzip, deflate" },
              // Fixed: this key used to be "gzip, deflate" (the Accept-Encoding value
              // pasted as a header name); the value is a language list.
              { "Accept-Language", "zh-CN,zh;q=0.9" },
              { "X-Requested-With", "XMLHttpRequest" },
              { "Referer", "http://www.atzuche.com/hz/car/search" },
              { "Connection", "keep-alive" },
              { "Content-Type", "application/json;charset=UTF-8" },
              { "Host", "www.atzuche.com" },
              { "User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36" }
          }
      };

      // Reportedly required for POST requests; unused for this GET request:
      //request.PostBody = $"id=7&j=%7B%22createMan%22%3A%2218273159100%22%2C%22createTime%22%3A1518433690000%2C%22row%22%3A5%2C%22siteUserActivityListId%22%3A8553%2C%22siteUserPageRowModuleId%22%3A84959%2C%22topids%22%3A%22%22%2C%22wherePhase%22%3A%221%22%2C%22wherePreferential%22%3A%220%22%2C%22whereUsertype%22%3A%220%22%7D&page={i}&shopid=83106681";
      var request = new Request
      {
          Url = "http://www.atzuche.com/car/searchListMap/2?cityCode=330100&sceneCode=U002&filterCondition%5Blon%5D=120.219294&filterCondition%5Blat%5D=30.259258&filterCondition%5Bseq%5D=4&pageNum=1&pageSize=0",
          Method = System.Net.Http.HttpMethod.Get
      };
      var startRequests = new List<Request> { request };

      var spider = DotnetSpider.Core.Spider.Create(site, new QueueDuplicateRemovedScheduler(), new AtzucheProcessor())
          .AddStartRequests(startRequests.ToArray()) // pages to fetch and process
          .AddPipeline(new AtzuchePipe());           // data callback

      //----------------------------------
      spider.Monitor = new DotnetSpider.Core.Monitor.NLogMonitor();
      // The stock alternative is: new DotnetSpider.Core.Downloader.HttpClientDownloader()
      spider.Downloader = new AtzucheDownloader();
      // Do not clear the scheduler once the spider has finished.
      spider.ClearSchedulerAfterComplete = false;
      spider.ThreadNum = 1;
      spider.Run();

      Console.WriteLine("Press any key to continue...");
      Console.Read();
  }

這裡也可将整個抓取方法當做一個Spider執行個體單獨放置 -> EntitySpider

  // Launch the crawl through the dedicated entity spider class.
  var entitySpider = new AtzucheEntitySpider();
  entitySpider.AddPageProcessor(new AtzucheProcessor()); // processor
  entitySpider.AddPipeline(new AtzuchePipe());           // pipeline callback
  entitySpider.ThreadNum = 1;
  entitySpider.Run();

  // Keep the console window open until a key is pressed.
  Console.WriteLine("Press any key to continue...");
  Console.Read();

Downloader

對目标的請求全部包含在這裡,可以根據需要自行設定,下篇将進行自定義Request的應用

/// <summary>
/// Custom downloader for the Atzuche spider. It currently delegates every
/// request unchanged to an <c>HttpClientDownloader</c>; it exists as the place
/// where request customisation can be added later.
/// </summary>
public class AtzucheDownloader : BaseDownloader
{
    /// <summary>
    /// Downloads the page for <paramref name="request"/>.
    /// </summary>
    /// <param name="request">The request to execute.</param>
    /// <param name="spider">The spider issuing the request.</param>
    /// <returns>The page returned by the inner <c>HttpClientDownloader</c>.</returns>
    // The name is spelled "DowloadContent" because an override has to match the
    // member declared on the base class; do not "correct" it here.
    protected override Page DowloadContent(Request request, ISpider spider)
    {
        // NOTE(review): a new HttpClientDownloader is created per request; reusing
        // one instance may be preferable - confirm against the library's behaviour.
        return new HttpClientDownloader().Download(request, spider);
    }
}

建立爬蟲實體類

/// <summary>
/// Entity spider for Atzuche: bundles the start URL, the SQL Server pipeline
/// and the entity type into one reusable spider class.
/// </summary>
public class AtzucheEntitySpider : EntitySpider
{
    /// <summary>
    /// Creates the spider under the name "AuzucheSpider".
    /// </summary>
    public AtzucheEntitySpider() : base("AuzucheSpider", new Site
    {
        // NOTE(review): the site settings were not shown in the original listing;
        // add headers / retry / sleep settings here as required.
    })
    {
    }

    /// <summary>
    /// Registers the storage pipeline, the start URL and the entity type.
    /// </summary>
    /// <param name="arguments">Arguments supplied by the caller; not used here.</param>
    protected override void MyInit(params string[] arguments)
    {
        // Note from the author: the database name in the connection string must not
        // contain a '.', which was observed to raise an error.
        // NOTE(review): credentials are hard-coded for the demo; load them from
        // configuration in real code.
        AddPipeline(new SqlServerEntityPipeline("Server=.;Database=AuzucheSpider;uid=sa;pwd=123;MultipleActiveResultSets=true"));

        AddStartUrl("http://www.atzuche.com/car/searchListMap/2?cityCode=330100&sceneCode=U002&filterCondition%5Blon%5D=120.219294&filterCondition%5Blat%5D=30.259258&filterCondition%5Bseq%5D=4&pageNum=1&pageSize=0");

        // Per the author: with this entity type registered, the framework matches
        // data using the attributes on the entity class and inserts matches into
        // the database, so a custom processor and pipeline can be omitted.
        // Alternatively, drop this line and store data through your own
        // processor and pipeline.
        AddEntityType<AtzucheModel>();
    }
}

 接下來是處理器:

解析抓取的資料并封裝到"AtzucheList"内,可在Pipe内通過此名稱擷取處理好的資料。

/// <summary>
/// Page processor that parses the JSON found at <c>$.data.content</c> into a
/// list of <c>AtzucheModel</c> and publishes it under the result key
/// "AtzucheList".
/// </summary>
public class AtzucheProcessor : IPageProcessor
{
    /// <summary>
    /// Extracts the car list from <paramref name="page"/> and stores it as a
    /// result item. An empty list is stored when the JSON path yields no value
    /// or deserialisation returns null.
    /// </summary>
    /// <param name="page">The downloaded page.</param>
    /// <param name="spider">The spider that downloaded the page; not used here.</param>
    public void Process(Page page, ISpider spider)
    {
        var json = page.Selectable.JsonPath("$.data.content").GetValue();

        // DeserializeObject throws on a null argument and returns null for a JSON
        // "null", so both cases fall back to an empty list.
        var cars = string.IsNullOrEmpty(json)
            ? new List<AtzucheModel>()
            : JsonConvert.DeserializeObject<List<AtzucheModel>>(json) ?? new List<AtzucheModel>();

        page.AddResultItem("AtzucheList", cars);
    }
}

最後是回調,可在此加入儲存資料的代碼,至此結束。

/// <summary>
/// Pipeline callback that receives the processed results and prints them to
/// the console. Code that persists the data can be added here.
/// </summary>
public class AtzuchePipe : BasePipeline
{
    /// <summary>
    /// Writes the item count and then one "carNo:type " line per car for every
    /// result that carries an "AtzucheList" entry of type
    /// <c>List&lt;AtzucheModel&gt;</c>.
    /// </summary>
    /// <param name="resultItems">Results produced by the page processor.</param>
    /// <param name="spider">The spider that produced the results; not used here.</param>
    public override void Process(IEnumerable<ResultItems> resultItems, ISpider spider)
    {
        foreach (var resultItem in resultItems)
        {
            // Look the entry up once; skip results whose value is not the expected
            // list instead of dereferencing a null cast result.
            var cars = resultItem.Results["AtzucheList"] as List<AtzucheModel>;
            if (cars == null)
            {
                continue;
            }

            Console.WriteLine(cars.Count);

            foreach (var item in cars)
            {
                // Persist `item` here if the data should be stored.
                Console.WriteLine($"{item.carNo}:{item.type} ");
            }
        }
    }
}

   結果圖:

DotnetSpider (一) 架構的了解、應用、搭建

總體來說,此架構對新手還是很友好的,靈活寫法可以讓我們有較多的方式去實作爬蟲,因為這個爬蟲比較簡單,就先寫到這裡,未來如果可能,會再嘗試使用架構内的多線程、代理等功能,如有心得将繼續分享,希望能對跟我一樣的新手有所幫助,十分感謝。

作者:Grom