Crawling All of Jianshu with Scrapy + Selenium
Environment
- Ubuntu 18.04
- Python 3.8
- Scrapy 2.1
Data to scrape
- Article title
- Author
- Author avatar
- Publication date
- Content
- Article link
- Article ID
Approach
- Analyze the URL pattern of Jianshu articles
- Use Selenium to request the pages
- Use XPath to extract the required data
- Store the data in MySQL asynchronously (for better write throughput)
Implementation
Preliminaries:
- Create the Scrapy project
- Generate a CrawlSpider-based spider file
- Enable the downloader middleware and item pipelines in settings.py (see the sketch below)
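A minimal sketch of that setup; the module name jianshu_crawl and the priority values are my assumptions for illustration, so adjust them to your own project:

# settings.py -- enable the custom middleware and pipeline.
# The project itself would be created with:
#   scrapy startproject jianshu_crawl
#   scrapy genspider -t crawl js jianshu.com
DOWNLOADER_MIDDLEWARES = {
    'jianshu_crawl.middlewares.SeleniumDownloadMiddleware': 543,
}
ITEM_PIPELINES = {
    'jianshu_crawl.pipelines.JinshuAsyncPipeline': 300,
}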
Step 1: analyze the Jianshu article URL
The URL pattern is jianshu.com/p/<article ID>, so we configure the matching rule in the CrawlSpider:
class JsSpider(CrawlSpider):
    name = 'js'
    allowed_domains = ['jianshu.com']
    start_urls = ['http://jianshu.com/']
    rules = (
        Rule(LinkExtractor(allow=r'.+/p/[0-9a-z]{12}.*'),
             callback='parse_detail', follow=True),
    )
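To see what the rule matches, here is a quick standalone check; the 12-character article slugs below are made up for illustration:

import re

pattern = r'.+/p/[0-9a-z]{12}.*'
# Article pages have a 12-character slug after /p/ -- that slug is the article ID
print(bool(re.match(pattern, 'https://www.jianshu.com/p/0123456789ab')))  # True
print(bool(re.match(pattern, 'https://www.jianshu.com/u/abcdef')))        # False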
Step 2: use Selenium to request the pages
Set up the downloader middleware:
- The author, publication date, and similar fields are loaded via Ajax, so we use Selenium to fetch the rendered page source, which makes XPath parsing straightforward.
- A request can occasionally hang on a page that never finishes loading, so we set a page-load timeout.
- Likewise, the Ajax content may not have loaded yet, so we explicitly wait for it to appear.
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from scrapy.http.response.html import HtmlResponse
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.common.by import By


class SeleniumDownloadMiddleware(object):
    def __init__(self):
        self.driver = webdriver.Chrome()

    def process_request(self, request, spider):
        while True:
            try:
                # Give up on slow page loads; we retry below if needed
                self.driver.set_page_load_timeout(1)
                self.driver.get(request.url)
            except TimeoutException:
                pass
            try:
                # Explicitly wait for the Ajax content; on timeout, start over
                WebDriverWait(self.driver, 1).until(
                    expected_conditions.presence_of_element_located(
                        (By.CLASS_NAME, 'rEsl9f')))
                break
            except TimeoutException:
                continue
        url = self.driver.current_url
        source = self.driver.page_source
        return HtmlResponse(url=url, body=source,
                            request=request, encoding='utf-8')
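One thing the middleware above never does is quit Chrome when the crawl ends. A sketch of one way to clean up using Scrapy's standard from_crawler/signals hooks; this addition is my suggestion, not part of the original code:

from scrapy import signals

class SeleniumDownloadMiddleware(object):
    # ... __init__ and process_request as above ...

    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls()
        # Call spider_closed() when the crawl finishes
        crawler.signals.connect(middleware.spider_closed,
                                signal=signals.spider_closed)
        return middleware

    def spider_closed(self, spider):
        self.driver.quit()  # shut down the Chrome instance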
Note: put chromedriver in /usr/bin beforehand, or specify its path explicitly. On Windows you can add it to the PATH environment variable.
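If the driver is not on the PATH, you can point Selenium at it directly; the path below is just an example:

from selenium import webdriver

# Selenium 3.x style; replace the path with your own chromedriver location
driver = webdriver.Chrome(executable_path='/opt/chromedriver')

# Selenium 4.x style uses a Service object instead:
# from selenium.webdriver.chrome.service import Service
# driver = webdriver.Chrome(service=Service('/opt/chromedriver'))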
Step 3: use XPath to extract the required data
Define the item:
import scrapy


class JianshuCrawlItem(scrapy.Item):
    title = scrapy.Field()
    content = scrapy.Field()
    author = scrapy.Field()
    avatar = scrapy.Field()
    pub_time = scrapy.Field()
    origin_url = scrapy.Field()
    article_id = scrapy.Field()
Work out the XPath for each field, extract the data, and hand the item off to the pipelines:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import JianshuCrawlItem as Jitem


class JsSpider(CrawlSpider):
    name = 'js'
    allowed_domains = ['jianshu.com']
    start_urls = ['http://jianshu.com/']
    rules = (
        Rule(LinkExtractor(allow=r'.+/p/[0-9a-z]{12}.*'),
             callback='parse_detail', follow=True),
    )

    def parse_detail(self, response):
        # Pull each field out with XPath
        title = response.xpath("//h1[@class='_2zeTMs']/text()").get()
        author = response.xpath("//a[@class='_1OhGeD']/text()").get()
        avatar = response.xpath("//img[@class='_13D2Eh']/@src").get()
        pub_time = response.xpath("//div[@class='s-dsoj']/time/text()").get()
        content = response.xpath("//article[@class='_2rhmJa']").get()
        origin_url = response.url
        article_id = origin_url.split("?")[0].split("/")[-1]
        print(title)  # show which article is being scraped
        item = Jitem(
            title=title,
            author=author,
            avatar=avatar,
            pub_time=pub_time,
            origin_url=origin_url,
            article_id=article_id,
            content=content,
        )
        yield item
Step 4: store the data in the database
I use MySQL here (other databases work much the same way), accessed through pymysql. There are two approaches to writing the data: synchronous and asynchronous. Since Scrapy itself crawls asynchronously, synchronous writes become a bottleneck, so asynchronous storage is recommended.
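Both pipelines below insert into an article table. A minimal schema matching their insert statement might look like this; the column types are my assumptions, not taken from the original project:

import pymysql

# One-time table creation; adjust column types to your needs
DDL = '''
CREATE TABLE IF NOT EXISTS article (
    id INT AUTO_INCREMENT PRIMARY KEY,
    title VARCHAR(255),
    content LONGTEXT,
    author VARCHAR(64),
    avatar VARCHAR(255),
    pub_time VARCHAR(64),
    origin_url VARCHAR(255),
    article_id VARCHAR(32)
) DEFAULT CHARSET = utf8mb4
'''

conn = pymysql.connect(host='127.0.0.1', port=3306, user='debian-sys-maint',
                       password='lD3wteQ2BEPs5i2u', database='jianshu',
                       charset='utf8mb4')
with conn.cursor() as cursor:
    cursor.execute(DDL)
conn.commit()
conn.close()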
Synchronous storage: simple to implement, but slow
import pymysql


class JianshuCrawlPipeline(object):
    def __init__(self):
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'debian-sys-maint',
            'password': 'lD3wteQ2BEPs5i2u',
            'database': 'jianshu',
            'charset': 'utf8mb4',
        }
        self.conn = pymysql.connect(**dbparams)
        self.cursor = self.conn.cursor()
        self._sql = None

    def process_item(self, item, spider):
        # Each insert blocks until MySQL commits
        self.cursor.execute(self.sql, (item['title'], item['content'],
                                       item['author'], item['avatar'],
                                       item['pub_time'], item['origin_url'],
                                       item['article_id']))
        self.conn.commit()
        return item

    @property
    def sql(self):
        # Lazily build the insert statement
        if not self._sql:
            self._sql = '''
                insert into article(id,title,content,author,avatar,pub_time,origin_url,article_id)
                values(null,%s,%s,%s,%s,%s,%s,%s)'''
        return self._sql
Asynchronous storage: more complex, but fast
import pymysql
from twisted.enterprise import adbapi


class JinshuAsyncPipeline(object):
    '''
    Store the scraped data asynchronously.
    '''
    def __init__(self):
        # Connect to the local MySQL server through a Twisted connection pool
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'debian-sys-maint',
            'password': 'lD3wteQ2BEPs5i2u',
            'database': 'jianshu',
            'charset': 'utf8mb4',
            'cursorclass': pymysql.cursors.DictCursor
        }
        self.dbpool = adbapi.ConnectionPool('pymysql', **dbparams)
        self._sql = None

    @property
    def sql(self):
        # Lazily build the insert statement
        if not self._sql:
            self._sql = '''
                insert into article(id,title,content,author,avatar,pub_time,origin_url,article_id)
                values(null,%s,%s,%s,%s,%s,%s,%s)'''
        return self._sql

    def process_item(self, item, spider):
        defer = self.dbpool.runInteraction(self.insert_item, item)  # submit the insert
        defer.addErrback(self.handle_error, item, spider)  # attach error handling
        return defer

    def insert_item(self, cursor, item):
        # Runs in a pool thread with its own cursor
        cursor.execute(self.sql, (item['title'], item['content'],
                                  item['author'], item['avatar'],
                                  item['pub_time'], item['origin_url'],
                                  item['article_id']))

    def handle_error(self, failure, item, spider):
        # Errbacks receive the Failure first, then the extra arguments
        print('Error!', failure)
Summary
- Sites like Jianshu that render content with Ajax are easy to scrape with Selenium. It is far less efficient than reverse-engineering the site's API, but much simpler to implement; if you don't need a large volume of data, analyzing the API isn't worth the effort.
- Page loads frequently stall when driving the site through Selenium, so use page-load timeouts and explicit waits to avoid wasting time.
GitHub: https://github.com/aduner/jianshu-crawl
Blog: https://www.cnblogs.com/aduner/p/12852616.html