Launching another spider from a spider
For a development requirement, I wanted another specified spider to start crawling as soon as one spider finished. My first idea was to define a close method on the spider and call os.system('scrapy crawl spider_n') from it, which, unsurprisingly, failed.
Looking into cmd commands afterwards, I found that && chains multiple commands. Running scrapy crawl spider1 && scrapy crawl spider2 from cmd does exactly what I needed: spider2 is launched automatically once spider1 has finished and closed. Nice!
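If chaining shell commands is not an option, Scrapy also supports running spiders sequentially inside a single process with CrawlerRunner. A minimal sketch of the same idea, assuming the spider names spider1 and spider2 from the command above:

    # run_sequential.py -- start spider2 only after spider1 has closed
    from twisted.internet import reactor, defer
    from scrapy.crawler import CrawlerRunner
    from scrapy.utils.log import configure_logging
    from scrapy.utils.project import get_project_settings

    configure_logging()
    runner = CrawlerRunner(get_project_settings())

    @defer.inlineCallbacks
    def crawl_sequentially():
        yield runner.crawl('spider1')   # runs until spider1 is closed
        yield runner.crawl('spider2')   # only then is spider2 started
        reactor.stop()

    crawl_sequentially()
    reactor.run()                       # blocks until reactor.stop() is called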
Using middleware to monitor spider open and close
The downloader middleware below registers handlers for the spider_opened and spider_closed signals in from_crawler: when a spider starts, it initialises a Redis key named <spider_name>_proxy_id, and when the spider closes, it deletes that key again.
import pymysql
import redis
from scrapy import signals
from scrapy.utils.project import get_project_settings


class ProxyMiddleware(object):
    '''Set the request proxy'''
    connect, redis_connect = ('', '')

    def __init__(self):
        settings = get_project_settings()
        if not self.connect:
            # connect to the MySQL database
            self.connect = pymysql.connect(
                host=settings.get('MYSQL_HOST'),
                port=settings.get('MYSQL_PORT'),
                db=settings.get('MYSQL_DBNAME'),
                user=settings.get('MYSQL_USER'),
                passwd=settings.get('MYSQL_PASSWD'),
                charset='utf8',
                use_unicode=True)
            # cursor used for inserts, deletes, selects and updates
            self.cursor = self.connect.cursor()
            self.connect.autocommit(True)
        if not self.redis_connect:
            self.redis_connect = redis.Redis(
                host=settings.get('REDIS_HOST'),
                port=settings.get('REDIS_PORT'),
                password=settings.get('REDIS_PASSWD'),
                decode_responses=True)

    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls()
        # register the spider open/close signal handlers
        crawler.signals.connect(middleware.spider_opened, signals.spider_opened)
        crawler.signals.connect(middleware.spider_closed, signals.spider_closed)
        return middleware

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
        self.redis_connect.set(spider.name + '_proxy_id', 0)  # initialise the cache key

    def spider_closed(self, spider):
        spider.logger.info('Spider closed: %s' % spider.name)
        self.redis_connect.delete(spider.name + '_proxy_id')  # delete the cache key