Python crawler: Scrapy framework examples
#chouti.py:
import scrapy,sys,io
from scrapy.selector import Selector
from scrapy.http import Request
from ..items import ChoutiItem
from scrapy.dupefilters import RFPDupeFilter
sys.stdout=io.TextIOWrapper(sys.stdout.buffer,encoding="gb18030")

class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ["chouti.com"]
    start_urls = ["http://dig.chouti.com"]
    #visited_urls=set()#would hold the URLs already visited so they are not fetched twice; Scrapy already deduplicates requests itself, so a custom set is unnecessary
    #def start_requests(self):#override scrapy.Spider.start_requests()
    #    for url in self.start_urls:
    #        yield Request(url,callback=self.run)
    #def run(self,response):
    #    pass

    def parse(self, response):
        #Grab the title of every hot-news entry on the current page:
        sel1=Selector(response=response).xpath("//div[@id='content-list']/div[@class='item']")
        for obj in sel1:
            title=obj.xpath(".//a[@class='show-content']/text()").extract_first().strip()
            href=obj.xpath(".//a[@class='show-content']/@href").extract_first().strip()
            item_obj=ChoutiItem(title=title,href=href)#wrap the data in an Item
            yield item_obj#hand the item over to the pipelines
        #Collect the page numbers reachable from the current page (i.e. the href of the pager links shown below the news list):
        sel2=Selector(response=response).xpath(r"//a[re:test(@href,'/all/hot/recent/\d+')]/@href").extract()
        for url in sel2:
            #md5_url=self.md5(url)
            #if url in self.visited_urls:#the URL has already been visited
            #    print("URL already seen")
            #else:
            #    self.visited_urls.add(url)
            print(url)
            url="https://dig.chouti.com%s"%url
            #Put the URL back into the scheduler (remember to set the crawl depth in settings.py):
            yield Request(url=url,callback=self.parse)#note: pass the callback without parentheses
    #def md5(self,url):
    #    #URLs can be very long, so store the md5 digest instead (all digests have the same length)
    #    import hashlib
    #    obj=hashlib.md5()
    #    obj.update(bytes(url,encoding="utf-8"))
    #    return obj.hexdigest()
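A small standalone demonstration (illustrative only, not part of the original project) of the point made in the commented-out md5() helper above: digests of URLs of any length all come out at the same fixed size, which makes them convenient to store for deduplication.

import hashlib
short_url = "https://dig.chouti.com/all/hot/recent/2"
long_url = "https://dig.chouti.com/all/hot/recent/2?" + "x" * 500  # artificially long URL
for u in (short_url, long_url):
    print(hashlib.md5(u.encode("utf-8")).hexdigest())  # both digests are 32 hex characters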
#chouti2.py:
import scrapy
from scrapy.selector import Selector
from scrapy.http import Request
from scrapy.http.cookies import CookieJar

class Chouti2Spider(scrapy.Spider):
    name = 'chouti2'
    allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/']
    cookie_dict = None

    def parse(self, response):
        cookie_obj = CookieJar()
        cookie_obj.extract_cookies(response, response.request)
        self.cookie_dict = cookie_obj._cookies
        yield Request(
            url = "https://dig.chouti.com/login",
            method = "POST",
            body = "phone=12345678901&password=aaabbbccc&oneMonth=1",#note: the request body must be a string, not a dict
            headers = {
                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            },
            cookies = cookie_obj._cookies,
            callback = self.check_login
        )

    def check_login(self, response):
        print(response.text)
        yield Request(  # move on to the next step (e.g. upvoting)
            url="https://dig.chouti.com",
            callback=self.like
        )

    def like(self, response):
        id_list = Selector(response=response).xpath("//div[@share-linkid]/@share-linkid").extract()
        for nid in id_list:  # upvote every hot-news entry
            url = "https://dig.chouti.com/link/vote?linksId=%s" % nid
            yield Request(
                url=url,
                method="POST",
                cookies=self.cookie_dict,
                callback=self.show
            )
        page_urls=Selector(response=response).xpath("//div[@id='dig_lcpage']//a/@href").extract()
        for page in page_urls:
            url="https://dig.chouti.com%s"%page
            yield Request(url=url,callback=self.like)#recurse through the pager so the news on every page gets upvoted

    def show(self, response):
        print(response.text)
#cnblogs.py:
import scrapy

class CnblogsSpider(scrapy.Spider):
    name = 'cnblogs'
    allowed_domains = ['cnblogs.com']
    start_urls = ['http://cnblogs.com/']

    def parse(self, response):
        pass
#duplication.py:
from scrapy.dupefilters import BaseDupeFilter

class RepeatFilter(BaseDupeFilter):
    def __init__(self, path=None, debug=False):
        self.visited_urls=set()
    #None of the following methods may be renamed:
    @classmethod
    def from_settings(cls, settings):#cls is the class itself (here RepeatFilter)
        #Scrapy creates the filter internally via RepeatFilter.from_settings()
        return cls()
    def request_seen(self, request):
        if request.url in self.visited_urls:
            return True
        else:
            self.visited_urls.add(request.url)
            return False
    def request_fingerprint(self, request):
        print("request_fingerprint")
    def close(self, reason):#called when the crawl finishes
        print("close")
    def log(self, request, spider):#logging hook
        print("log")
#items.py:
import scrapy

class ChoutiItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title=scrapy.Field()
    href=scrapy.Field()
#middlewares.py:
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter

class DownMiddleware1(object):
    #Note: a downloader middleware may only define these three methods
    def process_request(self,request,spider):
        '''
        Called for every request that needs to be downloaded, through each downloader middleware's process_request()
        :param request: the Request object
        :param spider: the Spider object
        :return:
            None: hand the request to the next downloader middleware's process_request()
            Response object: stop calling process_request() and start calling process_response()
            Request object: stop the middleware chain and put the Request back into the scheduler
            raise IgnoreRequest: process_exception() is called
        '''
        pass
    def process_response(self,request,response,spider):
        '''
        Called on the way back once the download has finished
        :param request: the Request object
        :param response: the Response object
        :param spider: the Spider object
        :return:
            Response object: hand it to the next downloader middleware's process_response()
            Request object: stop the middleware chain; the request is rescheduled for download
            raise IgnoreRequest: Request.errback() is called
        '''
        print('response1')
        return response
    def process_exception(self,request,exception,spider):
        '''
        Called when a download handler or process_request() raises an exception
        :param request: the Request object
        :param exception: the exception that was raised
        :param spider: the Spider object
        :return:
            None: hand the exception to the next downloader middleware's process_exception()
                  (if every process_exception() returns None, the exception propagates as an error)
            Response object: stop calling the remaining process_exception() methods
            Request object: stop the middleware chain; the request is rescheduled for download
        '''
        return None

class SpiderMiddleware(object):
    def process_spider_input(self,response,spider):
        '''
        Called once the download has finished, before the response is handed to parse()
        :param response: the Response object passed in by the downloader
        :param spider: the Spider object
        :return:
        '''
        pass
    def process_spider_output(self,response,result,spider):
        '''
        Called when the spider has finished processing and returns, before the results go to the scheduler or the pipelines
        :param response: the Response object
        :param result: the Request or Item objects returned by parse()
        :param spider: the Spider object
        :return: must return an iterable of Request or Item objects
        '''
        return result
    def process_spider_exception(self,response, exception, spider):
        '''
        Called when an exception is raised
        :param response:
        :param exception:
        :param spider:
        :return:
            None: keep passing the exception to the remaining middlewares
            an iterable of Response or Item objects: handed to the scheduler or the pipelines
        '''
        return None
    def process_start_requests(self,start_requests,spider):
        '''
        Called when the spider starts; not called again as the crawl goes deeper
        :param start_requests: an iterable of Request objects built from the start URLs
        :param spider: the Spider object
        :return: an iterable of Request objects
        '''
        return start_requests
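To make process_request() concrete, here is a minimal illustrative downloader middleware (an assumption added for demonstration, not part of the original project and not registered in the settings.py below) that attaches a default User-Agent and then returns None so the download proceeds normally:

class UserAgentDownMiddleware(object):
    def process_request(self, request, spider):
        # modify the request in place, then return None so the next middleware / the downloader takes over
        request.headers.setdefault(b'User-Agent', b'Mozilla/5.0 (scrapy1 demo)')
        return None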
#pipelines.py:
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem

class Scrapy1Pipeline:
    def __init__(self,conn_s):
        self.conn_s=conn_s
        self.conn=None
    def open_spider(self,spider):
        self.f=open("news.json","a+")
    def process_item(self,item,spider):
        if spider.name=="chouti":
            print(item,spider)
            tpl="%s\n%s\n\n"%(item["title"],item["href"])
            self.f.write(tpl)
        return item
    def close_spider(self,spider):
        self.f.close()
    @classmethod
    def from_crawler(cls,crawler):
        conn_s=crawler.settings.getint('DB')
        return cls(conn_s)

class Scrapy1Pipeline2:
    def process_item(self,item,spider):
        if spider.name=="cnblogs":
            print(item)
            raise DropItem()
        return item#pass items from other spiders on to the next pipeline
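Note on ordering: Scrapy1Pipeline2 is registered with priority 200 in ITEM_PIPELINES below, so it runs before Scrapy1Pipeline (300); when it raises DropItem for a cnblogs item, that item never reaches Scrapy1Pipeline.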
#crawlall.py:
from scrapy.commands import ScrapyCommand
from scrapy.utils.project import get_project_settings

class Command(ScrapyCommand):
    requires_project=True
    def syntax(self):
        return '[options]'
    def short_desc(self):#the description shown for this command by scrapy --help
        return 'Runs all of the spiders'
    def run(self,args,opts):#opts holds the options passed to the command
        #Collect the names of all spiders in the project:
        spider_list=self.crawler_process.spiders.list()
        #Start crawling (the built-in crawl command works in much the same way):
        for name in spider_list:
            self.crawler_process.crawl(name,**opts.__dict__)
        self.crawler_process.start()
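Usage note (assuming the project layout implied by the settings below): with COMMANDS_MODULE='scrapy1.commands' set in settings.py and crawlall.py placed inside that package (alongside an __init__.py), running "scrapy crawlall" from the project directory starts every spider in the project at once.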
#settings.py:
BOT_NAME = 'scrapy1'
SPIDER_MODULES = ['scrapy1.spiders']
NEWSPIDER_MODULE = 'scrapy1.spiders'
SPIDER_MIDDLEWARES = {
    'scrapy1.middlewares.SpiderMiddleware': 550,
}
DOWNLOADER_MIDDLEWARES = {
    'scrapy1.middlewares.DownMiddleware1': 550,
}
EXTENSIONS = {
    #'scrapy.extensions.telnet.TelnetConsole': None,
}
ITEM_PIPELINES = {
    'scrapy1.pipelines.Scrapy1Pipeline': 300,
    'scrapy1.pipelines.Scrapy1Pipeline2': 200,
}
DEPTH_LIMIT=4#maximum crawl depth
#e.g. with a limit of 1, the URLs found on the start page (call them URL1) are visited, but the URLs found on the URL1 pages are not
#DUPEFILTER_CLASS='scrapy1.duplication.RepeatFilter'#custom deduplication class
COMMANDS_MODULE='scrapy1.commands'
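A single spider can still be run on its own in the usual way, e.g. scrapy crawl chouti (add --nolog to suppress the log output).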