
An Illiterate's Python Beginner Diary: Day 31, slightly polishing the crawler class to support HTTP/2.0, plus the complete code so far

More and more sites are served over HTTP/2.0 now, and their anti-scraping measures validate request headers that only exist in 2.0. That means the crawler class we wrote earlier no longer works on them and needs an adjustment, though only a very small one.

First, install the hyper package. Remember to run this on the command line, not inside the Python shell:

pip install hyper
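
To confirm the install took, a quick import check is enough (a minimal sketch; it only verifies that hyper's requests adapter can be loaded):

# raises ImportError if hyper is missing
from hyper.contrib import HTTP20Adapter
print(HTTP20Adapter.__name__)  # prints: HTTP20Adapter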

Then adjust the Http method, adding one parameter:

def Http(self,url,method=Method.GET,postdata=None,http=1):
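
Callers can then opt in per request (a sketch; example.com is a placeholder, and spider is the module name used in the examples further down):

from spider import Ajax
ajax = Ajax()
html = ajax.Http('https://example.com/')           # HTTP/1.1, the default
html = ajax.Http('https://example.com/', http=2)   # routed through the HTTP/2 adapter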

Then, just before the send call, add a check:

if http == 2:
	self.__session.mount(url,HTTP20Adapter())
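
For reference, this is what that mount does on its own, outside the class (a minimal sketch; example.com is just a placeholder):

import requests
from hyper.contrib import HTTP20Adapter

session = requests.Session()
# mount() selects an adapter by longest matching URL prefix, so every
# request whose URL starts with this prefix goes through hyper's HTTP/2 adapter
session.mount('https://example.com', HTTP20Adapter())
res = session.get('https://example.com/')
print(res.status_code)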

That's it. Pretty convenient, right?

Now, here is Lao Gu's current crawler class in full. Feel free to use it as a reference, and any suggestions for improvement are welcome.

import gzip
import re
import requests
import zlib
from enum import Enum
from hyper.contrib import HTTP20Adapter

class Ajax:
	def __init__(self):
		self.version = '0.1'
		self.agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3870.400 QQBrowser/10.8.4405.400'
		self.refer = ''
		self.cache = 'no-cache'
		self.lang = 'zh-CN,zh;q=0.9'
		self.encoding = 'gzip, deflate, br'
		self.accept = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
		self.current_url = ''
		self.method = self.Method.GET
		self.charset = 'utf-8'
		self.cookies = requests.utils.cookiejar_from_dict({})
		self.__content_encoding = ''
		self.__session = requests.Session()
		self.html = ''
		self.redirect = True
		self.__requestHeaders = {}
		self.__refreshRequestHeaders()
	
	class Method(Enum):
		GET = 1
		POST = 2
		HEAD = 3
		PUT = 4
		DELETE = 5
		OPTIONS = 6
		TRACE = 7
		PATCH = 8

	@property
	def ResponseHeader(self):
		return self.__responseHeaders
	
	@property
	def Session(self):
		return self.__session

	@property
	def Header(self):
		return self.__requestHeaders
	
	def Download(self,url,filename,method=Method.GET,postdata=None):
		self.__refreshRequestHeaders()
		if postdata!=None:
			if isinstance(postdata,str):
				postdata = {n.group(1):n.group(2) for n in re.finditer('([^&=]+)=([^&]*)',postdata)}
		req = requests.Request(method=method.name,url=url,headers=self.__requestHeaders,data=postdata)
		pre = self.__session.prepare_request(req)
		res = self.__session.send(pre)
		self.status = res.status_code
		data = res.content
		with open(filename,'wb') as f:
			f.write(data)

	def Http(self,url,method=Method.GET,postdata=None,http=1):
		"""When filling in url, append a "/" after directory-style paths to help the path-completion logic.

		method must be one of the enums provided by the Ajax class itself; strings are not supported. Defaults to GET.

		postdata is the form data to submit; dict and str are both supported.

		The Http method does not support file uploads.

		For anything beyond the parameters listed here, call the corresponding method:

		To add request headers, use the .AddHeader method.

		To add a cookie, use the .cookies.set method.

		To disable automatic redirects, use the .redirect attribute.

		Set refer via the .refer attribute; a value set through the request headers will be overwritten.
		"""
		self.__refreshRequestHeaders()
		self.current_url = re.sub(r'\\','/',url)
		_pd = None
		if postdata!=None:
			if isinstance(postdata,str):
				_pd = {n.group(1):n.group(2) for n in re.finditer('([^&=]+)=([^&]*)',postdata)}
				if len(str(_pd))<len(postdata):
					_pd = postdata
			else:
				_pd = postdata
		req = requests.Request(method=method.name,url=url,headers=self.__requestHeaders,data=_pd)
		if _pd != None and isinstance(_pd,str):
			req.headers.update({'content-length':str(len(_pd))})
		pre = self.__session.prepare_request(req)
		self.__session.cookies = self.cookies
		if http == 2:
			self.__session.mount(url,HTTP20Adapter())
		res = self.__session.send(request=pre,allow_redirects=self.redirect)
		self.current_url = res.url
		self.cookies = self.__session.cookies
		enc = re.sub(r'[-]','',res.encoding) if res.encoding != None else re.sub(r'[-]','',self.charset)
		if enc == 'ISO88591':
			charset = re.findall('''<meta[^<>]*?charset=['"]?([^'""]+)['"\\s]?''',res.text,re.I)
			encoding = re.findall(r'<\?xml version="1.0" encoding="([^"]+)"\?>',res.text,re.I)
			if len(charset) > 0:
				enc = re.sub(r'[-]','',charset[0])
			elif len(encoding) > 0:
				enc = encoding[0]
			else:
				enc = re.sub(r'[-]','',self.charset)
		self.status = res.status_code
		self.__responseHeaders = str(res.headers)
		self.__parseHeaders()
		if method == self.Method.HEAD:
			return self.__responseHeaders
		data = res.content
		if 'gzip' in self.__content_encoding:
			self.html = gzip.decompress(data).decode(enc)
		elif 'deflate' in self.__content_encoding:
			try:
				self.html = zlib.decompress(data, -zlib.MAX_WBITS).decode(enc)
			except zlib.error:
				self.html = zlib.decompress(data).decode(enc)
		else:
			try:
				self.html = data.decode(encoding=enc)
			except UnicodeDecodeError:
				# fall back to the default charset, ignoring bytes that cannot be decoded
				self.html = data.decode(encoding=re.sub(r'[-]','',self.charset),errors='ignore')
		self.html = self.__url_complemented(self.html)
		return self.html

	def AddHeader(self,key:str = None,val:str = None,dic:dict = None):
		"""Add request header entries

		AddHeader(key,val)

		AddHeader(dic={key:val})"""
		if dic != None and isinstance(dic,dict):
			self.__requestHeaders.update(dic)
		if key != None and val != None:
			self.__requestHeaders.update({key:val})

	def __parseHeaders(self):
		headers = {n.group(1).lower():n.group(2).strip() for n in re.finditer('([^\r\n:]+):([^\r\n]+)',self.__responseHeaders)}
		if 'content-encoding' in headers:
			self.__content_encoding = headers['content-encoding']
		else:
			self.__content_encoding = ''
	
	def __refreshRequestHeaders(self):
		self.__requestHeaders.update({'referer':self.refer
			,'user-agent':self.agent
			,'accept':self.accept
			,'accept-encoding':self.encoding
			,'accept-language':self.lang
			,'cache-control':self.cache})

	def __url_complemented(self,html):
		html = re.sub('''(url|src|href)\\s*=\\s*(['"]?)([^'"]*)(\\2|[\\s\\r\\n\\t])''',self.__url_replace,html,count=0,flags=re.I)
		return html

	def __url_replace(self,m):
		url = m.group(3).strip()
		# about:setting / about:blank style, javascript:void(0) style, and "#" anchors: return unchanged
		if re.search('^(#.*|javascript:.*|[a-z_-]+:[a-z_-]+)$',url,re.I):
			return m.group(0)
		# URLs that already carry a protocol are returned unchanged, e.g. https://, ftp://, file://, tencent://
		if re.search('^[a-z]+://',url,re.I):
			return m.group(0)
		# base64 data: return unchanged
		if re.search('^base64',url,re.I):
			return m.group(0)
		# note: flags must be passed by keyword here; the fourth positional argument of re.sub is count
		root = re.sub(r'^([a-z]+:/{2,}[^/]+).*','\\1/',self.current_url.strip(),flags=re.I)
		if re.search('^/(?!/)',url,re.I):
			url = re.sub('^/',root,url,flags=re.I)
		elif re.search('^//',url):
			url = re.sub('^([a-z]+:)//.*$','\\1',root,flags=re.I) + url
		else:
			path = re.sub('/[^/]*$','',self.current_url) + '/'
			p = re.search(r'^[\./]+',url,re.I)
			if p:
				# the URL starts with "./"-style relative segments
				# grab the whole leading run of dots and slashes
				p = p.group(0)
				# strip every "./"; the dots left over give the levels to climb
				p = re.sub(r'\./','',p)
				# the count of remaining dots is the number of parent levels
				p = len(p)
				pth = path
				for i in range(p):
					# drop one trailing path segment per parent level
					pth = re.sub('[^/]+/$','',pth)
				if len(pth)<len(root):
					pth = root
				url = pth + re.sub(r'^[\./]+','',url)
			else:
				# no "./" relative segments: resolve against the current path
				url = path + url
		return m.group(1) + '=' + m.group(2) + url + m.group(4)


Usage examples

from spider import Ajax
payload = r'''{"query":"\n    query onlineNumberHistory{\n      onlineNumberHistory(tag: CCTXFFC, rowsPerPage: 20 , page:10000){\n        ...data\n        ...page\n      }\n    }\n    fragment data on OnlineNumberPage{\n      data{\n        id\n        tag\n        number\n        change\n        time\n      }\n    }\n    fragment page on OnlineNumberPage{\n      page{\n        current\n        total\n        count\n      }\n    }\n    ","variables":{}}'''
ajax = Ajax()
ajax.accept = '*/*'
ajax.AddHeader('origin','https://77tj03.com/')
ajax.AddHeader(':authority','bkd.77tj.cc')
ajax.AddHeader(':path','/graphql')
ajax.AddHeader(':scheme','https')
ajax.AddHeader(':method','POST')
ajax.AddHeader('pragma','no-cache')
ajax.refer = 'https://77tj03.com/'
ajax.AddHeader('content-type','application/json;charset=utf-8')
#ajax.AddHeader(dic={'content-length':str(len(payload))})
ajax.AddHeader('operationname','onlineNumberHistory')
xhr = ajax.Http('https://bkd.77tj.cc/graphql',Ajax.Method.POST,payload,http=2)

Because the request is built with requests, POSTed data passed as a dict is automatically form-encoded; this payload, however, is itself a JSON string and must not be turned into a dict, so the Http method was adjusted accordingly. Also, content-length is computed automatically, so there is no need to set it by hand in the request headers. This example is an HTTP/2.0 scrape.
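
The difference in a nutshell (a minimal sketch against a placeholder endpoint):

import requests

session = requests.Session()

# a dict gets form-encoded: the body becomes a=1&b=2
form = requests.Request('POST','https://example.com/api',data={'a':'1','b':'2'})
print(session.prepare_request(form).body)   # a=1&b=2

# a str is sent verbatim, which is exactly what a raw JSON payload needs
raw = requests.Request('POST','https://example.com/api',data='{"a":1}',
	headers={'content-type':'application/json'})
print(session.prepare_request(raw).body)    # {"a":1}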

from spider import Ajax
ajax = Ajax()
ajax.refer = 'https://www.360.cn/brain_of_security/'
ajax.AddHeader('host','inf.safe.360.cn')
xhr = ajax.Http('https://inf.safe.360.cn/api/c?callback=jQuery112408270818558880948_1625620054847&_=1625620054952')

And this example shows that the very same class can still scrape HTTP/1.1 sites.

--------------

The crawler class's remaining to-dos will get done when there's a chance, heh.