More and more sites now serve over the HTTP/2.0 protocol, and as an anti-scraping measure some of them validate the request headers that are specific to HTTP/2. That means the scraping class we wrote earlier no longer works against them and needs an adjustment, though only a very small one.
First, install the hyper package. Remember to run this on the command line, not inside the Python interpreter:
pip install hyper
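To confirm the install worked, importing the adapter should exit silently; a quick sanity check, also run from the command line:
python -c "from hyper.contrib import HTTP20Adapter"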
Then adjust the Http method, appending one parameter:
def Http(self,url,method=Method.GET,postdata=None,http=1):
Then, right before sending the request, add a check:
if http == 2:
self.__session.mount(url,HTTP20Adapter())
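To see the idea in isolation: hyper ships a transport adapter that plugs into requests, so mounting it for a given URL prefix routes matching requests over HTTP/2 while everything else stays on the default HTTP/1.1 adapter. A minimal sketch outside the class (http2.example.com is a placeholder host):

import requests
from hyper.contrib import HTTP20Adapter

session = requests.Session()
# mount the HTTP/2 adapter for this prefix only; other URLs keep the default adapter
session.mount('https://http2.example.com', HTTP20Adapter())
res = session.get('https://http2.example.com/')
print(res.status_code)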
See? Very convenient.
Now I'll post Lao Gu's current scraping class here in full. Feel free to use it as a reference, and suggestions for improvement are welcome.
import gzip
import re
import requests
import zlib
from enum import Enum
from hyper.contrib import HTTP20Adapter
class Ajax:
def __init__(self):
self.version = '0.1'
self.agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3870.400 QQBrowser/10.8.4405.400'
self.refer = ''
self.cache = 'no-cache'
self.lang = 'zh-CN,zh;q=0.9'
self.encoding = 'gzip, deflate, br'
self.accept = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
self.current_url = ''
self.method = self.Method.GET
self.charset = 'utf-8'
self.cookies = requests.utils.cookiejar_from_dict({})
self.__content_encoding = ''
self.__session = requests.Session()
self.html = ''
self.redirect = True
        self.__requestHeaders = {}
        self.__responseHeaders = ''
        self.__refreshRequestHeaders()
class Method(Enum):
GET = 1
POST = 2
HEAD = 3
PUT = 4
DELETE = 5
OPTIONS = 6
TRACE = 7
PATCH = 8
@property
    def ResponseHeader(self):
        return self.__responseHeaders
@property
def Session(self):
return self.__session
@property
def Header(self):
return self.__requestHeaders
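    # Download: issue a request and write the raw response body straight to a local file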
def Download(self,url,filename,method=Method.GET,postdata=None):
self.__refreshRequestHeaders()
        if postdata is not None:
            if isinstance(postdata,str):
                postdata = {n.group(1):n.group(2) for n in re.finditer('([^&=]+)=([^&]*)',postdata)}
        req = requests.Request(method=method.name,url=url,headers=self.__requestHeaders,data=postdata)
        pre = self.__session.prepare_request(req)
        res = self.__session.send(pre)
        self.status = res.status_code
        with open(filename,'wb') as f:
            f.write(res.content)
def Http(self,url,method=Method.GET,postdata=None,http=1):
"""url 请填写时注意在路径类型后追加一个“/”,以方便路径补全计算
method 请使用 Ajax 类本身提供的枚举,不支持字符串类型传入,默认为 GET 方法
postdata 为提交的表单数据,支持字典和字符串
Http 方法不支持文件上传
Http 方法,除已列出的参数外,其他设置请调用相应的方法
追加请求头,请使用 .AddHeader 方法
追加 cookie ,请使用 .cookies.set 方法
关闭自动跳转,请使用 .redirect 属性
refer 设置通过 .refer 属性,使用请求头设置会被覆盖
"""
self.__refreshRequestHeaders()
self.current_url = re.sub(r'\\','/',url)
_pd = None
        if postdata is not None:
if isinstance(postdata,str):
_pd = {n.group(1):n.group(2) for n in re.finditer('([^&=]+)=([^&]*)',postdata)}
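                # heuristic: if rebuilding the string from the parsed dict loses characters (e.g. a raw JSON body), keep the original string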
if len(str(_pd))<len(postdata):
_pd = postdata
else:
_pd = postdata
req = requests.Request(method=method.name,url=url,headers=self.__requestHeaders,data=_pd)
        if isinstance(_pd,str):
            req.headers.update({'content-length':str(len(_pd.encode('utf-8')))})
pre = self.__session.prepare_request(req)
self.__session.cookies = self.cookies
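        # route requests for this URL prefix through hyper's HTTP/2 adapter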
if http == 2:
self.__session.mount(url,HTTP20Adapter())
res = self.__session.send(request=pre,allow_redirects=self.redirect)
self.current_url = res.url
self.cookies = self.__session.cookies
        enc = re.sub(r'[-]','',res.encoding) if res.encoding is not None else re.sub(r'[-]','',self.charset)
        if enc == 'ISO88591':
            charset = re.findall('''<meta[^<>]*?charset=['"]?([^'"\\s>]+)''',res.text,re.I)
            encoding = re.findall(r'<\?xml version="1.0" encoding="([^"]+)"\?>',res.text,re.I)
if len(charset) > 0:
enc = re.sub(r'[-]','',charset[0])
elif len(encoding) > 0:
enc = encoding[0]
else:
enc = re.sub(r'[-]','',self.charset)
self.status = res.status_code
self.__responseHeaders = str(res.headers)
self.__parseHeaders()
if method == self.Method.HEAD:
return self.__responseHeaders
data = res.content
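        # decompress manually according to the Content-Encoding response header (depending on the adapter, the body may arrive still compressed)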
if 'gzip' in self.__content_encoding:
self.html = gzip.decompress(data).decode(enc)
elif 'deflate' in self.__content_encoding:
try:
self.html = zlib.decompress(data, -zlib.MAX_WBITS).decode(enc)
except zlib.error:
self.html = zlib.decompress(data).decode(enc)
else:
try:
self.html = data.decode(encoding=enc)
except UnicodeDecodeError:
if enc != re.sub(r'[-]','',self.charset):
self.html = data.decode(encoding=re.sub(r'[-]','',self.charset),errors='ignore')
#self.html = data.decode(encoding='utf-8')
self.html = self.__url_complemented(self.html)
return self.html
def AddHeader(self,key:str = None,val:str = None,dic:dict = None):
"""追加请求头设置
AddHeader(key,val)
AddHeader(dic={key:val})"""
        if isinstance(dic,dict):
            self.__requestHeaders.update(dic)
        if key is not None and val is not None:
            self.__requestHeaders.update({key:val})
def __parseHeaders(self):
        headers = {n.group(1).lower():n.group(2).strip() for n in re.finditer('([^\r\n:]+):([^\r\n]+)',self.__responseHeaders)}
        if 'content-encoding' in headers:
            self.__content_encoding = headers['content-encoding']
        else:
            self.__content_encoding = ''
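    # push the public header attributes (refer, agent, accept, ...) into the real request-header dict before each request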
def __refreshRequestHeaders(self):
        self.__requestHeaders.update({'referer':self.refer
            ,'user-agent':self.agent
            ,'accept':self.accept
            ,'accept-encoding':self.encoding
            ,'accept-language':self.lang
            ,'cache-control':self.cache})
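    # rewrite relative url/src/href attribute values in the fetched HTML into absolute URLs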
def __url_complemented(self,html):
html = re.sub('''(url|src|href)\\s*=\\s*(['"]?)([^'"]*)(\\2|[\\s\\r\\n\\t])''',self.__url_replace,html,count=0,flags=re.I)
return html
def __url_replace(self,m):
url = m.group(3).strip()
        # about:setting / about:blank style, javascript:void(0) style, and "#" fragments are returned unchanged
if re.search('^(#.*|javascript:.*|[a-z_-]+:[a-z_-]+)$',url,re.I):
return m.string[m.span()[0]:m.span()[1]]
        # URLs that already carry a scheme are returned unchanged, e.g. https://, ftp://, file://, tencent://, etc.
if re.search('^[a-z]+://',url,re.I):
return m.string[m.span()[0]:m.span()[1]]
        # base64 data is returned unchanged
if re.search('^base64',url,re.I):
return m.string[m.span()[0]:m.span()[1]]
        root = re.sub(r'^([a-z]+:/{2,}[^/]+).*','\\1/',self.current_url.strip(),flags=re.I)
        if re.search('^/(?!/)',url,re.I):
            url = re.sub('^/',root,url,flags=re.I)
        elif re.search('^//',url):
            url = re.sub('^([a-z]+:)//.*$','\\1',root,flags=re.I) + url
else:
            path = re.sub('/[^/]*$','',self.current_url) + '/'
            p = re.search(r'^[\./]+',url,re.I)
            if p:
                # the URL starts with a "./" or "../" relative prefix
                # grab the entire leading relative prefix
                p = p.string[p.span()[0]:p.span()[1]]
                # strip every "./" from the prefix; the dots that remain mark the levels to climb
                p = re.sub(r'\./','',p)
                # the count of remaining dots gives the number of levels
                p = len(p)
                pth = path
                for i in range(p):
                    # go up one directory per level by dropping the trailing path segment
                    pth = re.sub('[^/]+/$','',pth)
                if len(pth)<len(root):
                    pth = root
                url = pth + re.sub(r'^[\./]+','',url)
            else:
                # no relative prefix: resolve against the current directory
                url = path + url
return m.group(1) + '=' + m.group(2) + url + m.group(4)
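A quick aside before the full examples: the Download method above writes a response body straight to disk, so grabbing a binary file is a one-liner. A hypothetical call (URL and filename are placeholders):

from spider import Ajax
ajax = Ajax()
ajax.Download('https://www.example.com/logo.png', 'logo.png')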
Usage example
from spider import Ajax
payload = r'''{"query":"\n query onlineNumberHistory{\n onlineNumberHistory(tag: CCTXFFC, rowsPerPage: 20 , page:10000){\n ...data\n ...page\n }\n }\n fragment data on OnlineNumberPage{\n data{\n id\n tag\n number\n change\n time\n }\n }\n fragment page on OnlineNumberPage{\n page{\n current\n total\n count\n }\n }\n ","variables":{}}'''
ajax = Ajax()
ajax.accept = '*/*'
ajax.AddHeader('origin','https://77tj03.com/')
ajax.AddHeader(':authority','bkd.77tj.cc')
ajax.AddHeader(':path','/graphql')
ajax.AddHeader(':scheme','https')
ajax.AddHeader(':method','POST')
ajax.AddHeader('pragma','no-cache')
ajax.refer = 'https://77tj03.com/'
ajax.AddHeader('content-type','application/json;charset=utf-8')
#ajax.AddHeader(dic={'content-length':str(len(payload))})
ajax.AddHeader('operationname','onlineNumberHistory')
xhr = ajax.Http('https://bkd.77tj.cc/graphql',Ajax.Method.POST,payload,http=2)
Because the request is built with requests, POSTed data normally gets converted into a dict automatically. But this payload is itself a JSON string and must not be turned into a dict, which is why the Http method includes the adjustment shown above. Also note that content-length is computed automatically, so there is no need to set it in the request headers yourself. This example is an HTTP/2.0 scrape.
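For reference, the distinction that heuristic relies on can be seen with plain requests; a small illustration (httpbin.org is just a convenient echo service):

import requests
# a dict is form-encoded as application/x-www-form-urlencoded
requests.post('https://httpbin.org/post', data={'a': '1'})
# a string is sent verbatim, so a JSON body must stay a string
requests.post('https://httpbin.org/post', data='{"a": 1}', headers={'content-type': 'application/json'})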
from spider import Ajax
ajax = Ajax()
ajax.refer = 'https://www.360.cn/brain_of_security/'
ajax.AddHeader('host','inf.safe.360.cn')
xhr = ajax.Http('https://inf.safe.360.cn/api/c?callback=jQuery112408270818558880948_1625620054847&_=1625620054952')
And this example shows that the same class can also scrape an HTTP/1.1 site.
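Since that endpoint returns JSONP (note the callback= parameter), you would normally strip the callback wrapper before parsing; a sketch, assuming xhr holds the response text:

import json, re
m = re.search(r'^[^(]*\((.*)\)\s*;?\s*$', xhr, re.S)
if m:
    data = json.loads(m.group(1))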
--------------
As for the other features this scraping class still needs, we'll get to them when the chance comes up, heh.