爬取思路:
1.分析ajax请求,找到存放图片地址的json
2.解析json数据,提取其中的图片url
3.再次请求图片url,通过open()和write()方法将图片保存至本地。
废话少说,直接上代码:
前提条件是在当前.py文件同级目录下新建一个beauty360的文件夹用来保存图片
import requests
import time
import re
# Endpoint requested by get_json(); the query parameters are passed separately via `params`.
base_url="https://image.so.com/z?"
# Running image counter: get_pic() increments it and uses it to build the saved file names.
num=1
# Headers sent with the JSON listing request made in get_json().
# NOTE(review): the Cookie value is hard-coded — presumably copied from a browser
# session; confirm it is still accepted by the server.
headers={
"Host": "image.so.com",
"Referer": "https://image.so.com/zv?ch=beauty",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
"X-Requested-With": "XMLHttpRequest",
"Cookie": "__guid=16527278.4407656107534301000.1546852761488.196; __guid=100021698.456336978600101800.1546852883449.8489; count=2; tracker=; lightbox_thumb_visible=1; _S=ab9f5ecb680ae35247705feda8f5bda4; test_cookie_enable=null"
}
# Headers sent with each individual image download made in get_pic().
header1={
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9',
'cache-control':'max-age=0',
'upgrade-insecure-requests':'1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}
# Fetch the JSON document that holds the concrete image URLs for one page.
def get_json(page):
    """Request one listing page and return its decoded JSON body.

    Args:
        page: Value sent as the ``i`` query parameter of the listing request.

    Returns:
        The parsed JSON object when the server answers with HTTP 200, or
        ``None`` when the request fails, the status is not 200, or the body
        cannot be decoded as JSON.
    """
    paras = {
        "ch": "beauty",
        "a": "jsonpViewScroll",
        "i": page,
        "count": 30,
    }
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(base_url, params=paras, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.json()
    except (requests.RequestException, ValueError):
        # RequestException: network-level failure; ValueError: body is not JSON.
        # Catching only these lets Ctrl+C and programming errors propagate.
        print("wrong url.")
    return None
# Extract the image URLs from the listing JSON and save each image locally.
def get_pic(json):
    """Download every image referenced by a listing page into ``./beauty360/``.

    Walks ``json["data"][*]["groupdata"][*]["qhimg_url"]``, downloads each URL
    and writes the response body to ``./beauty360/<num>.jpg``, where ``num`` is
    the module-level counter. The counter is advanced only after a successful
    save, so numbering starts at its initial value and has no gaps.

    Args:
        json: Parsed listing object as returned by ``get_json``; ``None`` or an
            object without a non-empty ``"data"`` entry is ignored.

    Returns:
        None.
    """
    global num
    import os  # local import: only needed to make sure the target folder exists

    # get_json() yields None on failure; there is nothing to do in that case.
    if not json:
        return
    datas = json.get("data")
    if not datas:
        return

    # Create the output folder on demand instead of requiring it beforehand.
    os.makedirs("./beauty360", exist_ok=True)

    for item in datas:
        for group in item.get("groupdata") or []:
            # Entries without a URL are skipped instead of raising on None.strip().
            picurl = (group.get("qhimg_url") or "").strip()
            if not picurl:
                continue
            try:
                response = requests.get(picurl, headers=header1, timeout=10)
            except requests.RequestException as err:
                # One unreachable image must not abort the remaining downloads.
                print("download failed: " + picurl + " (" + str(err) + ")")
                continue
            if response.status_code != 200:
                continue
            print(str(num) + ": " + picurl)
            with open(r"./beauty360/" + str(num) + ".jpg", "wb") as fp:
                fp.write(response.content)
            # Increment after saving so the first file uses the initial counter value.
            num = num + 1
if __name__ == "__main__":
    # Crawl listing pages 1..599, pausing between pages to limit the request rate.
    for page in range(1, 600):
        page_json = get_json(page)
        # get_json() returns None when the page could not be fetched or decoded;
        # skip such pages instead of handing None to get_pic().
        if page_json is not None:
            get_pic(page_json)
        time.sleep(3)