天天看点

requests-爬取美女图片源码

爬取思路:

1.分析ajax请求,找到存放图片地址的json

2.解析json数据,提取其中的图片url

3.再次请求图片url,通过open()和write()方法将图片保存至本地。

废话少说,直接上代码:

前提条件是在当前.py文件同级目录下新建一个beauty360的文件夹用来保存图片

import os
import re
import time

import requests

# Endpoint that serves paginated JSON listings of images for a channel.
base_url="https://image.so.com/z?"
# Running counter used to number the saved files; incremented for each
# successfully downloaded image (module-level, mutated via `global`).
num=1
# Headers for the ajax/JSON listing request. Referer and
# X-Requested-With mimic the site's own infinite-scroll requests;
# the Cookie value was captured from a browser session.
headers={
"Host": "image.so.com",
"Referer": "https://image.so.com/zv?ch=beauty",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
"X-Requested-With": "XMLHttpRequest",
"Cookie": "__guid=16527278.4407656107534301000.1546852761488.196; __guid=100021698.456336978600101800.1546852883449.8489; count=2; tracker=; lightbox_thumb_visible=1; _S=ab9f5ecb680ae35247705feda8f5bda4; test_cookie_enable=null"
}

# Plain browser-style headers used when fetching the image binaries
# themselves (a normal GET, not an ajax request).
header1={
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9',
'cache-control':'max-age=0',
'upgrade-insecure-requests':'1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}
# Fetch the JSON payload that lists image URLs for one scroll "page".
def get_json(page):
    """Request one page of image metadata from the 360 image API.

    Args:
        page: scroll-page index, sent as the ``i`` query parameter.

    Returns:
        The decoded JSON object (a dict) on HTTP 200, or ``None`` when
        the request fails, the status is not 200, or the body is not
        valid JSON.
    """
    paras = {
        "ch": "beauty",
        "a": "jsonpViewScroll",
        "i": page,
        "count": 30
    }
    try:
        response = requests.get(base_url, params=paras, headers=headers)
        if response.status_code == 200:
            return response.json()
    # Catch only request/decode failures instead of a bare `except:`,
    # which would also swallow KeyboardInterrupt and real bugs.
    # ValueError covers a 200 response whose body is not JSON.
    except (requests.RequestException, ValueError):
        print("request failed for page " + str(page))
    return None
        
# Parse image URLs out of the API response and save each image locally.
def get_pic(json):
    """Download every image referenced in *json* into ./beauty360/.

    Args:
        json: decoded API response dict from get_json(), or ``None``
            when the fetch failed; image URLs are expected under
            ``data[*].groupdata[*].qhimg_url``.

    Side effects:
        Creates the output directory if needed, increments the
        module-level counter ``num``, and writes one ``<num>.jpg``
        file per successfully downloaded image.
    """
    global num
    # get_json() returns None on failure; bail out instead of crashing
    # on None.get(...).
    if not json:
        return
    datas = json.get("data")
    if not datas:
        return
    # Create the target folder so the script works without the manual
    # setup step; no-op if it already exists.
    os.makedirs("./beauty360", exist_ok=True)
    for item in datas:
        groupdatas = item.get("groupdata")
        if not groupdatas:
            continue
        for group in groupdatas:
            picurl = group.get("qhimg_url")
            # Skip entries without a URL; the original called .strip()
            # on None and crashed.
            if not picurl:
                continue
            picurl = picurl.strip()
            try:
                response = requests.get(picurl, headers=header1)
            # One bad image must not abort the whole crawl.
            except requests.RequestException:
                print("download failed: " + picurl)
                continue
            if response.status_code == 200:
                num = num + 1
                print(str(num) + ": " + picurl)
                with open(r"./beauty360/" + str(num) + ".jpg", "wb") as fp:
                    fp.write(response.content)

if __name__ == "__main__":
    # Walk scroll pages 1..599; sleep between requests to avoid
    # hammering the server.
    for page in range(1, 600):
        # Renamed from `json` to avoid shadowing the common module name.
        page_data = get_json(page)
        # get_json() returns None on failure; skip instead of passing
        # None into get_pic() and crashing.
        if page_data:
            get_pic(page_data)
        time.sleep(3)