
python - requests_html - scraping 本兮 images

Table of Contents

  • Preface
  • 1. What is requests_html?
  • 2. Usage
  • Summary

Preface

As technology keeps advancing, web scraping is in ever greater demand, and many people have started learning it. This article walks through the basics from my own study of scraping.

Note: the body of the article follows; the example below is provided for reference.

1. What is requests_html?

requests_html is a layer built on top of requests, created to simplify scraping and make it easy to fetch pages and resources; it comes from the same author as requests.
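As a quick taste of the API, here is a minimal sketch of the standard HTMLSession workflow (python.org is just a placeholder target, not part of this article's scraper):

from requests_html import HTMLSession

session = HTMLSession()
r = session.get('https://python.org/')

print(r.html.find('title', first=True).text)  # CSS-selector lookup, first match only
print(r.html.links)                           # the set of every href on the page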

2. Usage

Self-taught Python web scraping.

Using requests_html to scrape 本兮 images from a Bing image search:

import os
import random
import time

import requests
from requests_html import HTMLSession, AsyncHTMLSession
from pprint import pprint
class Download:
    '''Download a resource from a URL to the local Desktop.'''
    def __init__(self, name, url):
        self.name = name
        self.url = url

    def download_txt(self):
        '''
        Append the URL to a text file on the Desktop.
        '''
        desktop_path = os.path.join(os.path.expanduser('~'), "Desktop")
        path = os.path.join(desktop_path, self.name)
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, 'a', encoding="utf-8") as f:  # append mode; the context manager closes the file
            f.write(self.url)
    
    def download_picture(self):
        '''
        Save a jpg image, printing progress roughly every two seconds.
        '''
        headers = {'Proxy-Connection': 'keep-alive'}
        r = requests.get(self.url, stream=True, headers=headers)
        print(r)
        length = float(r.headers['content-length'])
        desktop_path = os.path.join(os.path.expanduser('~'), "Desktop")
        path = os.path.join(desktop_path, self.name)
        os.makedirs(os.path.dirname(path), exist_ok=True)
        count = 0
        count_tmp = 0
        time1 = time.time()
        with open(path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=512):
                if chunk:
                    f.write(chunk)
                    count += len(chunk)
                    if time.time() - time1 > 2:
                        p = count / length * 100
                        speed = (count - count_tmp) / 1024 / 2  # KB/s averaged over the 2s window
                        count_tmp = count
                        print(self.name + ': ' + self.formatFloat(p) + '%' + ' Speed: ' + self.formatFloat(speed) + 'KB/S')
                        time1 = time.time()

    @staticmethod
    def formatFloat(num):
        return '{:.2f}'.format(num)

class BenXiTest:
    def __init__(self, url):
        self.start_url = url
        self.session = HTMLSession()  # synchronous session
        self.aSession = AsyncHTMLSession()  # asynchronous session (not exercised below)
        users = {  # a pool of user-agent strings so the UA can vary between requests
                1: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0',
                2: 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
                3: 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
                4: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
                5: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
        }
        self.headers = random.choice(list(users.values()))  # pick one user-agent string at random

    def get_response(self):
        """Send the request and return a requests_html HTML object."""
        r = self.session.get(self.start_url, headers={'user-agent': self.headers})
        print("response status", r)

        return r.html

    def fast_get_urls(self):
        """Quickly collect the URLs on the page."""
        html = self.get_response()

        # html.links gathers the href of every <a> tag on the page
        urls = html.links
        # pprint(urls)

        # html.absolute_links does the same, but resolves each href to an absolute URL
        absolute_urls = html.absolute_links
        pprint(absolute_urls)

    def get_data(self):
        """Extract image URLs with CSS selectors and download the pictures."""
        html = self.get_response()
        a_list = html.find('div img')  # every <img> nested inside a <div>
        # pprint(a_list)

        count = 0
        for a_li in a_list:
            picture_url = a_li.attrs.get('src')
            print(picture_url)
            if picture_url is None or 'https' not in picture_url:
                continue
            Download(os.path.join('本兮', '本兮高清图片' + str(count) + '.jpg'), picture_url).download_picture()
            count += 1
        print("src pass finished!\n")
        for a_li in a_list:
            picture_url = a_li.attrs.get('data-src')
            print(picture_url)
            if picture_url is None or 'https' not in picture_url:
                continue
            Download(os.path.join('本兮', '本兮高清图片' + str(count) + '.jpg'), picture_url).download_picture()
            count += 1
        print("data-src pass finished!")

    def load_js(self):
        """Render the page's JavaScript (downloads Chromium on first use)."""
        html = self.get_response()

        # render() can also execute a JS snippet inside the page and return its result:
        # script = """
        #     () => {
        #         return {
        #             width: document.documentElement.clientWidth,
        #             height: document.documentElement.clientHeight,
        #             deviceScaleFactor: window.devicePixelRatio,
        #         }
        #     }
        #     """
        # val = html.render(script=script, reload=False)
        # print(val)

        # render() loads the JS for us (it drives pyppeteer under the hood)
        # html.render(wait=3)
        print(html.html)


if __name__ == '__main__':
    url = 'https://cn.bing.com/images/search?q=%e6%9c%ac%e5%85%ae%e5%9b%be%e7%89%87%e5%a3%81%e7%ba%b8%e9%ab%98%e6%b8%85&qpvt=%e6%9c%ac%e5%85%ae%e5%9b%be%e7%89%87%e5%a3%81%e7%ba%b8%e9%ab%98%e6%b8%85&form=IGRE&first=1&tsc=ImageBasicHover'
    test = BenXiTest(url)
    test.get_data()

The url passed in here is the Bing image-search request whose results the script scrapes.
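The class also instantiates an AsyncHTMLSession that the example never exercises. As a minimal sketch of what it is for (the two URLs below are placeholders, not part of this script), several pages can be fetched concurrently and the results collected with run():

from requests_html import AsyncHTMLSession

asession = AsyncHTMLSession()

async def get_python():
    # await-able counterpart of session.get()
    r = await asession.get('https://python.org/')
    return r.html.find('title', first=True).text

async def get_pypi():
    r = await asession.get('https://pypi.org/')
    return r.html.find('title', first=True).text

# run() schedules the coroutines concurrently and returns their results
print(asession.run(get_python, get_pypi))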

Summary

That's all for today. This article only gave a brief introduction to using requests_html; the library offers many more functions and methods for fetching and processing data quickly and conveniently.
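For example, besides find(), the HTML object exposes xpath() for XPath queries and a parse-style search() for template extraction. A minimal sketch (python.org is again just a stand-in target):

from requests_html import HTMLSession

session = HTMLSession()
r = session.get('https://python.org/')

# xpath() accepts XPath expressions as an alternative to CSS selectors
print(r.html.xpath('//a/@href')[:5])

# search() pulls text out of the page with a parse-style template
print(r.html.search('Python is a {} language')[0])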