文章目錄
- 前言
- 一、requests_html是什麼?
- 二、使用步驟
- 總結
前言
隨著技術的不斷發展,對爬蟲這門技術的需求也越來越大,很多人都開始學習爬蟲,本文就介紹了作者自己爬蟲學習的基礎内容。
提示:以下是本篇文章正文内容,下面案例可供參考
一、requests_html是什麼?
requests_html是基于requests 的一種優化模組,該模組是為了簡化爬蟲、方便擷取網頁和資源而建立的,它和requests出自同一個作者。
二、使用步驟
python自學爬蟲
requests_html爬取百度搜尋本兮圖檔
import random,os
from requests_html import HTMLSession, HTML, AsyncHTMLSession
from pprint import pprint
import os,time,re
import requests
class download():
    """Save a resource to the current user's Desktop.

    name: target file name (the caller may embed sub-directory separators).
    url:  for download_txt this is the text content to append;
          for download_picture it is the remote resource URL to fetch.
    """
    def __init__(self, name, url):
        self.name = name
        self.url = url

    def download_txt(self):
        """Append self.url as UTF-8 text to <Desktop>//<self.name>."""
        desktop_path = os.path.join(os.path.expanduser('~'), "Desktop")
        names = desktop_path + "//" + self.name
        # 'a' mode: repeated calls accumulate content instead of overwriting.
        # (no explicit close needed — the `with` block handles it)
        with open(names, 'a', encoding="utf-8") as f:
            f.write(self.url)

    def download_picture(self):
        """Stream-download self.url to the Desktop, printing progress every ~2s."""
        headers = {'Proxy-Connection': 'keep-alive'}
        r = requests.get(self.url, stream=True, headers=headers)
        print(r)
        # Content-Length may be absent on chunked responses; fall back to 0
        # and skip the percentage computation in that case (the original
        # KeyError'd / ZeroDivisionError'd here).
        length = float(r.headers.get('content-length', 0) or 0)
        desktop_path = os.path.join(os.path.expanduser('~'), "Desktop")
        names = desktop_path + "//" + self.name
        count = 0
        count_tmp = 0
        time1 = time.time()
        # `with` guarantees the file is closed even if the download aborts.
        with open(names, 'wb') as f:
            for chunk in r.iter_content(chunk_size=512):
                if chunk:
                    f.write(chunk)
                    count += len(chunk)
                    if time.time() - time1 > 2:
                        # BUG FIX: original printed the undefined global `name`
                        # (NameError); it must be self.name.
                        p = count / length * 100 if length else 0.0
                        speed = (count - count_tmp) / 1024 / 2
                        count_tmp = count
                        print(self.name + ': ' + formatFloat(p) + '%' + ' Speed: ' + formatFloat(speed) + 'KB/S')
                        time1 = time.time()
def formatFloat(num):
    """Render *num* as a string with exactly two decimal places."""
    return f"{num:.2f}"
class BenXiTest:
    """Scrape image URLs from a Bing image-search result page via requests_html."""

    def __init__(self, url):
        self.start_url = url
        self.session = HTMLSession()        # synchronous session
        self.aSession = AsyncHTMLSession()  # async session (created but unused below)
        # Candidate user-agent strings; a random one is chosen per instance
        # so repeated runs don't always present the same browser fingerprint.
        users = {
            1 : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0',
            2 : 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            3 : 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            4 : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
            5 : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
        }
        # BUG FIX: the original computed "users" + str(random.sample(users.keys(), 1)),
        # which set self.headers to the literal string "users[3]" instead of a real
        # user-agent value (and random.sample on dict keys raises TypeError on
        # Python 3.11+). Pick an actual value from the pool instead.
        self.headers = users[random.choice(list(users))]

    def get_response(self):
        """GET self.start_url with the chosen user-agent; return the HTML object."""
        start_url = self.start_url
        r = self.session.get(start_url, headers={'user-agent': self.headers})
        print("網頁狀态", r)
        return r.html

    def fast_get_urls(self):
        """Quickly collect the links found on the page."""
        html = self.get_response()
        # html.links: every href found in <a> tags (possibly relative).
        urls = html.links
        # pprint(urls)
        # html.absolute_links: the same hrefs resolved to absolute URLs.
        absolute_urls = html.absolute_links
        pprint(absolute_urls)

    def get_data(self):
        """Extract image URLs (src / data-src of <div> <img>) and download each one.

        NOTE: html.find('div img') is a CSS selector, not XPath as the
        original docstring claimed.
        """
        html = self.get_response()
        a_list = html.find('div img')
        count = 0
        # First pass: eagerly-loaded images via the plain `src` attribute.
        for a_li in a_list:
            picture_url = a_li.attrs.get('src')
            print(picture_url)
            if picture_url is None:
                continue
            # Skip non-https values (data: URIs, relative paths, ...).
            if picture_url.find('https') == -1:
                continue
            download("//本兮//本兮高清圖檔" + str(count) + ".jpg", picture_url).download_picture()
            count += 1
        print("src已經完成!! \n")
        # Second pass: lazily-loaded images exposed through `data-src`.
        for a_li in a_list:
            picture_url = a_li.attrs.get('data-src')
            print(picture_url)
            if picture_url is None:
                continue
            if picture_url.find('https') == -1:
                continue
            download("//本兮//本兮高清圖檔" + str(count) + ".jpg", picture_url).download_picture()
            count += 1
        print("data-src已完成!!!")

    def load_js(self):
        """Print the page source; render() (via pyppeteer) can execute its JS.

        The first call to render() downloads a Chromium binary.
        """
        html = self.get_response()
        # html.render(wait=3)  # uncomment to execute the page's JavaScript
        print(html.html)
if __name__ == '__main__':
    # Entry point: scrape a Bing image search for "本兮" wallpapers.
    target_url = 'https://cn.bing.com/images/search?q=%e6%9c%ac%e5%85%ae%e5%9b%be%e7%89%87%e5%a3%81%e7%ba%b8%e9%ab%98%e6%b8%85&qpvt=%e6%9c%ac%e5%85%ae%e5%9b%be%e7%89%87%e5%a3%81%e7%ba%b8%e9%ab%98%e6%b8%85&form=IGRE&first=1&tsc=ImageBasicHover'
    BenXiTest(target_url).get_data()
該處使用的 url 即為網絡請求所要擷取資料的位址。
總結
以上就是今天要講的内容,本文僅僅簡單介紹了requests_html的使用,而requests_html提供了大量能使我們快速便捷地處理資料的函數和方法。