1.xpath解析基礎01
xpath解析:最常用且最高效的一種解析方式,通用性
xpath解析原理:
1.執行個體化etree對象,且需要将被解析的頁面源碼資料加載到該對象中。
2.調用etree對象中的xpath方法結合着xpath表達式實作标簽的定位和内容的捕獲
環境的安裝:
pip install lxml
如何執行個體化一個etree對象
1.将本地的html源碼資料加載到etree對象中
etree.parse(filePath) #傳回一個etree對象
2.可以将從網際網路上擷取的源碼資料加載到該對象中
etree.HTML('page_text')
xpath('xpath表達式')
2.xpath解析基礎02
xpath表達式:
1./:表示是從根節點開始定位,表示的是一個層級
2.//:表示的是多個層級。可以表示從任意位置開始定位
3.屬性定位
tag[@attrName="attrValue"] #通用格式
4.索引定位,索引是從1開始的
#找到div下的p标簽
r = tree.xpath('//div[@class="song"]/p')
#拿到div下第三個p标簽
r = tree.xpath('//div[@class="song"]/p[3]')
5.取文本
/text() : 擷取的是标簽中直系的文本内容
//text(): 擷取标簽中非直系的文本内容(所有的文本内容)
#取文本
r = tree.xpath('//div[@class="tang"]//li/a[5]/text()') #傳回清單
r = tree.xpath('//div[@class="tang"]//li/a[5]/text()')[0] #傳回字元串
6.取屬性
#取屬性
r = tree.xpath('//div[@class="tang"]/img/@src')
7.代碼
from lxml import etree

# Demo: basic xpath selection with lxml on a local HTML file.
# Load the local html document into an etree object (instantiates the parser).
# NOTE(review): './text.html' may be a typo for './test.html' — confirm the
# actual fixture filename before running.
tree = etree.parse('./text.html')
# Call the xpath method.
# Start from the root node; each '/' descends exactly one hierarchy level.
r = tree.xpath('/html/head/title')
# r = tree.xpath('/html//title')
# r = tree.xpath('//title')
# Locate the div whose class is "song" ('@' is required for attribute tests).
# r = tree.xpath('//div[@class="song"]')
# Find the p tags directly under that div.
# r = tree.xpath('//div[@class="song"]/p')
# Get the third p tag under the div (xpath indices start at 1, not 0).
# r = tree.xpath('//div[@class="song"]/p[3]')
# Get direct text content.
# r = tree.xpath('//div[@class="tang"]//li/a[5]/text()')
# Get ALL (non-direct) text content inside the tag.
# Fixed: the inner string must use double quotes, the original nested
# single quotes ('...@class = 'tang'...') were a SyntaxError.
# r = tree.xpath('//div[@class="tang"]//text()')
# Get an attribute value (same nested-quote fix as above).
r = tree.xpath('//div[@class="tang"]/img/@src')
3.xpath實戰-58二手房
import requests
from lxml import etree

# Goal: scrape listing titles from 58.com's second-hand housing page
# and persist them to a text file, one title per line.
url = 'https://bj.58.com/ershoufang/'
headers = {
    'User-Agent':"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"
}
page_text = requests.get(url = url,headers = headers).text
# Parse the fetched page.
tree = etree.HTML(page_text)
# li_list stores the li tag elements, one per listing.
# NOTE(review): the site's markup may have changed since this was written;
# verify the class name against the live page.
li_list = tree.xpath('//ul[@class="house-list-wrap"]/li')
# 'with' guarantees the file is closed even if an exception is raised
# (the original opened the file and never closed it).
with open('58.text','w',encoding='utf-8') as fp:
    for li in li_list:
        # Fixed: original xpath used './dv[2]' — 'dv' is not an HTML tag,
        # 'div' was intended.
        title = li.xpath('./div[2]/h2/a/text()')[0]
        print(title)
        fp.write(title+'\n')
4.xpath解析案例-4K圖檔解析下載下傳
from typing import Any, Union
import requests
from lxml import etree
import os

# Goal: scrape image src/alt attributes from pic.netbian.com and download
# every image into ./picLibs.

# Create the output folder on first run.
if not os.path.exists('./picLibs'):
    os.mkdir('./picLibs')

url = "https://pic.netbian.com/4kmeinv/"
headers = {
    'User-Agent':"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"
}
# .content returns raw bytes; lxml detects the page's declared charset itself,
# which avoids mojibake from requests guessing the wrong text encoding.
page_text = requests.get(url=url,headers=headers).content
# Parse out the src attribute and the alt attribute of each thumbnail.
tree = etree.HTML(page_text)
li_list = tree.xpath("//div[@class='slist']/ul/li")
for li in li_list:
    img_src = 'https://pic.netbian.com'+ li.xpath('./a/img/@src')[0]
    img_name = li.xpath('./a/img/@alt')[0] + '.jpg'
    #print(img_name,img_src)
    # Fetch the image bytes and persist them.
    img_data = requests.get(url=img_src,headers=headers).content
    # Fixed: original built the path as 'picLibs'+img_name (no separator),
    # which wrote 'picLibs<name>.jpg' into the CWD instead of the folder.
    img_path = os.path.join('picLibs', img_name)
    with open(img_path,'wb') as fp:
        fp.write(img_data)
    print(img_name+"下載下傳成功")
5.xpath解析案例-全國城市名稱爬取
基礎版:
import requests
from lxml import etree

# Goal: collect every city name (hot cities + all cities) from the
# air-quality history site into one list.
headers = {
    'User-Agent':"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"
}
url = 'http://www.aqistudy.cn/historydata/'
page_text = requests.get(url = url,headers=headers).text
tree = etree.HTML(page_text)

all_city_names = []
# Hot-city names: li elements directly under the "bottom" div's ul;
# each li's first anchor text is the city name.
hot_li_list = tree.xpath('//div[@class="bottom"]/ul/li')
for hot_li in hot_li_list:
    all_city_names.append(hot_li.xpath('./a/text()')[0])
# All-city names: li elements under the second div of the same ul.
nationwide_li_list = tree.xpath('//div[@class="bottom"]/ul/div[2]/li')
for city_li in nationwide_li_list:
    all_city_names.append(city_li.xpath('./a/text()')[0])
print(all_city_names,len(all_city_names))
改良版:
import requests
from lxml import etree

# Improved version: fetch hot-city and all-city names in a SINGLE xpath
# query instead of two separate loops.
# (The commented-out duplicate of the basic version was removed — it is
# preserved verbatim in the "基礎版" section above.)
headers = {
    'User-Agent':"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"
}
url = 'http://www.aqistudy.cn/historydata/'
page_text = requests.get(url = url,headers=headers).text
tree = etree.HTML(page_text)
# Select both anchor groups at once using xpath's union operator '|':
#   //div[@class="bottom"]/ul/li/a          -> hot-city anchors
#   //div[@class="bottom"]/ul/div[2]/li/a   -> all-city anchors
a_list = tree.xpath('//div[@class="bottom"]/ul/li/a | //div[@class="bottom"]/ul/div[2]/li/a')
all_city_names=[]
for a in a_list:
    # './text()' returns a list; the first entry is the city name.
    city_name = a.xpath('./text()')[0]
    all_city_names.append(city_name)
print(all_city_names,len(all_city_names))
本文章為B站UP主Python網紅Alex所講的《Python超強爬蟲8天速成》課後筆記。不做商用,請勿打賞!