1.xpath解析基础01
xpath解析:最常用且最高效的一种解析方式,通用性强
xpath解析原理:
1.实例化etree对象,且需要将被解析的页面源码数据加载到该对象中。
2.调用etree对象中的xpath方法结合着xpath表达式实现标签的定位和内容的捕获
环境的安装:
pip install lxml
如何实例化一个etree对象
1.将本地的html源码数据加载到etree对象中
etree.parse(filePath) #返回一个etree对象
2.可以将从互联网上获取的源码数据加载到该对象中
etree.HTML('page_text')
xpath('xpath表达式')
2.xpath解析基础02
xpath表达式:
1./:表示是从根节点开始定位,表示的是一个层级
2.//:表示的是多个层级。可以表示从任意位置开始定位
3.属性定位
tag[@attrName="attrValue"] #通用格式
4.索引定位,索引是从1开始的
#找到div下的p标签
r = tree.xpath('//div[@class="song"]/p')
#拿到div下第三个p标签
r = tree.xpath('//div[@class="song"]/p[3]')
5.取文本
/text() : 获取的是标签中直系的文本内容
//text(): 获取标签中非直系的文本内容(所有的文本内容)
#取文本
r = tree.xpath('//div[@class="tang"]//li/a[5]/text()') #返回列表
r = tree.xpath('//div[@class="tang"]//li/a[5]/text()')[0] #返回字符串
6.取属性
#取属性
r = tree.xpath("//div[@class='tang']/img/@src")
7.代码
from lxml import etree

# Demo: locate nodes in a local HTML file with XPath.
# NOTE(review): etree.parse defaults to the strict XML parser; real-world HTML
# usually needs etree.parse('./text.html', etree.HTMLParser()) — confirm the
# test file is well-formed XML before relying on the default.
tree = etree.parse('./text.html')
# Absolute path from the document root; each '/' descends one level.
r = tree.xpath('/html/head/title')
# '//' matches across any number of levels, from anywhere:
# r = tree.xpath('/html//title')
# r = tree.xpath('//title')
# Attribute predicates need '@' before the attribute name
# (the original examples wrote [class="song"], which matches a child
# element named 'class', not the attribute):
# r = tree.xpath('//div[@class="song"]')
# p tags directly under that div:
# r = tree.xpath('//div[@class="song"]/p')
# Third p tag — XPath indices start at 1:
# r = tree.xpath('//div[@class="song"]/p[3]')
# Direct text of the fifth <a>:
# r = tree.xpath('//div[@class="tang"]//li/a[5]/text()')
# All descendant text of the div:
# r = tree.xpath('//div[@class="tang"]//text()')
# Take an attribute value. Fixed quoting: the original nested single quotes
# inside a single-quoted string, which is a SyntaxError.
r = tree.xpath("//div[@class='tang']/img/@src")
3.xpath实战-58二手房
import requests
from lxml import etree

# Goal: scrape listing titles from 58.com's second-hand housing page.
url = 'https://bj.58.com/ershoufang/'
headers = {
    'User-Agent':"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"
}
page_text = requests.get(url = url,headers = headers).text
# Parse the fetched page.
tree = etree.HTML(page_text)
# Each <li> under the listing <ul> holds one house entry.
li_list = tree.xpath('//ul[@class="house-list-wrap"]/li')
# 'with' guarantees the file is closed even if an XPath lookup raises
# (the original never closed fp).
with open('58.text', 'w', encoding='utf-8') as fp:
    for li in li_list:
        # BUG FIX: the original path was './dv[2]/...' — 'dv' is a typo
        # for 'div', so the lookup returned [] and [0] raised IndexError.
        title = li.xpath('./div[2]/h2/a/text()')[0]
        print(title)
        fp.write(title + '\n')
4.xpath解析案例-4K图片解析下载
from typing import Any, Union  # NOTE(review): unused in this snippet; kept to avoid breaking other parts of the file
import requests
from lxml import etree
import os

# Goal: download every 4K wallpaper thumbnail on the listing page.
# Create the target folder once, if it does not already exist.
if not os.path.exists('./picLibs'):
    os.mkdir('./picLibs')
url = "https://pic.netbian.com/4kmeinv/"
headers = {
    'User-Agent':"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"
}
# .content (raw bytes) lets lxml honor the page's declared charset itself,
# avoiding mojibake in the alt text.
page_text = requests.get(url=url, headers=headers).content
# Parse out each <li>: we need the img's src and alt attributes.
tree = etree.HTML(page_text)
li_list = tree.xpath("//div[@class='slist']/ul/li")
for li in li_list:
    # src is site-relative; prepend the host.
    img_src = 'https://pic.netbian.com' + li.xpath('./a/img/@src')[0]
    img_name = li.xpath('./a/img/@alt')[0] + '.jpg'
    # Fetch the binary image data for persistence.
    img_data = requests.get(url=img_src, headers=headers).content
    # BUG FIX: the original concatenated 'picLibs'+img_name with no path
    # separator, so files were written beside — not inside — the folder
    # created above.
    img_path = os.path.join('picLibs', img_name)
    with open(img_path, 'wb') as fp:
        fp.write(img_data)
    print(img_name + "下载成功")
5.xpath解析案例-全国城市名称爬取
基础版:
import requests
from lxml import etree

# Basic version: collect the hot-city names, then every city name nationwide,
# from the aqistudy history page.
headers = {
    'User-Agent':"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"
}
url = 'http://www.aqistudy.cn/historydata/'
page_text = requests.get(url = url,headers=headers).text
tree = etree.HTML(page_text)
# Hot-city <li> nodes sit directly under the bottom div's <ul>.
host_li_list = tree.xpath('//div[@class="bottom"]/ul/li')
# First the hot cities ...
all_city_names = [li.xpath('./a/text()')[0] for li in host_li_list]
# ... then the full nationwide list, which lives under ul/div[2].
city_names_list = tree.xpath('//div[@class="bottom"]/ul/div[2]/li')
all_city_names.extend(li.xpath('./a/text()')[0] for li in city_names_list)
print(all_city_names,len(all_city_names))
改良版:
import requests
from lxml import etree

# Improved version: a single XPath query using the union operator '|'
# grabs both the hot-city and all-city <a> tags in one pass, replacing
# the two separate loops of the basic version.
# (The dead commented-out copy of the basic version that followed the
# imports here was removed — the live basic version appears earlier in
# this file.)
headers = {
    'User-Agent':"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"
}
url = 'http://www.aqistudy.cn/historydata/'
page_text = requests.get(url = url,headers=headers).text
tree = etree.HTML(page_text)
# //div[@class="bottom"]/ul/li/a          -> hot-city links
# //div[@class="bottom"]/ul/div[2]/li/a   -> nationwide city links
a_list = tree.xpath('//div[@class="bottom"]/ul/li/a | //div[@class="bottom"]/ul/div[2]/li/a')
all_city_names = []
for a in a_list:
    # Each <a>'s direct text is the city name.
    city_name = a.xpath('./text()')[0]
    all_city_names.append(city_name)
print(all_city_names,len(all_city_names))
本文章为B站UP主Python网红Alex所讲的《Python超强爬虫8天速成》课后笔记。不做商用,请勿打赏!