天天看点

Python爬虫xpath解析基础和项目实战

1.xpath解析基础01

xpath解析:最常用且最高效的一种解析方式,通用性强

xpath解析原理:

1.实例化etree对象,且需要将被解析的页面源码数据加载到该对象中。

2.调用etree对象中的xpath方法结合着xpath表达式实现标签的定位和内容的捕获

环境的安装:

pip install lxml
           

如何实例化一个etree对象

1.将本地的html源码数据加载到etree对象中

etree.parse(filePath)  # returns an etree object built from the local file
           

2.可以将从互联网上获取的源码数据加载到该对象中

etree.HTML(page_text)  # page_text is the variable holding the fetched page source, not a literal string
           
xpath('xpath表达式')
           

2.xpath解析基础02

xpath表达式:

1./:表示是从根节点开始定位,表示的是一个层级

2.//:表示的是多个层级。可以表示从任意位置开始定位

3.属性定位

tag[@attrName="attrValue"]   #通用格式
           

4.索引定位,索引是从1开始的

# All p tags directly under the div whose class attribute is "song".
# The "@" is required: without it the predicate tests a child element named "class".
r = tree.xpath('//div[@class="song"]/p')
# The third p tag under that div (XPath indexes start at 1).
r = tree.xpath('//div[@class="song"]/p[3]')
           

5.取文本

/text() : 获取的是标签中直系的文本内容

//text(): 获取标签中非直系的文本内容(所有的文本内容)

# Extract text. The "@" marks class as an attribute in the predicate.
r = tree.xpath('//div[@class="tang"]//li/a[5]/text()')  # returns a list
r = tree.xpath('//div[@class="tang"]//li/a[5]/text()')[0]  # first item of that list: a string
           

6.取属性

# Extract an attribute value: the src of the img under the div.
# Double quotes inside the single-quoted literal keep the string intact.
r = tree.xpath('//div[@class="tang"]/img/@src')
           

7.代码

from lxml import etree

# Load the local HTML document into an etree object.
# NOTE: etree.parse() uses lxml's XML parser by default, so the file must be
# well-formed; pass etree.HTMLParser() as the second argument for lenient
# HTML parsing.
tree = etree.parse('./text.html')

# Call xpath() on the tree. A leading "/" starts at the root node and each
# further "/" descends exactly one level.
r = tree.xpath('/html/head/title')
# "//" spans any number of levels and may start from any position:
# r = tree.xpath('/html//title')
# r = tree.xpath('//title')
# Locate the div whose class attribute is "song":
# r = tree.xpath('//div[@class="song"]')
# All p tags directly under that div:
# r = tree.xpath('//div[@class="song"]/p')
# The third p tag under that div (XPath indexes start at 1):
# r = tree.xpath('//div[@class="song"]/p[3]')
# Direct text of the matched a tag:
# r = tree.xpath('//div[@class="tang"]//li/a[5]/text()')
# All text under the div, including the text of nested tags:
# r = tree.xpath('//div[@class="tang"]//text()')
# Attribute value: the src of the img under the div.
r = tree.xpath('//div[@class="tang"]/img/@src')
           

3.xpath实战-58二手房

import requests
from lxml import etree

# Goal: scrape the listing titles from the 58.com second-hand housing page.
url = 'https://bj.58.com/ershoufang/'
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"
}
page_text = requests.get(url=url, headers=headers).text

# Parse the page; li_list holds the <li> element objects of the listing.
tree = etree.HTML(page_text)
li_list = tree.xpath('//ul[@class="house-list-wrap"]/li')

# The with-block closes the output file even if a write fails.
with open('58.text', 'w', encoding='utf-8') as fp:
    for li in li_list:
        # "./" makes the expression relative to the current <li>.
        titles = li.xpath('./div[2]/h2/a/text()')
        if not titles:
            # Skip an <li> without a matching title instead of raising IndexError.
            continue
        title = titles[0]
        print(title)
        fp.write(title + '\n')
           

4.xpath解析案例-4K图片解析下载

from typing import Any, Union
import requests
from lxml import etree
import os
# Create the output folder; exist_ok avoids an error when it already exists.
os.makedirs('./picLibs', exist_ok=True)

url = "https://pic.netbian.com/4kmeinv/"
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"
}
# The raw response bytes are handed to the parser
# (presumably so lxml decodes the page itself - TODO confirm).
page_text = requests.get(url=url, headers=headers).content

# Parse out each image's src attribute (URL path) and alt attribute (name).
tree = etree.HTML(page_text)
li_list = tree.xpath("//div[@class='slist']/ul/li")
for li in li_list:
    img_src = 'https://pic.netbian.com' + li.xpath('./a/img/@src')[0]
    img_name = li.xpath('./a/img/@alt')[0] + '.jpg'
    # Request the image bytes and persist them to disk.
    img_data = requests.get(url=img_src, headers=headers).content
    # Join with the folder so the file is written inside picLibs/ rather than
    # next to it under a "picLibs"-prefixed name.
    img_path = os.path.join('./picLibs', img_name)
    with open(img_path, 'wb') as fp:
        fp.write(img_data)
    print(img_name + "下载成功")
           

5.xpath解析案例-全国城市名称爬取

基础版:

import requests
from lxml import etree
# Request headers carrying a desktop Firefox User-Agent.
headers = {
    'User-Agent':"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"
}
url = 'http://www.aqistudy.cn/historydata/'
response = requests.get(url=url, headers=headers)
page_text = response.text
tree = etree.HTML(page_text)

# <li> nodes of the hot-city list, then of the all-city list.
host_li_list = tree.xpath('//div[@class="bottom"]/ul/li')
city_names_list = tree.xpath('//div[@class="bottom"]/ul/div[2]/li')

# Collect the link text of every <li>: hot cities first, then all cities.
all_city_names = [
    li.xpath('./a/text()')[0]
    for li in host_li_list + city_names_list
]
print(all_city_names,len(all_city_names))
           

改良版:

import requests
from lxml import etree
# headers = {
#     'User-Agent':"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"
# }
# url = 'http://www.aqistudy.cn/historydata/'
# page_text = requests.get(url = url,headers=headers).text
# tree = etree.HTML(page_text)
# host_li_list = tree.xpath('//div[@class="bottom"]/ul/li')
#
# all_city_names = []
# #解析到热门城市的城市名称
# for li in host_li_list:
#     host_city_name = li.xpath('./a/text()')[0]
#     all_city_names.append(host_city_name)
# #解析的是全国城市的名称:
# city_names_list = tree.xpath('//div[@class="bottom"]/ul/div[2]/li')
# for li in city_names_list:
#     city_names = li.xpath('./a/text()')[0]
#     all_city_names.append(city_names)
# print(all_city_names,len(all_city_names))

# Request headers carrying a desktop Firefox User-Agent.
headers = {
    'User-Agent':"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"
}
url = 'http://www.aqistudy.cn/historydata/'
response = requests.get(url=url, headers=headers)
page_text = response.text
tree = etree.HTML(page_text)

# One expression selects the <a> tags of both lists; "|" is the XPath union operator.
#   //div[@class="bottom"]/ul/li/a          hot-city links
#   //div[@class="bottom"]/ul/div[2]/li/a   all-city links
a_list = tree.xpath('//div[@class="bottom"]/ul/li/a | //div[@class="bottom"]/ul/div[2]/li/a')

# The first text node of each link is the city name.
all_city_names = [a.xpath('./text()')[0] for a in a_list]
print(all_city_names,len(all_city_names))
           

本文章为B站UP主Python网红Alex所讲的《Python超强爬虫8天速成》课后笔记。不做商用,请勿打赏!