from bs4 import BeautifulSoup
对象的实例化:
1.本地html文件加载
fp=open('./test.html','r',encoding='utf-8')
soup=BeautifulSoup(fp,'lxml')
2.互联网获取页面加载
page_text=response.text
soup=BeautifulSoup(page_text,'lxml')
提供用于数据解析方法和属性:
1.soup.tagname:返回文档中第一次出现对应的标签
2.soup.find('tagname'):等同于soup.tagname,返回第一个符合要求的标签;也可以按属性定位,例如:
soup.find('div',class_='song')
3.soup.find_all('tagname')返回符合要求的所有标签
4.soup.select('选择器'):参数是CSS选择器(id选择器、class选择器、标签选择器等),返回符合要求的所有标签组成的列表
soup.select('.tang>ul>li>a'):>表示的是一个层级
soup.select('.tang>ul a'):空格表示的是多个层级
5.获取标签中的文本数据:
soup.a.text/string/get_text()
其中text、get_text()获得该标签下的全部文本内容(包括子孙标签中的文本),string只获取该标签的直系文本内容(标签含有多个子节点时返回None)
6.获取标签属性:
soup.a['href']
例子如下:
#需求:爬取三国演义所有章节标题及内容
from bs4 import BeautifulSoup
import requests
if __name__ == '__main__':
    # Script entry point: download the table of contents of "Romance of the
    # Three Kingdoms" from shicimingju.com, follow every chapter link, and
    # write "<title>:<chapter text>" for each chapter to ./sanguo.txt.
    from urllib.parse import urljoin

    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
    }
    url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
    # Seconds to wait for a response; without a timeout a stalled
    # connection would block the script indefinitely.
    timeout = 10

    # Fetch the table-of-contents page; abort early on an HTTP error because
    # nothing else can be done without the chapter list.
    index_response = requests.get(url=url, headers=headers, timeout=timeout)
    index_response.raise_for_status()
    # Hand BeautifulSoup the raw bytes so it can detect the page encoding
    # itself. `response.text` falls back to ISO-8859-1 when the Content-Type
    # header carries no charset, which garbles Chinese text.
    soup = BeautifulSoup(index_response.content, 'lxml')
    li_title = soup.select('.book-mulu>ul>li')

    # The context manager guarantees the file is flushed and closed even if
    # a later request or parse step raises.
    with open('./sanguo.txt', 'w', encoding='utf-8') as fp:
        for i in li_title:
            link = i.a
            # Skip list items that carry no usable chapter link.
            if link is None or not link.get('href'):
                continue

            # `.string` is None when the <a> holds more than one child node;
            # fall back to the concatenated text in that case.
            title = link.string if link.string is not None else link.get_text()

            # urljoin resolves both site-relative and absolute hrefs.
            detail_url = urljoin(url, link['href'])
            try:
                detail_response = requests.get(
                    url=detail_url, headers=headers, timeout=timeout
                )
                detail_response.raise_for_status()
            except requests.RequestException as exc:
                # One failed chapter should not abort the whole crawl.
                print(title, '爬取失败', exc)
                continue

            detail_soup = BeautifulSoup(detail_response.content, 'lxml')
            div_tag = detail_soup.find('div', class_='chapter_content')
            if div_tag is None:
                # Page layout differs from what is expected; report and move on
                # instead of crashing on `None.text`.
                print(title, '爬取失败: 未找到章节内容')
                continue

            content = div_tag.text
            fp.write(title + ":" + content + '\n')
            print(title, '爬取成功')