環境:python3.5 bs4 lxml這是需要安裝的
使用urllib子產品來通路頁面 bs4解析頁面,存到*.txt檔案中
#-*- coding:utf-8 -*-
import urllib.request
import time,os
import numpy as np
from bs4 import BeautifulSoup
hds=[{'User-Agent': 'Mozilla/5.0 (Windows; U;Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}, \
{'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'}, \
{'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'}]
def search(value,page):
url='http://so.gushiwen.org/search.aspx?type=author&page='+urllib.request.quote(str(page))+'&value='+urllib.request.quote(value)
#拼接url使用urllib.request.quote()把漢字和數字拼接到url裡
time.sleep(np.random.rand() * 3)#随機等一段時間再進行通路
try:
req=urllib.request.Request(url,headers=hds[page%3])
html_resource=urllib.request.urlopen(req)
# print(html_resource.info()) 輸出伺服器資訊
plain_text=html_resource.read().decode()
return plain_text
except (urllib.request.HTTPError,urllib.request.URLError) as e:
print(e)
def rmline(str):
#這是一個去除字元串中的空行的函數
data=''
for line in str:
l=line.strip()
if len(l)!=0:
data+=l
return data
def parseHtml(html,page):
# soup=BeautifulSoup(html,'lxml')
# title=soup.title#擷取網頁title
# title_name=title.name#擷取title的名字 也就是标簽的名字
# title_string=title.string#擷取title的值
# title_parent=soup.title.parent#title的父對象
#
# soup.p#擷取第一個p标簽
# soup.p['class']#擷取第一個p标簽裡面的class值
#
# soup.find_all('a')#找到所有的a标簽
# soup.find(id='***')#擷取id為***的标簽
soup=BeautifulSoup(html,'lxml')
with open('poem.txt',mode='a',encoding='utf-8') as f:
f.write('第'+str(page)+'頁'+'\n')
if page==1:
for poem in soup.find_all("div", "sons")[1:]:
poem_title = poem.p
poem_title_str = poem_title.a.string
poem_author = poem_title.next_sibling.next_sibling
poem_author_str = poem_author.span.string
poem_value = poem_author.next_sibling.next_sibling
poem_value_str = list(poem_value.strings)[0]
# 注意:.strings擷取的是generator它是疊代的。直接取值不好取,轉化成list再取值。
with open('poem.txt', mode='a', encoding='utf-8') as f:
f.write('題目: ' + rmline(poem_title_str)+'\n')
f.write('作者: ' + poem_author_str+'\n')
f.write('内容: ' + poem_value_str+'\n')
f.write('-----------------------------------------'+'\n')
else:
for poem in soup.find_all("div", "sons"):
poem_title = poem.p
poem_title_str = poem_title.a.string
poem_author = poem_title.next_sibling.next_sibling
poem_author_str = poem_author.span.string
poem_value = poem_author.next_sibling.next_sibling
poem_value_str = list(poem_value.strings)[0]
# 注意:.strings擷取的是generator它是疊代的。直接取值不好取,轉化成list再取值。
with open('poem.txt', mode='a', encoding='utf-8') as f:
f.write('題目: ' + rmline(poem_title_str)+'\n')
f.write('作者: ' + poem_author_str+'\n')
f.write('内容: ' + poem_value_str+'\n')
f.write('-----------------------------------------'+'\n')
if __name__=="__main__":
filename='poem.txt'
if os.path.exists(filename):
os.remove(filename)
for page in range(1,4):
#1,4是指頁數,也就是下載下傳前三頁資料。下面的作者名字可以随意改,或者寫詩的名字也可以
parseHtml(search('納蘭性德',page), page)