系統實作流程如下:
此次項目資料采集部分使用的Python庫有:requests,xlwt,json,matplotlib,tkinter,os,re,time
import datetime
import os  # operating-system interfaces
import re
import socket
import time
import urllib.parse
import urllib.request  # Request / urlopen used by the crawler
from urllib import error

import lxml  # HTML parser backend used by BeautifulSoup
import numpy as np
import pymysql
from bs4 import BeautifulSoup
資料爬取:
def get_html_text(index_url, page_url, last_page=79):
    """Crawl the paginated news listing, then download and store each article.

    Phase 1 fetches listing pages 1..last_page and appends every link's
    title and absolute URL, plus every date string found in the listing
    markup, to the module-level lists ``titles``, ``urls`` and ``times``.
    Phase 2 downloads each collected article, concatenates the text of its
    ``<p>`` elements and passes the record to ``txt()``.

    Args:
        index_url: Listing URL prefix; the page number and a fixed query
            suffix are appended to it.
        page_url: Base URL against which relative article links are resolved.
        last_page: Number of the last listing page to fetch (inclusive).
            Defaults to 79, the range that used to be hard-coded.

    Relies on the module-level names ``headers``, ``urls``, ``titles``,
    ``times`` and ``txt``.
    """
    for page_no in range(1, last_page + 1):
        _collect_listing_page(index_url, page_url, page_no)
    _store_articles()


# Dates such as 2021-3-7 or 2021-03-07 appearing anywhere in the listing markup.
_DATE_PATTERN = re.compile(r'\d{4}-\d{1,2}-\d{1,2}')


def _collect_listing_page(index_url, page_url, page_no):
    """Fetch one listing page and append its links and dates to the global lists."""
    url = index_url + str(page_no) + '&showtj=&showhot=&author=&key=&code='
    request = urllib.request.Request(url, headers=headers)
    try:
        # The with-block closes the HTTP response even if reading/decoding fails.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode('utf-8')
    except urllib.error.URLError as e:
        if hasattr(e, 'reason'):
            print("連接配接失敗!", e.reason)
        return

    soup = BeautifulSoup(html, 'lxml')
    # The news entries live in <ul class="cpquery"> elements.
    result = soup.find_all('ul', class_="cpquery")
    listing_markup = str(result)
    download_soup = BeautifulSoup(listing_markup, 'lxml')

    for link in download_soup.find_all('a'):
        titles.append(link.get_text())
        urls.append(urllib.parse.urljoin(page_url, link.get('href')))

    for date_text in _DATE_PATTERN.findall(listing_markup):
        times.append(datetime.datetime.strptime(date_text, '%Y-%m-%d'))


def _store_articles():
    """Download every collected article and pass it to txt() for storage."""
    # zip() stops at the shortest list, so a listing that produced fewer
    # dates than links can no longer raise IndexError on times[i].
    for i, (url, title, pubdate) in enumerate(zip(urls, titles, times)):
        try:
            request = urllib.request.Request(url, headers=headers)
            with urllib.request.urlopen(request) as response:
                page_info = response.read().decode('utf-8')
        except (OSError, UnicodeDecodeError):
            # Best effort: report the failure and move on to the next URL.
            print("解析錯誤!")
            continue

        soup = BeautifulSoup(page_info, 'lxml')
        text = "".join(p.get_text() + "\n" for p in soup.select('p'))
        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        txt(url, title, text, pubdate, now)
        print(i)
        print("success!")
刪除資料庫中的舊資料:
def delete_data():
    """Delete this site's previously imported news rows from ``filedata_bak``.

    Removes the rows whose ``city`` is '濱州市(新聞)' and whose ``datakinds``
    is 0.  On a database error the exception is printed and the transaction
    is rolled back.  The cursor and the connection are always closed.
    """
    # Connect to the database.
    db = pymysql.connect(
        host='XXXX.XXXX.XXXX.XXXX',  # database host address
        port=3306,  # database port
        user='XXXX',  # database account
        password='XXXXXXXXXX',  # database password
        db='python',  # database (schema) name
        use_unicode=True,
        charset="utf8")
    try:
        # The cursor is closed automatically when the with-block exits.
        with db.cursor() as cursor:
            cursor.execute("DELETE from filedata_bak where city='濱州市(新聞)' and datakinds=0")
        db.commit()
    except pymysql.MySQLError as e:
        print(e)
        db.rollback()
    finally:
        # Previously the connection was never closed and leaked on every call.
        db.close()
儲存資料到資料庫
def txt(urls, title, content, hour, now):
    """Insert one scraped article into the ``filedata_bak`` table.

    Args:
        urls: URL of the article; stored in the ``datalink`` column.
        title: Article title; stored in ``title``.
        content: Article body text; stored in ``content``.
        hour: Publication date; stored in ``pubdate``.
        now: Timestamp of this import; stored in ``createtime``.

    Prints a success or failure message.  On a database error the
    transaction is rolled back.  The cursor and the connection are always
    closed.
    """
    # Connect to the database.
    db = pymysql.connect(
        host='XXXX.XXXX.XXXX.XXXX',  # database host address
        port=3306,  # database port
        user='XXXX',  # database account
        password='XXXXXXXXXX',  # database password
        db='python',  # database (schema) name
        use_unicode=True,
        charset="utf8")
    # Placeholders are filled in by the driver, which escapes every value.
    # The scraped title/content routinely contain quotes, which broke (and
    # could inject into) the previous string-formatted statement.
    sql = (
        "insert ignore into filedata_bak"
        "(websitename,datalink,title,content,datatype,city,province,datakinds,pubdate,createtime) "
        "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    )
    params = (
        "濱州市衛生健康委員會", urls, title, content, "文本",
        "濱州市(新聞)", "山東省", 0, hour, now,
    )
    try:
        # The cursor is closed automatically when the with-block exits.
        with db.cursor() as cursor:
            cursor.execute(sql, params)
        # Commit the insert.
        db.commit()
        print('恭喜您,導入資料成功!')
    except pymysql.MySQLError as e:
        # Roll back on failure and show why the insert was rejected.
        db.rollback()
        print('sorry,導入資料失敗!', e)
    finally:
        db.close()
def main():
    """Run the crawl against the Binzhou health-commission news listing."""
    site_root = 'http://wjw.binzhou.gov.cn/'
    listing_prefix = 'http://wjw.binzhou.gov.cn/xinwen/class/?2.html&page='
    get_html_text(index_url=listing_prefix, page_url=site_root)
# Script entry point.  The guard must test the dunder ``__name__``; the bare
# ``name`` used before is undefined and raised NameError at start-up.
if __name__ == '__main__':
    # Clear the rows left by the previous run before re-importing.
    delete_data()
    # Request headers sent with every HTTP request made by get_html_text().
    headers = {
        'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
    }
    # Module-level lists that get_html_text() fills while crawling the listing.
    urls = []
    times = []
    titles = []
    main()