import requests
import re
import json
import time
from lxml import etree
import dill

# Global variables
tieba_prefix = "http://tieba.baidu.com"
userdict = {}

# Crawler parameters (used as a simple shared namespace)
class para:
    headers = None
    cookies = None
    max_loop = None
    max_page = None
    max_num = None

# Per-user information
class userinfo(object):
    def __init__(self, url):
        self.url = url
        self.id = None
        self.username = None
        self.age = None
        self.tie = None
        self.sex = None
        self.concern_num = None
        self.concern_url = None
        self.concern_list = []
        self.fans_num = None
        self.fans_url = None
        self.fans_list = []

    # Dump the scalar fields to a per-user JSON file
    def saveToFile(self):
        dictObj = {
            "url": self.url,
            "id": self.id,
            "username": self.username,
            "age": self.age,
            "tie": self.tie,
            "sex": self.sex,
            "concern_num": self.concern_num,
            "concern_url": self.concern_url,
            "fans_num": self.fans_num,
            "fans_url": self.fans_url
        }
        # The original built dictObj but never wrote it out; persist it as
        # JSON (this also explains the otherwise unused json import)
        with open('user_{}.json'.format(self.id), 'w', encoding='utf-8') as fp:
            json.dump(dictObj, fp, ensure_ascii=False)
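
# Illustrative saveToFile output for one user (all field values below are
# made-up examples, not data from a real crawl):
# {"url": "http://tieba.baidu.com/home/main?id=tb.1.xxx", "id": "tb.1.xxx",
#  "username": "example_user", "age": 2.5, "tie": 1234, "sex": "male",
#  "concern_num": "12", "concern_url": "...", "fans_num": "34", "fans_url": "..."}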

# Fetch a URL and return the parsed HTML tree; returns None on any failure
def getHtmlFromUrl(url, loop_info):
    response = requests.get(url, headers=para.headers, cookies=para.cookies)
    print('Current page: ' + url)
    print(loop_info)
    if response.status_code == 200:
        # "Sorry, the page you want to visit does not exist."
        if response.url == 'http://static.tieba.baidu.com/tb/error.html?ErrType=1':
            data = response.content.decode('gbk')  # this error page is GBK-encoded
            html = etree.HTML(data)
            result = html.xpath('//div[@id="errorText"]/p/text()')
            if len(result) > 0:
                print(result[0])
            else:
                print('Failed to extract the error message')
            return
        data = response.content.decode('utf-8')
        html = etree.HTML(data)
        # "Sorry, the user you want to visit has been blocked."
        if response.url == 'http://tieba.baidu.com/tb/static-ucenter/html/error.html':
            result = html.xpath('//div[@id="errorText"]/p/text()')
            if len(result) > 0:
                print(result[0])
            else:
                print('Failed to extract the error message')
            return
        # Normal result
        return html
    else:
        print('Failed to fetch the page')
        print(response.status_code)
        print(response.history)

# Extract the info on a user's profile page, then recurse into that
# user's following/fans pages
def get_concern_info(html, user, id, loop_info):
    # Resolve the user id from the nav link when the caller did not pass one
    if id == '':
        result = html.xpath('//a[@class="nav_icon nav_main"]/@href')[0]
        matchObj = re.search(r'.*?id=(tb.*)', result)
        if matchObj:
            id = matchObj.group(1)
        else:
            print("id No match!!")
            return
    # Username
    username = html.xpath(
        '//span[starts-with(@class,"userinfo_username ")]/text()')[0]
    # Tieba age in years, e.g. "吧龄:2.5年" -> 2.5
    result = html.xpath(
        '//div[@class="userinfo_userdata"]/span[2]/text()')[0][3:-1]
    age = float(result)
    # Post count
    result = html.xpath(
        '//div[@class="userinfo_userdata"]/span[4]/text()')[0][3:]
    # Counts of 10000+ are shown as a decimal with a trailing '万'
    # (simplified character, as served by the site)
    if result[-1] == '万':
        tie = int(float(result[0:-1]) * 10000)
    else:
        tie = int(result)
    # Gender, taken from the tail of the span's class attribute
    sex = html.xpath(
        '//div[@class="userinfo_userdata"]/span[1]/@class')[0][26:]
    # These fields may be absent; initialise them so the checks below never
    # hit unbound locals (the original tested locals() instead)
    concern_num = None
    concern_url = None
    fans_num = None
    fans_url = None
    # Following count
    result = html.xpath(
        '//ul[@id="concern_wrap_concern"]/..//span[@class="concern_num"]/a/text()'
    )
    if len(result) > 0:
        concern_num = result[0]
        # Following page
        result = html.xpath(
            '//ul[@id="concern_wrap_concern"]/..//span[@class="concern_num"]/a/@href'
        )
        concern_url = tieba_prefix + result[0]
    # Fan count
    result = html.xpath(
        '//ul[@id="concern_wrap_fans"]/..//span[@class="concern_num"]/a/text()'
    )
    if len(result) > 0:
        fans_num = result[0]
        # Fans page
        result = html.xpath(
            '//ul[@id="concern_wrap_fans"]/..//span[@class="concern_num"]/a/@href'
        )
        fans_url = tieba_prefix + result[0]
    # Fill in the user object
    user.id = id
    user.username = username
    user.age = age
    user.tie = tie
    user.sex = sex
    user.concern_num = concern_num
    user.concern_url = concern_url
    user.fans_num = fans_num
    user.fans_url = fans_url
    # Export user info on the fly
    #user.saveToFile()
    # Register the user as crawled
    userdict[id] = user
    print('Added user: ' + username)
    # Stop recursing past the configured depth
    if loop_info['Node'] <= para.max_loop:
        # Update loop info
        loop_info['Node'] = loop_info['Node'] + 1
        # Recurse into the following page; the " 關注頁"/" 粉絲頁" suffix is a
        # sentinel that get_concern checks to pick the right list
        if concern_url is not None:
            loop_info['Origin'] = username + " 關注頁"
            loop_concern(concern_url, loop_info, user)
        # Recurse into the fans page
        if fans_url is not None:
            loop_info['Origin'] = username + " 粉絲頁"
            loop_concern(fans_url, loop_info, user)

# Page through a following/fans list (Baidu only exposes the first 500 entries)
def loop_concern(url, loop_info, user):
    # Initialise the page counter
    loop_info['Page'] = 1
    # Remember which list this call is filling: deeper recursion
    # overwrites loop_info['Origin']
    origin = loop_info['Origin']
    while True:
        # Restore Origin before each page in case recursion clobbered it
        loop_info['Origin'] = origin
        # Fetch and process the current page
        html = getHtmlFromUrl(url, loop_info)
        # Stop if the current page could not be fetched
        if html is None:
            break
        get_concern(html, loop_info, user)
        # Stop at the page limit
        if loop_info['Page'] >= para.max_page:
            break
        # Update loop info
        loop_info['Page'] = loop_info['Page'] + 1
        # URL of the next page
        result = html.xpath(
            '//div[@class="pager pager-center"]/a[@class="next"]/@href')
        if len(result) > 0:
            url = tieba_prefix + result[0]
        else:
            # No next link: this was the last page
            break

# Extract the users listed on one following/fans page
def get_concern(html, loop_info, user):
    # Initialise the per-page counter
    loop_info['Num'] = 0
    # Capture Origin now: get_concern_info below may overwrite it
    origin = loop_info['Origin']
    pageIdList = html.xpath('//div[@class="user"]/@portrait')
    pageUrlList = html.xpath('//span[@class="name"]/a/@href')
    # pageUrlList is assumed to be the same length as pageIdList
    for i in range(len(pageIdList)):
        u_id = pageIdList[i]
        u_url = tieba_prefix + pageUrlList[i]
        # Stop at the per-page user limit
        if loop_info['Num'] >= para.max_num:
            break
        # Update loop info
        loop_info['Num'] = loop_info['Num'] + 1
        # Do not re-crawl users we have already seen
        if u_id not in userdict.keys():
            u_html = getHtmlFromUrl(u_url, loop_info)
            # Skip this user if the fetch failed
            if u_html is None:
                continue
            # Build the sub-user's info (this may recurse)
            sub_user = userinfo(u_url)
            get_concern_info(u_html, sub_user, u_id, loop_info)
        # Append to the parent's following/fans list
        if origin[-3:] == '關注頁':
            user.concern_list.append(userdict[u_id])
        elif origin[-3:] == '粉絲頁':
            user.fans_list.append(userdict[u_id])

def main(max_loop, max_page, max_num, origin_url):
    # Maximum recursion depth
    para.max_loop = max_loop
    # Maximum number of pages read per following/fans list
    para.max_page = max_page
    # Maximum users taken per page (a page lists at most 20)
    para.max_num = max_num
    # Request headers
    para.headers = {
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36'
    }
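    # cookies.txt is expected to hold a single browser-style cookie header
    # line; an illustrative example (names/values are placeholders, not real
    # credentials):
    #   BDUSS=xxxxxxxx; STOKEN=yyyyyyyy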
    # Read the saved cookie string and build the cookies dict
    para.cookies = {}
    with open(r'cookies.txt', 'r') as f:
        for line in f.read().split(';'):
            # maxsplit=1: split only on the first '=', values may contain '='
            name, value = line.strip().split('=', 1)
            # Add to the cookies dict
            para.cookies[name] = value
    # Initialise the loop bookkeeping
    loop_info = {'Node': 0, 'Page': 0, 'Num': 0, 'Origin': ''}
    # Create the root user
    user = userinfo(origin_url)
    # Fetch the first user's homepage
    html = getHtmlFromUrl(origin_url, loop_info)
    if html is None:
        print('Invalid origin url')
        return
    # Crawl the user's info (recursing into following/fans pages)
    get_concern_info(html, user, '', loop_info)
    return userdict

if __name__ == '__main__':
    origin_url = 'tieba user homepage url'  # placeholder: fill in a real user url
    main(2, 10, 2, origin_url)
    # Persist the whole interpreter session (including userdict) with dill
    filename = r"crawler_data.pkl"
    dill.dump_session(filename)
    print("Done!!")