承接上一篇文章:https://blog.csdn.net/weixin_43906500/article/details/115921689
上一篇文章中實作了擷取大V的熱門微網誌下的轉發清單,在本文中,通過對代碼進行改造,實作對轉發清單和評論清單的擷取
1.擷取使用者接口
轉發清單使用者請求如下:
評論清單使用者請求如下:
通過設定不同參數,來選擇不同接口擷取資料
# Excerpt: choose the request URL according to the requested list type.
# `type_name`, `id` (the post's mid) and `page_num` come from the
# surrounding code in the full listing.
if type_name == "mblog":
    # Repost list endpoint.
    url = "https://weibo.com/aj/v6/%s/info/big?id=%s&page=%d" % (type_name, id, page_num)
elif type_name == "comment":
    # Comment list endpoint.
    url = "https://weibo.com/aj/v6/%s/big?id=%s&from=singleWeiBo&page=%d" % (type_name, id, page_num)
2.分析擷取方法
通過檢視網頁可知
擷取轉發清單時,通過翻頁即可擷取,同時傳回的json資料中含有總頁數(totalpage)
而擷取評論清單時,雖通過page參數控制頁數,但終止條件不明確,通過分析json資料,發現以下規律:
當"正在加載,請稍候..."出現時,說明標籤正在加載,可繼續翻頁
當<!-- 檢視更多 --><!-- 檢視更多 -->中間為空時,說明已到終止頁
# Excerpt from the paging loop: decide whether another page should be
# requested. `r` is the decoded JSON response and `html` is r["data"]["html"].
for page_num in range(1, 5):
    # ... request the page and parse it into `r` and `html` here ...
    try:
        # Repost responses report the total number of pages directly.
        page = r["data"]["page"]["totalpage"]
        dic_post["totalpage"] = page
        print("總頁數為:%s" % page)
        if page_num >= int(page):
            break
    except (KeyError, TypeError, ValueError):
        # No usable page count (comment list): only a total count is
        # reported, so the end of the list is inferred from the HTML.
        count = r["data"]["count"]
        dic_post["count"] = count
        print("總評論數為:%s" % count)
        print(html)
        # The loading placeholder means more content is pending: keep paging.
        if "正在加載,請稍候..." in html:
            continue
        # Nothing between the two "view more" markers means the last page.
        # A missing marker is treated the same way instead of raising.
        more = html.split("<!-- 檢視更多 -->")
        if len(more) < 2 or more[1].strip() == "":
            break
3.資料結果如下
完整代碼如下
import json
import re
from urllib import request
import urllib
import config
# URL templates of the two supported list endpoints, keyed by ``type_name``.
# Each template takes (type_name, mid, page_num).
_LIST_URL_TEMPLATES = {
    "mblog": "https://weibo.com/aj/v6/%s/info/big?id=%s&page=%d",
    "comment": "https://weibo.com/aj/v6/%s/big?id=%s&from=singleWeiBo&page=%d",
}


def _fetch_text(url, headers, timeout=10):
    """Request *url* with *headers* and return the body decoded as UTF-8."""
    req = urllib.request.Request(url=url, headers=headers)
    # The context manager closes the HTTP response even if read() fails.
    with urllib.request.urlopen(url=req, timeout=timeout) as resp:
        return resp.read().decode("utf-8")


def _collect_users(mid, type_name, headers, max_pages, dic_post):
    """Page through one post's list and return the user ids found.

    Records "totalpage" (when the response has one) or "count" in
    *dic_post* as a side effect. Paging is best-effort: any error while
    handling a page stops paging for this post and keeps what was collected.
    """
    users = []
    template = _LIST_URL_TEMPLATES[type_name]
    for page_num in range(1, max_pages + 1):
        try:
            print("第%d頁" % page_num)
            url = template % (type_name, mid, page_num)
            print(url)
            payload = json.loads(_fetch_text(url, headers))
            html = payload["data"]["html"]
            users.extend(re.findall(r'<img.*?usercard="id=(.*?)"', html))

            # Responses with a page count can be stopped on that count.
            try:
                raw_total = payload["data"]["page"]["totalpage"]
                total_pages = int(raw_total)
            except (KeyError, TypeError, ValueError):
                total_pages = None

            if total_pages is not None:
                dic_post["totalpage"] = raw_total
                print("總頁數為:%s" % raw_total)
                if page_num >= total_pages:
                    break
                continue

            # No usable page count: only a total count is available, so the
            # last page has to be inferred from the returned HTML.
            count = payload["data"]["count"]
            dic_post["count"] = count
            print("總評論數為:%s" % count)
            print(html)
            # NOTE(review): confirm both markers below match the exact text
            # of the live response; any script/charset conversion of this
            # source would silently break the comparison.
            if "正在加載,請稍候..." in html:
                # Loading placeholder present: more content is pending.
                continue
            parts = html.split("<!-- 檢視更多 -->")
            # Empty text between the markers (or no marker at all) ends paging.
            if len(parts) < 2 or parts[1].strip() == "":
                break
        except Exception as exc:
            # Catch Exception rather than everything so that Ctrl-C
            # (KeyboardInterrupt) still stops the program.
            print("stopped paging mid=%s at page %d: %r" % (mid, page_num, exc))
            break
    return users


def get_user(o_id, type_name, max_pages=4):
    """Collect the users who reposted or commented on a user's hot posts.

    Fetches the hot-post page of user *o_id*, extracts each post's mid, then
    pages through the repost list or the comment list of every post.

    Args:
        o_id: Id of the user whose posts are scanned.
        type_name: "mblog" for the repost list, "comment" for the comment list.
        max_pages: Maximum number of pages requested per post (default 4).

    Returns:
        A dict ``{"o_id": o_id, "repost_data": [...]}`` where each list item
        holds "mid", "users", "user_len" and, when the response provided it,
        "totalpage" or "count".

    Raises:
        ValueError: If *type_name* is not a supported list type.
        Exception: Errors raised while fetching the user's post page
            propagate to the caller.
    """
    # Validate before any network access; previously an unknown type left the
    # request URL undefined and the resulting error was silently swallowed.
    if type_name not in _LIST_URL_TEMPLATES:
        raise ValueError(
            "type_name must be 'mblog' or 'comment', got %r" % (type_name,)
        )

    list_repost = []
    headers = config.get_headers()
    profile_html = _fetch_text("https://weibo.com/u/%s?is_hot=1" % o_id, headers)
    # Each match is (post path, mid), taken from the escaped markup of the page.
    links = re.findall(
        r'<a action-data=\\"allowForward=1&url=https:\\/\\/weibo.com\\/%s\\/(.*?)&mid=(\d+)&' % o_id,
        profile_html,
    )

    try:
        for post_num, (post_path, mid) in enumerate(links):
            print("第%d篇文章" % post_num)
            print(post_path, mid)
            dic_post = {}
            users = _collect_users(mid, type_name, headers, max_pages, dic_post)
            dic_post["mid"] = mid
            dic_post["users"] = users
            dic_post["user_len"] = len(users)
            list_repost.append(dic_post)
    except Exception as exc:
        # Best-effort: return whatever was gathered before the failure.
        print("aborted post scan for o_id=%s: %r" % (o_id, exc))

    return {"o_id": o_id, "repost_data": list_repost}
if __name__ == '__main__':
    import os

    # Create the output directory up front so open() cannot fail on it.
    os.makedirs("data", exist_ok=True)
    # Dump the repost list first, then the comment list, for the same user.
    for list_type in ("mblog", "comment"):
        dic = get_user("1906123125", list_type)
        out_path = "data/data_九州億品_%s.json" % list_type
        # `with` guarantees the file is flushed and closed after the dump.
        with open(out_path, "w", encoding="utf-8") as json_f:
            json.dump(dic, json_f, indent=4)