思路:
1.手工查找一些僵屍使用者
2.對僵屍使用者的關注、粉絲清單進行多層周遊擷取大量候選使用者集
3.手工标注僵屍使用者
技術難點在于第二步,工程量在第三步
1.手工查找僵屍使用者
通過在微網誌手工查找,發現僵屍使用者如下:
其特征較為明顯、關注多,粉絲少,幾乎沒有活躍度
2.疊代周遊僵屍使用者關注粉絲清單
參考此前文章:https://blog.csdn.net/weixin_43906500/article/details/115919312
相關代碼略有修改,并封裝為函數庫,如下
2.1擷取粉絲關注
get_follow_fan.py
import re
from urllib import request
import urllib
import code_weibo.config as config
import json
def get_follow_fan(o_id, num=5):
    """Crawl the follow list and fan list of a Weibo user.

    Parameters
    ----------
    o_id : str
        Numeric Weibo user id (the part after https://weibo.com/u/).
    num : int, optional
        Maximum number of list pages to fetch for each of the two lists.

    Returns
    -------
    dict
        {"follows": [...], "fans": [...]} where every entry is a dict
        {"uid": <digits>, "name": <nickname>}.
    """
    headers = config.get_headers()

    # Resolve the internal page_id embedded in the profile page HTML;
    # the follow/fan list URLs are keyed on it, not on o_id.
    req = urllib.request.Request(url='https://weibo.com/u/%s' % o_id, headers=headers)
    with urllib.request.urlopen(req, timeout=10) as resp:
        profile_html = resp.read().decode('utf-8')
    p_id = re.findall(r'CONFIG\[\'page_id\']=\'(\d+)\'', profile_html)[0]

    # uid/nickname pairs sit inside backslash-escaped HTML, hence the
    # literal \\" in the pattern. Compiled once, reused for every page.
    item_re = re.compile(
        r'action-type=\\"itemClick\\" action-data=\\"uid=(\d+)&fnick=(.*?)&')

    def _crawl_pages(url_tpl, label, skip_uids=()):
        """Fetch up to `num` pages of a paginated user list; stop early on
        an empty page. `skip_uids` entries are excluded from the result."""
        users = []
        for page in range(1, num + 1):
            req = urllib.request.Request(url=url_tpl % (p_id, page), headers=headers)
            with urllib.request.urlopen(req, timeout=10) as resp:
                page_html = resp.read().decode('utf-8')
            items = item_re.findall(page_html)
            print(label)
            print(len(items))
            if len(items) == 0:  # ran past the last page
                break
            for uid, name in items:
                if uid in skip_uids:
                    continue
                users.append({"uid": uid, "name": name})
        return users

    follow_data = _crawl_pages('https://weibo.com/p/%s/follow?page=%d', "關注:")
    # NOTE(review): 2671109275 looks like the crawler's own account and was
    # excluded from fan lists in the original code — kept as-is; confirm.
    fan_data = _crawl_pages('https://weibo.com/p/%s/follow?relate=fans&page=%d',
                            "粉絲:", skip_uids=("2671109275",))

    return {"follows": follow_data, "fans": fan_data}
if __name__ == '__main__':
    uid = "6598708078"
    dic_follow_fan = get_follow_fan(uid, num=5)
    print(dic_follow_fan)
    # Name the output after the uid actually crawled (the original
    # hard-coded an unrelated id, 7006403277, in the filename) and
    # close the file handle so the JSON is fully flushed to disk.
    with open("../data/dic_follow_fan_%s.json" % uid, "w") as json_f:
        json.dump(dic_follow_fan, json_f, indent=4)
通過編寫代碼,疊代調用上述代碼,實作擷取大量僵屍使用者候選集
一個僵屍使用者可查找100關注100粉絲,由于樣本沒有粉絲,故在關注清單周遊,可周遊使用者規模大約在10000數量級(100*100*2)
2.2擷取候選僵屍使用者集
其中重點需要關注的地方在于初始使用者的選取,根據分析僵屍使用者主要分為兩類,一類是具有大量關注少量粉絲的普通僵屍使用者,一類是具有大量粉絲,少量關注,但微網誌特征表現為僵屍特征
根據分析
當初始使用者選取第一類僵屍使用者時,第一層大多為第二類僵屍使用者,第二層使用者則為第一類僵屍使用者,樣本特征效果不佳
當初始使用者選取第二類僵屍使用者時,第一層存在不少第一類僵屍使用者,第二層使用者中會出現大量第一類僵屍使用者,樣本特征效果好
調用代碼如下
get_corpse_users.py
import json
from code_weibo import get_follow_fan
# Seed user: a hand-picked type-2 zombie account (many fans, few follows).
uid = "5737286648"
dic_all = []

# First layer: the seed user's own follow/fan lists.
dic_follow_fan = get_follow_fan.get_follow_fan(uid)
print(dic_follow_fan)
# `with` guarantees the JSON snapshot is flushed and the handle closed.
with open("users_data/dic_follow_fan_%s.json" % uid, "w") as json_f:
    json.dump(dic_follow_fan, json_f, indent=4)

follows = dic_follow_fan["follows"]
fans = dic_follow_fan["fans"]

# Second layer: crawl the follow/fan lists of every account the seed follows.
for i, follow in enumerate(follows, start=1):
    print("第%d頁" % i)
    uid_item = follow["uid"]
    name_item = follow["name"]
    print(uid_item)
    dic_follow_fan = get_follow_fan.get_follow_fan(uid_item)
    print(dic_follow_fan)
    dic_all.append({
        "uid": uid_item,
        "name": name_item,
        "data": dic_follow_fan,
    })

with open("users_data/dic_follow_all_%s_new.json" % uid, "w") as json_f:
    json.dump(dic_all, json_f, indent=4)
資料展示:
3.标記僵屍使用者集
标記僵屍資料集,借助此前獲得的僵屍使用者候選集,并利用selenium技術輔助标記,最終實作兩類僵屍使用者的标記
代碼如下所示
mark_corpse_users.py
import json
from code_weibo import get_user_info
from selenium import webdriver
browser = webdriver.Firefox()

# How the selenium cookies below were captured by hand:
# browser.get("https://weibo.com/u/2828585100")
# input()
# browser.get("https://weibo.com/u/2828585100")
# print(browser.get_cookies())

# Session cookies exported from a logged-in browser session.
cookies = [{'name': '_s_tentry', 'value': 'passport.weibo.com', 'path': '/', 'domain': '.weibo.com', 'secure': False, 'httpOnly': False, 'sameSite': 'None'}, {'name': 'Apache', 'value': '6794026294436.286.1620953056338', 'path': '/', 'domain': '.weibo.com', 'secure': False, 'httpOnly': False, 'sameSite': 'None'}, {'name': 'SINAGLOBAL', 'value': '6794026294436.286.1620953056338', 'path': '/', 'domain': '.weibo.com', 'secure': False, 'httpOnly': False, 'expiry': 1936313056, 'sameSite': 'None'}, {'name': 'ULV', 'value': '1620953056359:1:1:1:6794026294436.286.1620953056338:', 'path': '/', 'domain': '.weibo.com', 'secure': False, 'httpOnly': False, 'expiry': 1652057056, 'sameSite': 'None'}, {'name': 'WBtopGlobal_register_version', 'value': '2021051408', 'path': '/', 'domain': 'weibo.com', 'secure': False, 'httpOnly': False, 'sameSite': 'None'}, {'name': 'SCF', 'value': 'AjA2nZTlgOEISRLSh7zaUGcHLSancs_A9StQ9B2X8729_poa7MN5uq7XgzVrX_6mQYsMVP_dyuS2LHyLSvhRq84.', 'path': '/', 'domain': '.weibo.com', 'secure': True, 'httpOnly': True, 'expiry': 1936313100, 'sameSite': 'None'}, {'name': 'SUB', 'value': '_2A25NmbhcDeRhGeBL4loW8CbMzDSIHXVu7q6UrDV8PUNbmtAKLRT7kW9NRq1zk3_SB65S9z0QcCTqmiRmzO-0TZQU', 'path': '/', 'domain': '.weibo.com', 'secure': True, 'httpOnly': True, 'sameSite': 'None'}, {'name': 'SUBP', 'value': '0033WrSXqPxfM725Ws9jqgMF55529P9D9Whz61LsS7a7JfSRlmm6Blq55JpX5K2hUgL.Foqf1KnNehn7S0n2dJLoIpYc1K2Ni--ciKn7iKL2i--fiKLsiKLsi--Xi-iFiK.R', 'path': '/', 'domain': '.weibo.com', 'secure': True, 'httpOnly': False, 'expiry': 1652489100, 'sameSite': 'None'}, {'name': 'ALF', 'value': '1652489100', 'path': '/', 'domain': '.weibo.com', 'secure': True, 'httpOnly': False, 'expiry': 1652489100, 'sameSite': 'None'}, {'name': 'SSOLoginState', 'value': '1620953100', 'path': '/', 'domain': '.weibo.com', 'secure': True, 'httpOnly': False, 'sameSite': 'None'}]

# Cookies can only be attached after visiting the target domain once.
browser.get("https://weibo.com/337558589?is_all=1")
browser.delete_all_cookies()
for cookie in cookies:
    browser.add_cookie(cookie)

# Candidate set produced by get_corpse_users.py.
# (Original used `str = f.read()` — shadowed the builtin and leaked the handle.)
with open("../users_data/dic_follow_all_6878691599.json", "r") as f:
    data = json.load(f)


def _label_user(uid, record):
    """Fetch profile info into `record`, open the profile in the browser so
    the operator can inspect it, and read the manual label from stdin into
    record["mark"]. Raises whatever get_user_info/browser raise on failure."""
    get_user_info.get_user_info(uid, record)
    url = "https://weibo.com/u/%s" % uid
    print(url)
    browser.get(url)
    print(json.dumps(record, ensure_ascii=False, indent=4))
    record["mark"] = input()


i = 1            # 1-based index of the seed user being processed
uid_len = 0      # number of users labelled so far
dic_all = []     # accumulated labelled records
need_len = 200   # stop once this many users are labelled
do_flag = 1      # cleared when the operator types "end"
for item in data:
    print("第%d個使用者" % i)
    uid = item["uid"]
    name = item["name"]
    print(uid + ":" + name)
    item_data = item["data"]
    follows = item_data["follows"]
    fans = item_data["fans"]
    dic_uid = {"uid": uid, "name": name}
    try:
        _label_user(uid, dic_uid)
        print(json.dumps(dic_uid, ensure_ascii=False, indent=4))
        dic_all.append(dic_uid)
        uid_len = uid_len + 1
    except Exception:  # narrowed from bare except; labelling is best-effort
        print("跳過使用者%s" % uid)
        continue  # seed user unreachable -> skip its follow/fan lists too

    print("follows:")
    for follow in follows:
        print(uid_len)
        print(follow)
        uid = follow["uid"]
        try:
            _label_user(uid, follow)
            # "end" terminates the whole labelling session.
            if follow["mark"] == "end":
                do_flag = 0
                break
            print(json.dumps(follow, ensure_ascii=False, indent=4))
            dic_all.append(follow)
            uid_len = uid_len + 1
        except Exception:
            print("跳過使用者%s" % uid)
            continue
        if uid_len > need_len:
            break
    if do_flag == 0:
        break
    if uid_len > need_len:
        break

    print("fans:")
    for fan in fans:
        print(uid_len)
        print(fan)
        uid = fan["uid"]
        try:
            _label_user(uid, fan)
            if fan["mark"] == "end":
                do_flag = 0
                break
            print(json.dumps(fan, ensure_ascii=False, indent=4))
            dic_all.append(fan)
            uid_len = uid_len + 1
        except Exception:
            print("跳過使用者%s" % uid)
        if uid_len > need_len:
            break
    if do_flag == 0:
        break
    i = i + 1

print(uid_len)
with open("../users_data/dic_marked_6878691599.json", "w") as json_f:
    json.dump(dic_all, json_f, indent=4)
資料展示: