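# MovieLens dataset. This code only considers whether a user has interacted with an item: the rating value is ignored, and any rated item counts as one the user is interested in.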
import pandas as pd
from collections import defaultdict
import numpy as np
# Load the ratings file and split it into training and hold-out rows.
ratings = pd.read_csv('./movielens/ratings.csv')
# A fixed random_state keeps the 75% training sample reproducible.
trainset = ratings.sample(frac=0.75, random_state=0)
# Every row that was not sampled for training becomes test data.
testset = ratings.drop(index=trainset.index)
# Build both lookup tables from the training rows. Only the presence of a
# rating matters here; the rating value itself is never read.
user_item = defaultdict(list)  # user id -> movie ids rated by that user
item_user = defaultdict(list)  # movie id -> user ids who rated it (inverted index)
# Iterate over the two id columns directly instead of DataFrame.iterrows():
# iterrows() builds a Series for every row and does not preserve column
# dtypes, which is slow and is why the ids had to be cast back to int.
for userId, movieId in zip(trainset["userId"], trainset["movieId"]):
    userId = int(userId)
    movieId = int(movieId)
    user_item[userId].append(movieId)
    item_user[movieId].append(userId)
# Build the user co-occurrence matrix: w[u][v] is the number of movies that
# both user u and user v rated in the training data.
# User ids index the matrix directly, so it needs at least (largest id + 1)
# rows. 611 is kept as the minimum because other parts of the script index
# rows 1..610 (presumably the id range of this dataset - TODO confirm).
num_users = max(611, max(user_item, default=0) + 1)
w = np.zeros([num_users, num_users])
# Re-key by ascending movie id. Note that this also replaces the defaultdict
# with a plain dict, so looking up an unknown movie id now raises KeyError.
item_user = dict(sorted(item_user.items(), key=lambda x: x[0]))
for users in item_user.values():
    for u in users:
        for v in users:
            if u != v:
                # The nested loops visit both (u, v) and (v, u), so a single
                # increment per visit already keeps w symmetric. Incrementing
                # w[v][u] here as well would count every shared movie twice.
                w[u][v] += 1
# Turn the co-occurrence counts in w into user-user similarities by dividing
# each entry by sqrt(|items of i| * |items of j|).
rated_counts = [len(user_item[uid]) for uid in range(len(w))]
for i in range(1, len(w)):
    for j in range(i):
        denom = rated_counts[i] * rated_counts[j]
        # A user without any training ratings gets similarity 0 to everyone.
        sim = w[i][j] / np.sqrt(denom) if denom != 0 else 0.0
        # Write the value to both triangles so the matrix stays symmetric.
        w[i][j] = sim
        w[j][i] = sim
recommendations = defaultdict(list)  # user id -> list of recommended movie ids


def get_recommendations(uid, k=10, n=10):
    """Compute the top-``n`` movie recommendations for user ``uid``.

    The ``k`` users with the highest similarity to ``uid`` (row ``uid`` of the
    global matrix ``w``) act as neighbours. Every movie a neighbour has in the
    global ``user_item`` table that ``uid`` has not rated is a candidate, and
    its score is the sum of the similarities of the neighbours who rated it.

    Args:
        uid: User id; used both as a key of ``user_item`` and a row of ``w``.
        k: Number of most similar users to use as neighbours.
        n: Maximum number of movies to recommend.

    Returns:
        The list of recommended movie ids, best first. The same list is
        stored in the global ``recommendations[uid]``, replacing any earlier
        result so repeated calls do not accumulate duplicates.
    """
    # A set makes the "already rated" test O(1) instead of a list scan.
    seen = set(user_item[uid])
    # argsort is ascending, so reverse it to get the most similar users first.
    neighbours = np.argsort(w[uid])[::-1][:k]

    scores = {}
    for v in neighbours:
        sim = w[uid][v]
        for movie in user_item[int(v)]:
            if movie in seen:
                continue
            # Only the k neighbours contribute to a score. Summing over
            # every user who rated the movie would ignore k entirely.
            scores[movie] = scores.get(movie, 0.0) + sim

    # sorted() is stable, so equally scored movies keep discovery order.
    ranked = sorted(scores, key=scores.get, reverse=True)
    recommendations[uid] = ranked[:n]
    return recommendations[uid]
# Fill the recommendation table for user ids 1 through 610.
for user_id in range(1, 611):
    get_recommendations(user_id)
def cal_precision(num_users=610):
    """Return the mean per-user precision of the stored recommendations.

    For each user id from 1 to ``num_users`` the precision is the share of
    that user's entries in the global ``recommendations`` table that also
    appear among the movies the user has in the global ``testset``. The
    per-user values are summed and divided by ``num_users``.

    Args:
        num_users: Highest user id to evaluate; also the averaging divisor.

    Returns:
        The averaged precision as a float. Users without any recommendations
        contribute 0 instead of raising ZeroDivisionError.
    """
    if num_users <= 0:
        return 0.0

    # Collect the held-out movies per user. Reading the id columns directly
    # avoids the per-row Series that DataFrame.iterrows() would build.
    held_out = defaultdict(set)
    for user_id, movie_id in zip(testset["userId"], testset["movieId"]):
        held_out[int(user_id)].add(int(movie_id))

    total = 0.0
    for user_id in range(1, num_users + 1):
        # .get() avoids inserting empty entries into either lookup table.
        recommended = set(recommendations.get(user_id, ()))
        if not recommended:
            continue
        hits = recommended.intersection(held_out.get(user_id, ()))
        total += len(hits) / len(recommended)
    return total / num_users
# Print the metric: a bare call would compute the precision and then discard
# it when this file is run as a script.
print("precision:", cal_precision())
# Export the recommendations as CSV: one row per user holding the user id
# and that user's recommended movie ids, each followed by a single space.
with open("./movielens/usercf.csv", "w", encoding="utf-8") as f:
    f.write("userId,recommendation\n")
    # Stop at 610, the last user id recommendations are generated for. The
    # previous bound of 620 wrote empty rows for ids 611..619.
    for i in range(1, 611):
        movie_ids = "".join(f"{x} " for x in recommendations[i])
        f.write(f"{i},{movie_ids}\n")