天天看點

使用 ML-latest-small 資料集進行矩陣分解並作出推薦

你的努力将成就更好的自己。

矩陣分解并作出推薦

# Character-encoding declaration (only honored on the first or second line of the file)
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import tensorflow as tf

def data_process(data_dir="F:\\ml-latest-small"):
    """Load the MovieLens CSV files, add a dense movie index, and save processed copies.

    Reads ``ratings.csv`` and ``movies.csv`` from ``data_dir``, assigns every
    movie a ``movieRow`` value taken from the movies frame's index, joins the
    ratings onto it by ``movieId``, and writes ``moviesProcessed.csv`` and
    ``ratingsProcessed.csv`` back into the same directory.

    Args:
        data_dir: Directory holding the input CSV files; the processed CSV
            files are written here too. Defaults to the location the script
            originally hard-coded.

    Returns:
        A ``(ratings, movies)`` tuple of DataFrames. ``ratings`` has the
        columns ``userId``, ``movieRow``, ``rating``; ``movies`` has the
        columns ``movieRow``, ``movieId``, ``title``.
    """
    import os  # local import so the block does not rely on a file-level one

    ratings = pd.read_csv(os.path.join(data_dir, "ratings.csv"))
    movies = pd.read_csv(os.path.join(data_dir, "movies.csv"))

    # The positional index of each movie becomes its row in the rating matrix.
    movies['movieRow'] = movies.index
    movies = movies[['movieRow', 'movieId', 'title']]
    movies.to_csv(os.path.join(data_dir, "moviesProcessed.csv"),
                  index=False, header=True, encoding='utf-8')
    print(movies.tail())

    # Swap the sparse movieId for the dense movieRow on every rating.
    ratings = pd.merge(ratings, movies, on='movieId')
    ratings = ratings[['userId', 'movieRow', 'rating']]
    ratings.to_csv(os.path.join(data_dir, "ratingsProcessed.csv"),
                   index=False, header=True, encoding='utf-8')
    print(ratings.head())
    return ratings, movies

#建構矩陣
def build_matrix(ratings):
    """Build the dense movie-by-user rating matrix and its "was rated" mask.

    Args:
        ratings: DataFrame with the columns ``userId``, ``movieRow`` and
            ``rating``. The two id columns are used directly as array
            indices (truncated to ``int``).

    Returns:
        A tuple ``(rate, record, user_no, movies_no)``:

        * ``rate`` - float array of shape ``(movies_no, user_no)`` holding
          each rating at ``[movieRow, userId]`` and 0 elsewhere.
        * ``record`` - int array of the same shape, 1 where ``rate > 0``.
        * ``user_no`` - largest ``userId`` plus one.
        * ``movies_no`` - largest ``movieRow`` plus one.
    """
    # +1 because the ids are used as zero-based indices into the matrix.
    user_no = ratings['userId'].max() + 1
    movies_no = ratings['movieRow'].max() + 1

    rate = np.zeros((movies_no, user_no))
    print(rate.shape)
    print(np.shape(ratings))

    # Keep only the last rating of any repeated (movie, user) pair so the
    # single indexed assignment below has a well-defined result.
    unique = ratings.drop_duplicates(subset=['movieRow', 'userId'], keep='last')
    movie_idx = unique['movieRow'].values.astype(int)
    user_idx = unique['userId'].values.astype(int)
    rate[movie_idx, user_idx] = unique['rating'].values

    # A cell counts as rated only when its stored rating is positive.
    record = np.array(rate > 0, dtype=int)
    print(record)
    return rate, record, user_no, movies_no
# Build the model (rating normalization and loss definition)
def normalizeRatings(rate, record):
    """Mean-center each row of ``rate`` over the entries marked in ``record``.

    Args:
        rate: 2-D array of ratings, one row per movie.
        record: Array of the same shape as ``rate``; a non-zero entry marks
            a rating that is present.

    Returns:
        A tuple ``(rating_norm, rating_mean)``:

        * ``rating_norm`` - array shaped like ``rate``; marked entries hold
          ``rate - row mean``, all other entries are 0.
        * ``rating_mean`` - array of shape ``(rows, 1)`` with the mean of
          the marked entries of each row. Rows without any marked entry get
          a mean of 0 (instead of NaN plus a "mean of empty slice" warning).
    """
    m, _ = rate.shape
    mask = record != 0

    counts = mask.sum(axis=1, keepdims=True)
    sums = np.where(mask, rate, 0.0).sum(axis=1, keepdims=True)

    # Divide only where at least one rating exists; other rows keep the 0
    # they were initialised with.
    rating_mean = np.divide(sums, counts, out=np.zeros((m, 1)), where=counts > 0)
    rating_norm = np.where(mask, rate - rating_mean, 0.0)
    return rating_norm, rating_mean
def build_model(rate, record, movies_no, user_no, num_features=12,
                reg_lambda=0.5, learning_rate=1e-3):
    """Define the matrix-factorization variables, loss and training op.

    The ratings are mean-centered with ``normalizeRatings`` and approximated
    by ``x @ theta.T``. The loss is half the sum of squared errors over the
    entries marked in ``record`` plus an L2 penalty on both factor matrices.
    Uses the graph-mode TensorFlow names the rest of the file relies on
    (``tf.random_normal``, ``tf.train.AdamOptimizer``).

    Args:
        rate: Rating matrix, passed through to ``normalizeRatings``.
        record: Mask matching ``rate``; multiplies the residual so only
            marked entries contribute to the loss.
        movies_no: Number of rows of the movie factor matrix ``x``.
        user_no: Number of rows of the user factor matrix ``theta``.
        num_features: Number of latent features per movie and per user.
        reg_lambda: Weight of the L2 penalty term.
        learning_rate: Learning rate handed to the Adam optimizer.

    Returns:
        A tuple ``(x, theta, train, loss, rating_mean)``: the two
        ``tf.Variable`` factor matrices, the optimizer's minimize op, the
        loss tensor, and the per-row rating means with NaN replaced by 0.
    """
    rating_norm, rating_mean = normalizeRatings(rate, record)
    # Rows without ratings may come back with a NaN mean; treat those as 0.
    rating_mean = np.nan_to_num(rating_mean)

    x = tf.Variable(tf.random_normal([movies_no, num_features], stddev=0.35))
    theta = tf.Variable(tf.random_normal([user_no, num_features], stddev=0.35))

    # `record` zeroes the residual wherever no rating is marked.
    residual = (tf.matmul(x, theta, transpose_b=True) - rating_norm) * record
    # 0.5 rather than 1/2: the latter is integer division (0) on Python 2.
    squared_error = 0.5 * tf.reduce_sum(residual ** 2)
    regularizer = reg_lambda * (0.5 * (tf.reduce_sum(x ** 2) + tf.reduce_sum(theta ** 2)))
    loss = squared_error + regularizer

    train = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    return x, theta, train, loss, rating_mean

#訓練模型
def Train(loss):
    """Register a scalar summary for ``loss`` and open a summary writer.

    Args:
        loss: Loss tensor to record under the tag ``train_loss``.

    Returns:
        A tuple ``(summaryMerged, writer)``: the op returned by
        ``tf.summary.merge_all()`` and the ``tf.summary.FileWriter``.
    """
    # NOTE(review): the path ends in ".csv" but is handed to FileWriter as its
    # first argument - confirm whether a directory or a file is intended.
    log_path = "F:\\ml-latest-small\\movie_tensorborad.csv"

    tf.summary.scalar('train_loss', loss)
    merged_summaries = tf.summary.merge_all()
    event_writer = tf.summary.FileWriter(log_path)
    return merged_summaries, event_writer

def recommend(movies, predictions=None, user_id=None, top_n=20):
    """Print the highest-scoring movies for one user.

    Args:
        movies: DataFrame with a ``title`` column, addressed positionally by
            the row index of the prediction matrix.
        predictions: 2-D array of predicted ratings indexed as
            ``[movie row, user id]``. When omitted, the module-level
            ``predicts`` array is used, as the original script did.
        user_id: Column of ``predictions`` to recommend for. When omitted,
            the user is prompted on standard input.
        top_n: Number of movies to print.

    Returns:
        None. The recommendations are written to standard output.
    """
    if user_id is None:
        user_id = input(u'您要想哪位使用者進行推薦?請輸入使用者編号:')
    user_col = int(user_id)

    if predictions is None:
        # Backward compatibility: fall back to the global set by the script.
        predictions = predicts

    # argsort() yields indices in ascending order of value; reversing it
    # gives the indices from the highest prediction to the lowest.
    sortedResult = predictions[:, user_col].argsort()[::-1]

    # center() pads the header to 80 characters with '=' on both sides.
    print((u'為該使用者推薦的評分最高的%d部電影是:' % top_n).center(80, '='))

    for i in sortedResult[:top_n]:
        # NOTE(review): the "- 2" offset on the displayed score is carried
        # over unchanged from the original; its rationale is not documented.
        print(u'評分: %.2f, 電影名: %s' % (predictions[i, user_col] - 2, movies.iloc[i]['title']))
#評估模型
if __name__ == "__main__":
    # Script entry point: preprocess the data, train the factorization,
    # report the training error, then print recommendations for one user.
    TRAIN_STEPS = 2000

    ratings, movies = data_process()
    rate, record, user_no, movies_no = build_matrix(ratings)
    x, theta, train, loss, rating_mean = build_model(rate, record, movies_no, user_no)
    summaryMerged, writer = Train(loss)
    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)
        try:
            for i in range(TRAIN_STEPS):
                _, movie_summary = sess.run([train, summaryMerged])
                writer.add_summary(movie_summary, i)
        finally:
            # Close the writer so buffered summaries are flushed to disk even
            # if training is interrupted.
            writer.close()
        current_x, current_theta = sess.run([x, theta])

    # The remaining steps work on NumPy arrays only, so the session has
    # already been released before the interactive prompt.
    # `predicts` stays a module-level name because recommend() reads it.
    predicts = np.dot(current_x, current_theta.T) + rating_mean
    # Square root of the summed squared error over the rated entries (a
    # total, not a per-rating average).
    error = np.sqrt(np.sum(((predicts - rate) * record) ** 2))
    print(u'模型評估errors:', error)

    recommend(movies)

複制

結果如下:

movieRow  movieId                                      title
9737      9737   193581  Black Butler: Book of the Atlantic (2017)
9738      9738   193583               No Game No Life: Zero (2017)
9739      9739   193585                               Flint (2017)
9740      9740   193587        Bungo Stray Dogs: Dead Apple (2018)
9741      9741   193609        Andrew Dice Clay: Dice Rules (1991)
   userId  movieRow  rating
0       1         0     4.0
1       5         0     4.0
2       7         0     4.5
3      15         0     2.5
4      17         0     4.5
(9742, 611)
(100836, 3)
[[0 1 0 ... 1 1 1]
 [0 0 0 ... 1 0 0]
 [0 1 0 ... 1 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

模型評估errors: 151.9801784601805
您要想哪位使用者進行推薦?請輸入使用者編号:1
==============================為該使用者推薦的評分最高的20部電影是:===============================
評分: 5.39, 電影名: Now You See Me (2013)
評分: 4.43, 電影名: Postman, The (Postino, Il) (1994)
評分: 4.40, 電影名: My Neighbor Totoro (Tonari no Totoro) (1988)
評分: 4.35, 電影名: Color Purple, The (1985)
評分: 4.23, 電影名: The Revenant (2015)
評分: 4.21, 電影名: Smoke (1995)
評分: 4.19, 電影名: Big Sleep, The (1946)
評分: 4.19, 電影名: Drugstore Cowboy (1989)
評分: 4.16, 電影名: Whale Rider (2002)
評分: 4.12, 電影名: Gandhi (1982)
評分: 4.10, 電影名: Murder in the First (1995)
評分: 4.10, 電影名: Lone Star (1996)
評分: 4.03, 電影名: Lifeboat (1944)
評分: 4.00, 電影名: Planes, Trains & Automobiles (1987)
評分: 3.99, 電影名: Moonstruck (1987)
評分: 3.93, 電影名: Remains of the Day, The (1993)
評分: 3.91, 電影名: To Kill a Mockingbird (1962)
評分: 3.90, 電影名: Crash (1996)
評分: 3.89, 電影名: Grave of the Fireflies (Hotaru no haka) (1988)
評分: 3.88, 電影名: Wallace & Gromit: The Best of Aardman Animation (1996)           

複制