Factorization machine principle and projects

principle:

projects:

Building a Movie Recommendation Service

Matrix factorization algorithm implemented by hand

load necessary packages

import sys
import codecs
import pandas as pd
import numpy as np
from numpy import linalg as LA

load movie data and rating data

movies=pd.read_csv("movies.csv")
ratings=pd.read_csv("ratings.csv")
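
The manual loader below hardcodes 718 users and 8927 movies; it can help to verify those counts against the loaded DataFrames first. A minimal check, using the standard MovieLens column names userId and movieId:

# sanity-check the hardcoded dimensions used later (718 users, 8927 movies)
print(ratings["userId"].nunique())   # expected: 718
print(movies["movieId"].nunique())   # expected: 8927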

Movie ids are not continuous, so build a movie dictionary keyed by the actual movie id, with the (zero-based) line number as the corresponding numpy column index.

def build_movies_dict(movies_file):
    i = 0
    movie_id_dict = {}
    with codecs.open(movies_file, 'r', 'latin-1') as f:
        for line in f:
            if i == 0:
                # skip the header line
                i = i + 1
            else:
                # maxsplit=2 keeps titles containing commas from breaking the unpacking
                movieId, title, genres = line.split(',', 2)
                movie_id_dict[int(movieId)] = i - 1
                i = i + 1
    return movie_id_dict
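
A quick, hypothetical usage check of the mapping (assuming movies.csv sits in the working directory, as in the pandas load above):

movie_id_dict = build_movies_dict("movies.csv")
print(len(movie_id_dict))    # should match the number of movies (8927 here)
print(movie_id_dict[1])      # the first data line (movieId 1) maps to column 0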

Each line of the input file represents one rating applied to one movie by one user, in the format userId,movieId,rating,timestamp. Make sure you know the number of users and items in your dataset. The function returns the (sparse) rating matrix as a dense numpy array.

def read_data(input_file, movies_dict):
    # no of users
    users = 718
    # no of movies
    movies = 8927
    X = np.zeros(shape=(users, movies))
    i = 0
    with open(input_file, 'r') as f:
        for line in f:
            if i == 0:
                # skip the header line
                i = i + 1
            else:
                user, movie_id, rating, timestamp = line.split(',')
                # get the column index for the numpy array construction
                col = movies_dict[int(movie_id)]
                X[int(user) - 1, col] = float(rating)
                i = i + 1
    return X

matrix factorization implementation

X is the user-movie rating matrix, which is very sparse. P and Q are the user-feature and movie-feature matrices respectively. The objective is to use gradient descent to find P and Q such that $P \times Q^T$ approximates X on the observed entries.
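
For reference, the standard regularized objective behind the code below, written with the same symbols (the implementation also folds the regularization term into the per-entry error $e_{ij}$ before taking the gradient, a small variation on this formulation):

$$\min_{P,Q} \sum_{(i,j):\,X_{ij}>0} e_{ij}^{2} + \frac{\beta}{2}\left(\lVert P\rVert^{2} + \lVert Q\rVert^{2}\right), \qquad e_{ij} = X_{ij} - \sum_{k=1}^{K} P_{ik}\,Q_{kj}$$

with the corresponding gradient-descent updates applied for every observed entry:

$$P_{ik} \leftarrow P_{ik} + \alpha\,(2\,e_{ij}\,Q_{kj} - \beta\,P_{ik}), \qquad Q_{kj} \leftarrow Q_{kj} + \alpha\,(2\,e_{ij}\,P_{ik} - \beta\,Q_{kj})$$

Note that inside matrix_factorization, Q is transposed to K x M, so $Q_{kj}$ is the entry for latent feature k and movie j.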

def matrix_factorization(X, P, Q, K, steps, alpha, beta):
    Q = Q.T
    for step in range(steps):
        print(step)
        # for each user
        for i in range(X.shape[0]):
            # for each item
            for j in range(X.shape[1]):
                if X[i][j] > 0:
                    # calculate the error of the element
                    eij = X[i][j] - np.dot(P[i, :], Q[:, j])
                    # second norm of P and Q for regularization
                    sum_of_norms = LA.norm(P) + LA.norm(Q)
                    # add the regularization term to the error
                    eij += (beta / 2) * sum_of_norms
                    # compute the gradient from the error and update P and Q
                    for k in range(K):
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])

        # compute the total squared error over the observed entries
        error = 0
        # for each user
        for i in range(X.shape[0]):
            # for each item
            for j in range(X.shape[1]):
                if X[i][j] > 0:
                    error += np.power(X[i][j] - np.dot(P[i, :], Q[:, j]), 2)
        if error < 0.001:
            break
    return P, Q.T
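
A quick way to sanity-check the implementation is to run it on a tiny hand-made rating matrix (this toy example is not part of the original script; the numbers are arbitrary):

# toy 4-user x 4-movie rating matrix with a few missing (zero) entries
X_toy = np.array([[5, 3, 0, 1],
                  [4, 0, 0, 1],
                  [1, 1, 0, 5],
                  [0, 1, 5, 4]], dtype=float)
K = 2
P0 = np.random.rand(X_toy.shape[0], K)
Q0 = np.random.rand(X_toy.shape[1], K)
# note: the function prints the step counter as it runs
P_hat, Q_hat = matrix_factorization(X_toy, P0, Q0, K, steps=5000, alpha=0.0002, beta=0.02)
# the observed (non-zero) entries of X_toy should be approximated closely
print(np.round(np.dot(P_hat, Q_hat.T), 2))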

main function

def main(X, K):
    # N: no of users
    N = X.shape[0]
    # M: no of movies
    M = X.shape[1]
    # P: an initial matrix of dimension N x K, where N is the no of users and K is the no of hidden latent features
    P = np.random.rand(N, K)
    # Q: an initial matrix of dimension M x K, where M is the no of movies and K is the no of hidden latent features
    Q = np.random.rand(M, K)
    # steps: the maximum number of optimisation steps (hardcoded)
    # alpha: the learning rate (hardcoded)
    # beta: the regularization parameter (hardcoded)
    steps = 5000
    alpha = 0.0002
    beta = 0.02
    estimated_P, estimated_Q = matrix_factorization(X, P, Q, K, steps, alpha, beta)
    # predicted numpy array of users and movie ratings
    modeled_X = np.dot(estimated_P, estimated_Q.T)
    np.savetxt('mf_result.txt', modeled_X, delimiter=',')

if __name__ == '__main__':
    # usage: MatrixFactorization.py <rating file> <no of hidden features> <movie mapping file>
    if len(sys.argv) == 4:
        ratings_file = sys.argv[1]
        no_of_features = int(sys.argv[2])
        movies_mapping_file = sys.argv[3]

        # build a dictionary mapping actual movie ids to column indices
        movies_dict = build_movies_dict(movies_mapping_file)
        # read the rating data and return a numpy array
        numpy_arr = read_data(ratings_file, movies_dict)
        # run the factorization
        main(numpy_arr, no_of_features)
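
Saving the functions above in one script (the usage comment suggests the name MatrixFactorization.py), the model can be built with, for example, python MatrixFactorization.py ratings.csv 20 movies.csv, where 20 is an arbitrary choice for the number of hidden features; the predicted user-movie rating matrix is written to mf_result.txt for the recommendation step below.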

recommend movies for users who have rated some of the movies

Recommend the top 25 movies for each user from among his/her unrated movies. This is implemented separately from building the model, since once the model is built it can be reused many times.

import sys
import codecs
import operator
import numpy as np

def dict_with_user_unrated_movies(rating_file, movie_mapping_id):
    # no of users
    users = 718
    # no of movies
    movies = 8927
    dict_with_unrated_movies_users = {}
    X = np.zeros(shape=(users, movies))
    i = 0
    with open(rating_file, 'r') as f:
        for line in f:
            if i == 0:
                # skip the header line
                i = i + 1
            else:
                user, movie, rating, timestamp = line.split(',')
                col = movie_mapping_id[int(movie)]
                X[int(user) - 1, col] = float(rating)
                i = i + 1
    # for each user, collect the (1-based) ids of the movies he/she has not rated
    for row in range(X.shape[0]):
        unrated_movie_ids = np.nonzero(X[row] == 0)
        unrated_movie_ids = list(unrated_movie_ids[0])
        unrated_movie_ids = [x + 1 for x in unrated_movie_ids]
        dict_with_unrated_movies_users[row + 1] = unrated_movie_ids
    return dict_with_unrated_movies_users


# recommend the top 25 movies for each specified user
def top_25_recommended_movies(pred_rating_file, users, unrated_movies_per_user, movies_mapping_names, movie_mapping_id):
    # dictionary with the numpy movie id as key and the actual movie id as value
    reverse_movie_id_mapping = {}
    for key, val in movie_mapping_id.items():
        reverse_movie_id_mapping[val] = key
    # for each user, predict the top 25 movies
    for user in users:
        dict_pred_unrated_movies = {}
        unrated_movies = unrated_movies_per_user[int(user)]
        for unrated_movie in unrated_movies:
            dict_pred_unrated_movies[int(unrated_movie)] = pred_rating_file[int(user) - 1][int(unrated_movie) - 1]
        # sort the unrated movies by predicted rating
        SortedMovies = sorted(dict_pred_unrated_movies.items(), key=operator.itemgetter(1), reverse=True)
        print("Top 25 movies recommendation for the user", user)
        for i in range(min(25, len(SortedMovies))):
            movie_id, rating = SortedMovies[i]
            actual_movie_id = reverse_movie_id_mapping[movie_id]
            # recommend a movie only if its predicted rating is at least 3.5
            if rating >= 3.5:
                print("{}".format(movies_mapping_names[actual_movie_id]))
                #print("{} with Movie rating value {}".format(movies_mapping_names[actual_movie_id], rating))
        print("\n")

# main method
def recommend_movies_for_users(orig_rating_file, pred_rating_file, movies_file, users):
    # get the mapping between movie names, actual movie ids and numpy movie ids
    # (dict_with_movie_and_id is not shown in this excerpt; see the sketch after this block)
    movies_mapping_names, movie_mapping_id = dict_with_movie_and_id(movies_file)
    # build the predicted ratings numpy array from the saved matrix of user and movie ratings
    predicted_rating_numpy_array = build_predicted_numpy_array(pred_rating_file)
    # dictionary of unrated movies for each user
    dict_with_unrated_movies_users = dict_with_user_unrated_movies(orig_rating_file, movie_mapping_id)
    # recommend the top 25 unrated movies per user based on their predicted scores
    top_25_recommended_movies(predicted_rating_numpy_array, users, dict_with_unrated_movies_users, movies_mapping_names, movie_mapping_id)

if __name__ == '__main__':
    if len(sys.argv) == 5:
        # read the original ratings, the predicted ratings, the movie file and the list of users
        orig_rating_file = sys.argv[1]
        pred_rating_file = sys.argv[2]
        movies_file = sys.argv[3]
        list_of_users = sys.argv[4]
        with open(list_of_users, 'r') as f:
            users = f.readline().split(',')
        recommend_movies_for_users(orig_rating_file, pred_rating_file, movies_file, users)
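
The script above calls dict_with_movie_and_id and build_predicted_numpy_array, which are not shown in this excerpt. A minimal sketch of what they could look like, inferred from how they are used (assumptions: movies.csv has the movieId,title,genres layout used earlier, and the predicted matrix was saved with np.savetxt as comma-separated rows):

import csv
import codecs
import numpy as np

def dict_with_movie_and_id(movies_file):
    # movieId -> title and movieId -> numpy column index, built like build_movies_dict above
    movies_mapping_names = {}
    movie_mapping_id = {}
    with codecs.open(movies_file, 'r', 'latin-1') as f:
        reader = csv.reader(f)
        next(reader)                      # skip the header line
        for idx, row in enumerate(reader):
            movie_id, title = int(row[0]), row[1]
            movies_mapping_names[movie_id] = title
            movie_mapping_id[movie_id] = idx
    return movies_mapping_names, movie_mapping_id

def build_predicted_numpy_array(pred_rating_file):
    # load the matrix written earlier by np.savetxt('mf_result.txt', ..., delimiter=',')
    return np.loadtxt(pred_rating_file, delimiter=',')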

Using Apache Spark to facilitate the computation

load data and convert it to RDD

movies = sc.textFile("/FileStore/tables/movies.csv")
ratings= sc.textFile("/FileStore/tables/ratings.csv")
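
These snippets assume an existing SparkContext named sc, as provided by a notebook environment such as Databricks (where the /FileStore paths also come from). Outside such an environment, a context can be created explicitly; a minimal sketch:

# only needed when the environment does not already provide `sc`
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("movie-recommendation").setMaster("local[*]")
sc = SparkContext.getOrCreate(conf)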

data cleaning

ratings=ratings.map(lambda x:x.split(","))
movies=movies.filter(lambda x:'movieId' not in x).map(lambda x:x.split(',')).map(lambda x:(x[0],(x[1],x[2])))

take a glimpse at the ratings data

ratings.take(2)

Out[154]: [['userId', 'movieId', 'rating', 'timestamp'], ['1', '1', '5.0', '847117005']]

remove the header line

ratings=ratings.filter(lambda x:"userId" not in x).map(lambda x:(x[0],x[1],x[2]))

split the data into training (60%), validation (20%) and test (20%) sets

training_RDD, validation_RDD, test_RDD = ratings.randomSplit([6, 2, 2])
validation_for_predict_RDD = validation_RDD.map(lambda x: (x[0], x[1]))
test_for_predict_RDD = test_RDD.map(lambda x: (x[0], x[1]))

use the alternating least squares (ALS) algorithm to calculate the latent factor matrices, trying several ranks and keeping the one with the lowest validation RMSE

from pyspark.mllib.recommendation import ALS
import math

iterations = 10
regularization_parameter = 0.1
ranks = [4, 8, 12]
errors = [0, 0, 0]
err = 0
tolerance = 0.02
min_error = float('inf')
best_rank = -1
best_iteration = -1
for rank in ranks:
    model = ALS.train(training_RDD, rank, iterations=iterations,
                      lambda_=regularization_parameter)
    predictions = model.predictAll(validation_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
    rates_and_preds = validation_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
    error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    errors[err] = error
    err += 1
    print('For rank %s the RMSE is %s' % (rank, error))
    if error < min_error:
        min_error = error
        best_rank = rank
print('The best model was trained with rank %s' % best_rank)

For rank 4 the RMSE is 0.9359056108007288
For rank 8 the RMSE is 0.9382743636404246
For rank 12 the RMSE is 0.9389290854027168
The best model was trained with rank 4
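
Rank 4 gives the lowest validation RMSE, so it is used below to retrain the model on the training set and evaluate it on the held-out test set.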

what the prediction looks like

predictions.take(3)

Out[159]:
[((44, 3272), 3.9103419004701716),
((618, 7184), 2.9601086695162566),
((264, 52328), 3.3141828739969803)]

test model performance on test data set

model = ALS.train(training_RDD, best_rank, iterations=iterations,
                  lambda_=regularization_parameter)
predictions = model.predictAll(test_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
rates_and_preds = test_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())

print('For testing data the RMSE is %s' % (error))

For testing data the RMSE is 0.9457633345043662

recommend movies for a specified user, here, user with id=2

# Recommend movies that the user has not rated himself; use the user with ID 2 as an example.
# First find all movies unrated by user 2.
personInfo = ratings.filter(lambda x: x[0] == '2')
movieRated = personInfo.map(lambda x: x[1]).collect()
movieUnrated = ratings.filter(lambda x: x[1] not in movieRated).map(lambda x: x[1]).distinct()
# build (user, movie) pairs for user 2 and predict ratings for them
moviePredict = movieUnrated.map(lambda x: ['2', x])
predictRes = model.predictAll(moviePredict)
# keep only (movieId, predicted rating) pairs
predictRes = predictRes.map(lambda x: (x.product, x.rating))

join movie information

predictRes=predictRes.map(lambda x:(str(x[0]),(x[1]))).join(movies)
predictRes.take(3)

Out[169]:
[('27706',
(3.761019669094141,
("Lemony Snicket's A Series of Unfortunate Events (2004)",
'Adventure|Children|Comedy|Fantasy'))),
('37240', (2.4144329602095578, ('Why We Fight (2005)', 'Documentary'))),
('45183',
(4.492474207135306,
('Protector The (a.k.a. Warrior King) (Tom yum goong) (2005)',
'Action|Comedy|Crime|Thriller')))]

reformat the results into (movie title, predicted rating) pairs and take the 25 movies with the highest predicted ratings

predictRes=predictRes.map(lambda x:(x[1][1][0],x[1][0]))
top_movies = predictRes.takeOrdered(25, key=lambda x: -x[1])
top_movies

Out[172]:
[('Paulie (1998)', 6.010185255833662),
('Sweet Land (2005)', 5.791408996053072),
('Stir Crazy (1980)', 5.76573694136966),
('Westerner The (1940)', 5.549436023244587),
('Battlestar Galactica (2003)', 5.533985629970289),
('Truly Madly Deeply (1991)', 5.483704154666285),
('Auntie Mame (1958)', 5.380268674819099),
('School Daze (1988)', 5.363982694591471),
('Bugs Bunny / Road Runner Movie The (a.k.a. The Great American Chase) (1979)',
5.358809646115851),
('Cashback (2006)', 5.358809646115851),
('Memoirs of a Geisha (2005)', 5.317983563349337),
('About Time (2013)', 5.316004836486595),
('Vicious Kind The (2009)', 5.285894518338154),
('Spread (2009)', 5.285894518338154),
('New Rose Hotel (1998)', 5.27282052920912),
('Cocaine Cowboys (2006)', 5.264844765867043),
("Empire of the Wolves (L'empire des loups) (2005)", 5.242122655555558),
('Repo! The Genetic Opera (2008)', 5.242122655555558),
('Visitors The (Visiteurs Les) (1993)', 5.24178791470284),
('Aguirre: The Wrath of God (Aguirre der Zorn Gottes) (1972)',
5.23498039535383),
('Bloody Sunday (2002)', 5.2286529970954305),
('Mrs. Miniver (1942)', 5.216295450453579),
('Chaos (2001)', 5.212268024919439),
("Twelve O'Clock High (1949)", 5.207334792061248),
('American Pimp (1999)', 5.207111214164087)]