Factorization machine principle and projects

principle:

projects:

Building a Movie Recommendation Service

Matrix factorization algorithm implemented by hand

load necessary packages

import sys
import codecs
import pandas as pd
import numpy as np
from numpy import linalg as LA

load movie data and rating data

movies=pd.read_csv("movies.csv")
ratings=pd.read_csv("ratings.csv")
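
The manual loader below hardcodes 718 users and 8927 movies; it can help to verify those counts against the loaded DataFrames first. A minimal check, using the standard MovieLens column names userId and movieId:

# sanity-check the hardcoded dimensions used later (718 users, 8927 movies)
print(ratings["userId"].nunique())   # expected: 718
print(movies["movieId"].nunique())   # expected: 8927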

Movie ids are not continuous, so build a movie dictionary keyed by the actual movie id, with the (zero-based) line number as the corresponding numpy column index.

def build_movies_dict(movies_file):
    i = 0
    movie_id_dict = {}
    with codecs.open(movies_file, 'r', 'latin-1') as f:
        for line in f:
            if i == 0:
                # skip the header line
                i = i + 1
            else:
                # maxsplit=2 keeps titles containing commas from breaking the unpacking
                movieId, title, genres = line.split(',', 2)
                movie_id_dict[int(movieId)] = i - 1
                i = i + 1
    return movie_id_dict
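
A quick, hypothetical usage check of the mapping (assuming movies.csv sits in the working directory, as in the pandas load above):

movie_id_dict = build_movies_dict("movies.csv")
print(len(movie_id_dict))    # should match the number of movies (8927 here)
print(movie_id_dict[1])      # the first data line (movieId 1) maps to column 0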

Each line of the input file represents one rating applied to one movie by one user, in the format userId,movieId,rating,timestamp. Make sure you know the number of users and items in your dataset. The function returns the (sparse) rating matrix as a dense numpy array.

def read_data(input_file, movies_dict):
    # no of users
    users = 718
    # no of movies
    movies = 8927
    X = np.zeros(shape=(users, movies))
    i = 0
    with open(input_file, 'r') as f:
        for line in f:
            if i == 0:
                # skip the header line
                i = i + 1
            else:
                user, movie_id, rating, timestamp = line.split(',')
                # get the column index for the numpy array construction
                col = movies_dict[int(movie_id)]
                X[int(user) - 1, col] = float(rating)
                i = i + 1
    return X

matrix factorization implementation

X is the user-movie rating matrix, which is very sparse. P and Q are the user-feature and movie-feature matrices respectively. The objective is to use gradient descent to find P and Q such that $P \times Q^T$ approximates X on the observed entries.
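
For reference, the standard regularized objective behind the code below, written with the same symbols (the implementation also folds the regularization term into the per-entry error $e_{ij}$ before taking the gradient, a small variation on this formulation):

$$\min_{P,Q} \sum_{(i,j):\,X_{ij}>0} e_{ij}^{2} + \frac{\beta}{2}\left(\lVert P\rVert^{2} + \lVert Q\rVert^{2}\right), \qquad e_{ij} = X_{ij} - \sum_{k=1}^{K} P_{ik}\,Q_{kj}$$

with the corresponding gradient-descent updates applied for every observed entry:

$$P_{ik} \leftarrow P_{ik} + \alpha\,(2\,e_{ij}\,Q_{kj} - \beta\,P_{ik}), \qquad Q_{kj} \leftarrow Q_{kj} + \alpha\,(2\,e_{ij}\,P_{ik} - \beta\,Q_{kj})$$

Note that inside matrix_factorization, Q is transposed to K x M, so $Q_{kj}$ is the entry for latent feature k and movie j.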

def matrix_factorization(X, P, Q, K, steps, alpha, beta):
    Q = Q.T
    for step in range(steps):
        print(step)
        # for each user
        for i in range(X.shape[0]):
            # for each item
            for j in range(X.shape[1]):
                if X[i][j] > 0:
                    # calculate the error of the element
                    eij = X[i][j] - np.dot(P[i, :], Q[:, j])
                    # second norm of P and Q for regularization
                    sum_of_norms = LA.norm(P) + LA.norm(Q)
                    # add the regularization term to the error
                    eij += (beta / 2) * sum_of_norms
                    # compute the gradient from the error and update P and Q
                    for k in range(K):
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])

        # compute the total squared error over the observed entries
        error = 0
        # for each user
        for i in range(X.shape[0]):
            # for each item
            for j in range(X.shape[1]):
                if X[i][j] > 0:
                    error += np.power(X[i][j] - np.dot(P[i, :], Q[:, j]), 2)
        if error < 0.001:
            break
    return P, Q.T
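
A quick way to sanity-check the implementation is to run it on a tiny hand-made rating matrix (this toy example is not part of the original script; the numbers are arbitrary):

# toy 4-user x 4-movie rating matrix with a few missing (zero) entries
X_toy = np.array([[5, 3, 0, 1],
                  [4, 0, 0, 1],
                  [1, 1, 0, 5],
                  [0, 1, 5, 4]], dtype=float)
K = 2
P0 = np.random.rand(X_toy.shape[0], K)
Q0 = np.random.rand(X_toy.shape[1], K)
# note: the function prints the step counter as it runs
P_hat, Q_hat = matrix_factorization(X_toy, P0, Q0, K, steps=5000, alpha=0.0002, beta=0.02)
# the observed (non-zero) entries of X_toy should be approximated closely
print(np.round(np.dot(P_hat, Q_hat.T), 2))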

main function

def main(X, K):
    # N: no of users
    N = X.shape[0]
    # M: no of movies
    M = X.shape[1]
    # P: an initial matrix of dimension N x K, where N is the no of users and K is the no of hidden latent features
    P = np.random.rand(N, K)
    # Q: an initial matrix of dimension M x K, where M is the no of movies and K is the no of hidden latent features
    Q = np.random.rand(M, K)
    # steps: the maximum number of optimisation steps (hardcoded)
    # alpha: the learning rate (hardcoded)
    # beta: the regularization parameter (hardcoded)
    steps = 5000
    alpha = 0.0002
    beta = 0.02
    estimated_P, estimated_Q = matrix_factorization(X, P, Q, K, steps, alpha, beta)
    # predicted numpy array of users and movie ratings
    modeled_X = np.dot(estimated_P, estimated_Q.T)
    np.savetxt('mf_result.txt', modeled_X, delimiter=',')

if __name__ == '__main__':
    # usage: MatrixFactorization.py <rating file> <no of hidden features> <movie mapping file>
    if len(sys.argv) == 4:
        ratings_file = sys.argv[1]
        no_of_features = int(sys.argv[2])
        movies_mapping_file = sys.argv[3]

        # build a dictionary mapping actual movie ids to column indices
        movies_dict = build_movies_dict(movies_mapping_file)
        # read the rating data and return a numpy array
        numpy_arr = read_data(ratings_file, movies_dict)
        # run the factorization
        main(numpy_arr, no_of_features)
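
Saving the functions above in one script (the usage comment suggests the name MatrixFactorization.py), the model can be built with, for example, python MatrixFactorization.py ratings.csv 20 movies.csv, where 20 is an arbitrary choice for the number of hidden features; the predicted user-movie rating matrix is written to mf_result.txt for the recommendation step below.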

recommend movies for users who have rated some of the movies

Recommend the top 25 movies for each user from among his/her unrated movies. This is implemented separately from building the model, since once the model is built it can be reused many times.

import sys
import codecs
import operator
import numpy as np

def dict_with_user_unrated_movies(rating_file, movie_mapping_id):
    # no of users
    users = 718
    # no of movies
    movies = 8927
    dict_with_unrated_movies_users = {}
    X = np.zeros(shape=(users, movies))
    i = 0
    with open(rating_file, 'r') as f:
        for line in f:
            if i == 0:
                # skip the header line
                i = i + 1
            else:
                user, movie, rating, timestamp = line.split(',')
                col = movie_mapping_id[int(movie)]
                X[int(user) - 1, col] = float(rating)
                i = i + 1
    # for each user, collect the (1-based) ids of the movies he/she has not rated
    for row in range(X.shape[0]):
        unrated_movie_ids = np.nonzero(X[row] == 0)
        unrated_movie_ids = list(unrated_movie_ids[0])
        unrated_movie_ids = [x + 1 for x in unrated_movie_ids]
        dict_with_unrated_movies_users[row + 1] = unrated_movie_ids
    return dict_with_unrated_movies_users


# recommend the top 25 movies for each specified user
def top_25_recommended_movies(pred_rating_file, users, unrated_movies_per_user, movies_mapping_names, movie_mapping_id):
    # dictionary with the numpy movie id as key and the actual movie id as value
    reverse_movie_id_mapping = {}
    for key, val in movie_mapping_id.items():
        reverse_movie_id_mapping[val] = key
    # for each user, predict the top 25 movies
    for user in users:
        dict_pred_unrated_movies = {}
        unrated_movies = unrated_movies_per_user[int(user)]
        for unrated_movie in unrated_movies:
            dict_pred_unrated_movies[int(unrated_movie)] = pred_rating_file[int(user) - 1][int(unrated_movie) - 1]
        # sort the unrated movies by predicted rating
        SortedMovies = sorted(dict_pred_unrated_movies.items(), key=operator.itemgetter(1), reverse=True)
        print("Top 25 movies recommendation for the user", user)
        for i in range(min(25, len(SortedMovies))):
            movie_id, rating = SortedMovies[i]
            actual_movie_id = reverse_movie_id_mapping[movie_id]
            # recommend a movie only if its predicted rating is at least 3.5
            if rating >= 3.5:
                print("{}".format(movies_mapping_names[actual_movie_id]))
                #print("{} with Movie rating value {}".format(movies_mapping_names[actual_movie_id], rating))
        print("\n")

# main method
def recommend_movies_for_users(orig_rating_file, pred_rating_file, movies_file, users):
    # get the mapping between movie names, actual movie ids and numpy movie ids
    # (dict_with_movie_and_id is not shown in this excerpt; see the sketch after this block)
    movies_mapping_names, movie_mapping_id = dict_with_movie_and_id(movies_file)
    # build the predicted ratings numpy array from the saved matrix of user and movie ratings
    predicted_rating_numpy_array = build_predicted_numpy_array(pred_rating_file)
    # dictionary of unrated movies for each user
    dict_with_unrated_movies_users = dict_with_user_unrated_movies(orig_rating_file, movie_mapping_id)
    # recommend the top 25 unrated movies per user based on their predicted scores
    top_25_recommended_movies(predicted_rating_numpy_array, users, dict_with_unrated_movies_users, movies_mapping_names, movie_mapping_id)

if __name__ == '__main__':
    if len(sys.argv) == 5:
        # read the original ratings, the predicted ratings, the movie file and the list of users
        orig_rating_file = sys.argv[1]
        pred_rating_file = sys.argv[2]
        movies_file = sys.argv[3]
        list_of_users = sys.argv[4]
        with open(list_of_users, 'r') as f:
            users = f.readline().split(',')
        recommend_movies_for_users(orig_rating_file, pred_rating_file, movies_file, users)
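
The script above calls dict_with_movie_and_id and build_predicted_numpy_array, which are not shown in this excerpt. A minimal sketch of what they could look like, inferred from how they are used (assumptions: movies.csv has the movieId,title,genres layout used earlier, and the predicted matrix was saved with np.savetxt as comma-separated rows):

import csv
import codecs
import numpy as np

def dict_with_movie_and_id(movies_file):
    # movieId -> title and movieId -> numpy column index, built like build_movies_dict above
    movies_mapping_names = {}
    movie_mapping_id = {}
    with codecs.open(movies_file, 'r', 'latin-1') as f:
        reader = csv.reader(f)
        next(reader)                      # skip the header line
        for idx, row in enumerate(reader):
            movie_id, title = int(row[0]), row[1]
            movies_mapping_names[movie_id] = title
            movie_mapping_id[movie_id] = idx
    return movies_mapping_names, movie_mapping_id

def build_predicted_numpy_array(pred_rating_file):
    # load the matrix written earlier by np.savetxt('mf_result.txt', ..., delimiter=',')
    return np.loadtxt(pred_rating_file, delimiter=',')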

Using Apache Spark to facilitate the computation

load data and convert it to RDD

movies = sc.textFile("/FileStore/tables/movies.csv")
ratings= sc.textFile("/FileStore/tables/ratings.csv")
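
These snippets assume an existing SparkContext named sc, as provided by a notebook environment such as Databricks (where the /FileStore paths also come from). Outside such an environment, a context can be created explicitly; a minimal sketch:

# only needed when the environment does not already provide `sc`
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("movie-recommendation").setMaster("local[*]")
sc = SparkContext.getOrCreate(conf)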

data cleaning

ratings=ratings.map(lambda x:x.split(","))
movies=movies.filter(lambda x:'movieId' not in x).map(lambda x:x.split(',')).map(lambda x:(x[0],(x[1],x[2])))

take a glimpse at the ratings data

ratings.take(2)

Out[154]: [['userId', 'movieId', 'rating', 'timestamp'], ['1', '1', '5.0', '847117005']]

remove the header line

ratings=ratings.filter(lambda x:"userId" not in x).map(lambda x:(x[0],x[1],x[2]))

split the data into training (60%), validation (20%) and test (20%) sets

training_RDD, validation_RDD, test_RDD = ratings.randomSplit([6, 2, 2])
validation_for_predict_RDD = validation_RDD.map(lambda x: (x[0], x[1]))
test_for_predict_RDD = test_RDD.map(lambda x: (x[0], x[1]))

use the alternating least squares (ALS) algorithm to calculate the latent factor matrices, trying several ranks and keeping the one with the lowest validation RMSE

from pyspark.mllib.recommendation import ALS
import math

iterations = 10
regularization_parameter = 0.1
ranks = [4, 8, 12]
errors = [0, 0, 0]
err = 0
tolerance = 0.02
min_error = float('inf')
best_rank = -1
best_iteration = -1
for rank in ranks:
    model = ALS.train(training_RDD, rank, iterations=iterations,
                      lambda_=regularization_parameter)
    predictions = model.predictAll(validation_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
    rates_and_preds = validation_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
    error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    errors[err] = error
    err += 1
    print('For rank %s the RMSE is %s' % (rank, error))
    if error < min_error:
        min_error = error
        best_rank = rank
print('The best model was trained with rank %s' % best_rank)

For rank 4 the RMSE is 0.9359056108007288
For rank 8 the RMSE is 0.9382743636404246
For rank 12 the RMSE is 0.9389290854027168
The best model was trained with rank 4
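
Rank 4 gives the lowest validation RMSE, so it is used below to retrain the model on the training set and evaluate it on the held-out test set.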

what the prediction looks like

predictions.take(3)

Out[159]:
[((44, 3272), 3.9103419004701716),
((618, 7184), 2.9601086695162566),
((264, 52328), 3.3141828739969803)]

test model performance on test data set

model = ALS.train(training_RDD, best_rank, iterations=iterations,
                  lambda_=regularization_parameter)
predictions = model.predictAll(test_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
rates_and_preds = test_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())

print('For testing data the RMSE is %s' % (error))

For testing data the RMSE is 0.9457633345043662

recommend movies for a specified user, here, user with id=2

# Recommend movies that the user has not rated himself; use the user with ID 2 as an example.
# First find all movies unrated by user 2.
personInfo = ratings.filter(lambda x: x[0] == '2')
movieRated = personInfo.map(lambda x: x[1]).collect()
movieUnrated = ratings.filter(lambda x: x[1] not in movieRated).map(lambda x: x[1]).distinct()
# build (user, movie) pairs for user 2 and predict ratings for them
moviePredict = movieUnrated.map(lambda x: ['2', x])
predictRes = model.predictAll(moviePredict)
# keep only (movieId, predicted rating) pairs
predictRes = predictRes.map(lambda x: (x.product, x.rating))

join movie information

predictRes=predictRes.map(lambda x:(str(x[0]),(x[1]))).join(movies)
predictRes.take(3)

Out[169]:
[('27706',
(3.761019669094141,
("Lemony Snicket's A Series of Unfortunate Events (2004)",
'Adventure|Children|Comedy|Fantasy'))),
('37240', (2.4144329602095578, ('Why We Fight (2005)', 'Documentary'))),
('45183',
(4.492474207135306,
('Protector The (a.k.a. Warrior King) (Tom yum goong) (2005)',
'Action|Comedy|Crime|Thriller')))]

reformat the results into (movie title, predicted rating) pairs and take the 25 movies with the highest predicted ratings

predictRes=predictRes.map(lambda x:(x[1][1][0],x[1][0]))
top_movies = predictRes.takeOrdered(25, key=lambda x: -x[1])
top_movies

Out[172]:
[('Paulie (1998)', 6.010185255833662),
('Sweet Land (2005)', 5.791408996053072),
('Stir Crazy (1980)', 5.76573694136966),
('Westerner The (1940)', 5.549436023244587),
('Battlestar Galactica (2003)', 5.533985629970289),
('Truly Madly Deeply (1991)', 5.483704154666285),
('Auntie Mame (1958)', 5.380268674819099),
('School Daze (1988)', 5.363982694591471),
('Bugs Bunny / Road Runner Movie The (a.k.a. The Great American Chase) (1979)',
5.358809646115851),
('Cashback (2006)', 5.358809646115851),
('Memoirs of a Geisha (2005)', 5.317983563349337),
('About Time (2013)', 5.316004836486595),
('Vicious Kind The (2009)', 5.285894518338154),
('Spread (2009)', 5.285894518338154),
('New Rose Hotel (1998)', 5.27282052920912),
('Cocaine Cowboys (2006)', 5.264844765867043),
("Empire of the Wolves (L'empire des loups) (2005)", 5.242122655555558),
('Repo! The Genetic Opera (2008)', 5.242122655555558),
('Visitors The (Visiteurs Les) (1993)', 5.24178791470284),
('Aguirre: The Wrath of God (Aguirre der Zorn Gottes) (1972)',
5.23498039535383),
('Bloody Sunday (2002)', 5.2286529970954305),
('Mrs. Miniver (1942)', 5.216295450453579),
('Chaos (2001)', 5.212268024919439),
("Twelve O'Clock High (1949)", 5.207334792061248),
('American Pimp (1999)', 5.207111214164087)]