[Recommendation system notes] Implementing the simplest recommendation system in Python (complete code attached)

Data introduction

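The data is a set of movie ratings in CSV format: the header row lists the user names, and each following row holds a movie title followed by each user's rating (an empty cell means the user has not rated that movie).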

Choosing a similarity measure: which one to use in which scenario

  • If the data suffers from "grade inflation" (different users rate on different scales), use the Pearson correlation coefficient.
  • If the data is "dense" (the variables almost always have values in common) and the magnitude of the differences is important, use Euclidean or Manhattan distance.
  • If the data is sparse, use cosine similarity.
    The Pearson correlation coefficient is used in this experiment.
Pearson correlation coefficient

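The formula for calculating the Pearson correlation coefficient is:

$$r = \frac{\sum_{i=1}^{n}(x_i-\bar{x})(y_i-\bar{y})}{\sqrt{\sum_{i=1}^{n}(x_i-\bar{x})^2}\,\sqrt{\sum_{i=1}^{n}(y_i-\bar{y})^2}}$$
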
Besides looking complicated, the formula above has another problem: the data must be traversed several times (first to compute the means, then to compute the sums) in order to obtain a result.
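Fortunately, there is another formula that computes an approximation of the Pearson correlation coefficient:

$$r = \frac{\sum_{i=1}^{n} x_i y_i - \frac{\sum_{i=1}^{n} x_i \sum_{i=1}^{n} y_i}{n}}{\sqrt{\sum_{i=1}^{n} x_i^2 - \frac{\left(\sum_{i=1}^{n} x_i\right)^2}{n}}\,\sqrt{\sum_{i=1}^{n} y_i^2 - \frac{\left(\sum_{i=1}^{n} y_i\right)^2}{n}}}$$
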
Although this formula may seem even more complex, and its results can be numerically unstable (small rounding errors may appear), its greatest advantage is that it can be implemented with a single pass over the data. The code is as follows:
def pearson_distance(self, usr1, usr2):
    """Return the Pearson correlation between two users' ratings.

    Only movies rated by BOTH users take part in the computation, which
    uses the single-pass formula

        r = (Sxy - Sx*Sy/n) / (sqrt(Sxx - Sx^2/n) * sqrt(Syy - Sy^2/n))

    Args:
        usr1: mapping of movie title -> numeric rating for the first user.
        usr2: mapping of movie title -> numeric rating for the second user.

    Returns:
        A value in [-1, 1].  0 is returned when the users share no movie
        or when either user's shared ratings have no spread (the
        correlation is undefined in that case).
    """
    n = 0
    sum_x = sum_y = 0
    sum_x_y = 0
    sum_x_2 = sum_y_2 = 0
    for movie, x in usr1.items():
        if movie not in usr2:
            continue
        y = usr2[movie]
        n += 1
        sum_x += x
        sum_y += y
        sum_x_y += x * y
        sum_x_2 += x ** 2
        sum_y_2 += y ** 2
    if n == 0:
        return 0

    # n times the variance of each user's shared ratings.
    spread_x = sum_x_2 - sum_x ** 2 / n
    spread_y = sum_y_2 - sum_y ** 2 / n
    # The single-pass formula is numerically unstable: rounding can make a
    # zero spread come out slightly negative, which would crash math.sqrt.
    if spread_x <= 0 or spread_y <= 0:
        return 0
    denominator = math.sqrt(spread_x) * math.sqrt(spread_y)
    if denominator == 0:
        return 0

    r = (sum_x_y - sum_x * sum_y / n) / denominator
    # Rounding may also push r marginally outside the valid range.
    return max(-1.0, min(1.0, r))

Complete Code


import csv
import math

class recommender(object):
    """User-based collaborative-filtering movie recommender.

    Ratings are read from a CSV file whose header row lists the user names
    (the first header cell is ignored) and whose remaining rows each hold a
    movie title followed by one integer rating per user; an empty cell
    means "not rated".

    Attributes set by ``__init__``:
        path: the CSV path the ratings were loaded from.
        usr: name of the user recommendations are computed for.
        usr_rating: dict mapping user name -> {movie title: rating}.
    """

    def __init__(self, path, usr):
        """Load the ratings stored at *path* and remember the target *usr*."""
        self.path = path
        self.usr = usr
        self.usr_rating = {}
        self.load_data(path)

    def o_distance(self, usr1, usr2):
        """Return the Euclidean distance between two users' ratings.

        Only movies rated by both users contribute; users with no movie in
        common therefore get a distance of 0.0.
        """
        distance = 0
        for movie, x in usr1.items():
            if movie in usr2:
                distance += (x - usr2[movie]) ** 2
        return math.sqrt(distance)

    def pearson_distance(self, usr1, usr2):
        """Return the Pearson correlation between two users' ratings.

        Only movies rated by BOTH users take part in the computation, which
        uses the single-pass formula

            r = (Sxy - Sx*Sy/n) / (sqrt(Sxx - Sx^2/n) * sqrt(Syy - Sy^2/n))

        Args:
            usr1: mapping of movie title -> numeric rating.
            usr2: mapping of movie title -> numeric rating.

        Returns:
            A value in [-1, 1].  0 is returned when the users share no
            movie or when either user's shared ratings have no spread (the
            correlation is undefined in that case).
        """
        n = 0
        sum_x = sum_y = 0
        sum_x_y = 0
        sum_x_2 = sum_y_2 = 0
        for movie, x in usr1.items():
            if movie not in usr2:
                continue
            y = usr2[movie]
            n += 1
            sum_x += x
            sum_y += y
            sum_x_y += x * y
            sum_x_2 += x ** 2
            sum_y_2 += y ** 2
        if n == 0:
            return 0

        # n times the variance of each user's shared ratings.
        spread_x = sum_x_2 - sum_x ** 2 / n
        spread_y = sum_y_2 - sum_y ** 2 / n
        # The single-pass formula is numerically unstable: rounding can make
        # a zero spread come out slightly negative, crashing math.sqrt.
        if spread_x <= 0 or spread_y <= 0:
            return 0
        denominator = math.sqrt(spread_x) * math.sqrt(spread_y)
        if denominator == 0:
            return 0

        r = (sum_x_y - sum_x * sum_y / n) / denominator
        # Rounding may also push r marginally outside the valid range.
        return max(-1.0, min(1.0, r))

    def k_nearst(self, k):
        """Return the (at most) *k* users most similar to ``self.usr``.

        Similarity is the Pearson correlation; users whose correlation with
        the target is exactly 0 (including users with nothing in common)
        are left out.

        Returns:
            A list of ``(user_name, similarity)`` tuples sorted by
            similarity, highest first.

        Raises:
            KeyError: if ``self.usr`` has no ratings in the loaded data.
        """
        if self.usr not in self.usr_rating:
            raise KeyError("user %r not found in ratings data" % (self.usr,))
        target = self.usr_rating[self.usr]

        distances = []
        for usr, ratings in self.usr_rating.items():
            if usr == self.usr:
                continue
            similarity = self.pearson_distance(target, ratings)
            if similarity != 0:
                distances.append((usr, similarity))
        distances.sort(key=lambda item: item[1], reverse=True)
        # Slicing already copes with k larger than the list.
        return distances[:k]

    def load_data(self, path):
        """Read the ratings CSV at *path* into ``self.usr_rating``.

        The file is parsed with the ``csv`` module so that quoted titles
        containing commas are handled correctly.  Blank rows, empty cells
        and cells beyond the header width are ignored; an empty file loads
        nothing.

        Raises:
            ValueError: if a non-empty rating cell is not an integer.
        """
        with open(path, newline='') as f:
            rows = [row for row in csv.reader(f) if row]
        if not rows:
            return

        usr_name = [name.strip().strip('"') for name in rows[0][1:]]
        for row in rows[1:]:
            movie = row[0].strip().strip('"')
            for name, cell in zip(usr_name, row[1:]):
                cell = cell.strip()
                if cell:
                    self.usr_rating.setdefault(name, {})[movie] = int(cell)

    def recomend_k(self, nearst, k):
        """Return the top *k* recommendations derived from *nearst*.

        Each neighbour's rating of a movie the target user has not rated is
        weighted by that neighbour's share of the total similarity, and the
        weighted ratings are summed per movie.

        Args:
            nearst: list of ``(user_name, similarity)`` tuples, as returned
                by ``k_nearst``.
            k: maximum number of recommendations to return.

        Returns:
            A list of ``(movie, score)`` tuples sorted by score, highest
            first.  The list is empty when *nearst* is empty or when the
            similarities sum to zero (the weights are undefined then).
        """
        recommend = {}
        total_distance = sum(item[1] for item in nearst)
        seen = self.usr_rating[self.usr]
        # A zero total would make every weight a division by zero.
        if total_distance != 0:
            for u_name, similarity in nearst:
                weight = similarity / total_distance
                for movie, rate in self.usr_rating[u_name].items():
                    if movie not in seen:
                        recommend[movie] = recommend.get(movie, 0) + rate * weight
        print(recommend)
        top_k = sorted(recommend.items(), key=lambda x: x[1], reverse=True)
        return top_k[:k]

    def run(self, k=5):
        """Print the *k* nearest neighbours and the top *k* recommendations."""
        nearst = self.k_nearst(k)
        print(nearst)
        top_k = self.recomend_k(nearst, k)
        for item in top_k:
            print("Recommend movies for you:"+item[0]+"\t Recommended Index:"+str(item[1]))
        
# Demo entry point: recommend movies for the user 'vanessa'.  Guarded so that
# importing this module does not trigger file I/O on a machine-specific path.
if __name__ == "__main__":
    path = '/home/ant2017/Downloads/Movie_Ratings.csv'
    r = recommender(path, 'vanessa')
    r.run()

Tags: Lambda

Posted on Sat, 27 Jun 2020 12:13:02 -0400 by Bleej