Similarity-based recommendation

In [1]:
import gzip
from collections import defaultdict
import scipy
import scipy.optimize
import numpy
import random
In [2]:
# From https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Musical_Instruments_v1_00.tsv.gz
path = "C://Users/Julian McAuley/Documents/class_files/amazon_reviews_us_Musical_Instruments_v1_00.tsv.gz"
In [3]:
f = gzip.open(path, 'rt', encoding="utf8")
In [4]:
header = f.readline()
header = header.strip().split('\t')
In [5]:
dataset = []
In [6]:
for line in f:
    fields = line.strip().split('\t')
    d = dict(zip(header, fields))
    d['star_rating'] = int(d['star_rating'])
    d['helpful_votes'] = int(d['helpful_votes'])
    d['total_votes'] = int(d['total_votes'])
    dataset.append(d)
In [7]:
dataset[0]
Out[7]:
{'marketplace': 'US',
 'customer_id': '45610553',
 'review_id': 'RMDCHWD0Y5OZ9',
 'product_id': 'B00HH62VB6',
 'product_parent': '618218723',
 'product_title': 'AGPtek® 10 Isolated Output 9V 12V 18V Guitar Pedal Board Power Supply Effect Pedals with Isolated Short Cricuit / Overcurrent Protection',
 'product_category': 'Musical Instruments',
 'star_rating': 3,
 'helpful_votes': 0,
 'total_votes': 1,
 'vine': 'N',
 'verified_purchase': 'N',
 'review_headline': 'Three Stars',
 'review_body': 'Works very good, but induces ALOT of noise.',
 'review_date': '2015-08-31'}
In [8]:
len(dataset)
Out[8]:
904765

First we'll build a few useful data structures, in this case just to maintain a collection of the items reviewed by each user, and the collection of users who have reviewed each item.

In [9]:
usersPerItem = defaultdict(set)
itemsPerUser = defaultdict(set)
In [10]:
itemNames = {}
In [11]:
for d in dataset:
    user,item = d['customer_id'], d['product_id']
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)
    itemNames[item] = d['product_title']
In [12]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    return numer / denom
In [13]:
def mostSimilar(i):
    similarities = []
    users = usersPerItem[i]
    for i2 in usersPerItem: # For all items
        if i == i2: continue # other than the query
        sim = Jaccard(users, usersPerItem[i2])
        similarities.append((sim,i2))
    similarities.sort(reverse=True)
    return similarities[:10]

Generating a recommendation

In [14]:
dataset[2]
Out[14]:
{'marketplace': 'US',
 'customer_id': '6111003',
 'review_id': 'RIZR67JKUDBI0',
 'product_id': 'B0006VMBHI',
 'product_parent': '603261968',
 'product_title': 'AudioQuest LP record clean brush',
 'product_category': 'Musical Instruments',
 'star_rating': 3,
 'helpful_votes': 0,
 'total_votes': 1,
 'vine': 'N',
 'verified_purchase': 'Y',
 'review_headline': 'Three Stars',
 'review_body': 'removes dust. does not clean',
 'review_date': '2015-08-31'}
In [15]:
query = dataset[1]['product_id']
In [16]:
mostSimilar(query)
Out[16]:
[(0.005194805194805195, 'B000GBAR5G'),
 (0.004842615012106538, 'B00181T20O'),
 (0.004024144869215292, 'B0002E3FCO'),
 (0.0035714285714285713, 'B00JQYUAQU'),
 (0.0035714285714285713, 'B00IPQZWAY'),
 (0.0035714285714285713, 'B00GT01VQW'),
 (0.0035714285714285713, 'B00DZVYDEK'),
 (0.0035714285714285713, 'B005FM2DJE'),
 (0.0035714285714285713, 'B004XNRFIU'),
 (0.0035714285714285713, 'B00439I880')]
In [17]:
itemNames[query]
Out[17]:
'Sennheiser HD203 Closed-Back DJ Headphones'
In [18]:
[itemNames[x[1]] for x in mostSimilar(query)]
Out[18]:
['Dean Vendetta XM Electric Guitar,',
 'BEHRINGER MS16',
 'CAD Audio EPF-15A Pop Filter on 6-inch Gooseneck',
 'Reliable Hardware Company RH-RMSET-25-A 25 Sets of Rack Rail Screws and Washers',
 'Gold Les Paul Style Guitar Pickguard Bracket',
 'Washburn PXL10QWBM Parallaxe PXL Series Solid-Body Electric Guitar, Quilt Wine Burst Matte Finish',
 'GP Percussion B2 6" / 7" Tunable Hickory Bongos PRO Series, Bottom Tuning, Dark Brown with Matching 11021B GP Percussion Double Braced Stand and TMS Polishing Cloth',
 'Squier by Fender "Stop Dreaming, Start Playing" Set: Affinity J Bass with Rumble 15 Amp with Gear Guardian Extended Warranty - Metallic Blue',
 'NEW PRO QUALITY FENDER STARCASTER STRAT ELECTRIC GUITAR',
 'Gon Bops Bongo Knee Pads (Standard)']

Efficient similarity-based recommendation

In [19]:
def mostSimilarFast(i):
    similarities = []
    users = usersPerItem[i]
    candidateItems = set()
    for u in users:
        candidateItems = candidateItems.union(itemsPerUser[u])
    for i2 in candidateItems:
        if i2 == i: continue
        sim = Jaccard(users, usersPerItem[i2])
        similarities.append((sim,i2))
    similarities.sort(reverse=True)
    return similarities[:10]
In [20]:
mostSimilarFast(query)
Out[20]:
[(0.005194805194805195, 'B000GBAR5G'),
 (0.004842615012106538, 'B00181T20O'),
 (0.004024144869215292, 'B0002E3FCO'),
 (0.0035714285714285713, 'B00JQYUAQU'),
 (0.0035714285714285713, 'B00IPQZWAY'),
 (0.0035714285714285713, 'B00GT01VQW'),
 (0.0035714285714285713, 'B00DZVYDEK'),
 (0.0035714285714285713, 'B005FM2DJE'),
 (0.0035714285714285713, 'B004XNRFIU'),
 (0.0035714285714285713, 'B00439I880')]

Collaborative-filtering-based rating estimation

In [21]:
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)
In [22]:
for d in dataset:
    user,item = d['customer_id'], d['product_id']
    reviewsPerUser[user].append(d)
    reviewsPerItem[item].append(d)
In [23]:
ratingMean = sum([d['star_rating'] for d in dataset]) / len(dataset)
In [24]:
ratingMean
Out[24]:
4.251102772543146

Our prediction function computes (a) a list of the user's previous ratings (ignoring the query item); and (b) a list of the similarities of these previous items, compared to the query. These weights are used to constructed a weighted average of the ratings from the first set.

In [25]:
def predictRating(user,item):
    ratings = []
    similarities = []
    for d in reviewsPerUser[user]:
        i2 = d['product_id']
        if i2 == item: continue
        ratings.append(d['star_rating'])
        similarities.append(Jaccard(usersPerItem[item],usersPerItem[i2]))
    if (sum(similarities) > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        return sum(weightedRatings) / sum(similarities)
    else:
        # User hasn't rated any similar items
        return ratingMean

Let's try a simple example:

In [26]:
dataset[1]
Out[26]:
{'marketplace': 'US',
 'customer_id': '14640079',
 'review_id': 'RZSL0BALIYUNU',
 'product_id': 'B003LRN53I',
 'product_parent': '986692292',
 'product_title': 'Sennheiser HD203 Closed-Back DJ Headphones',
 'product_category': 'Musical Instruments',
 'star_rating': 5,
 'helpful_votes': 0,
 'total_votes': 0,
 'vine': 'N',
 'verified_purchase': 'Y',
 'review_headline': 'Five Stars',
 'review_body': 'Nice headphones at a reasonable price.',
 'review_date': '2015-08-31'}
In [27]:
u,i = dataset[1]['customer_id'], dataset[1]['product_id']
In [28]:
predictRating(u, i)
Out[28]:
5.0
In [29]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)
In [30]:
alwaysPredictMean = [ratingMean for d in dataset]
In [31]:
cfPredictions = [predictRating(d['customer_id'], d['product_id']) for d in dataset]
In [32]:
labels = [d['star_rating'] for d in dataset]
In [33]:
MSE(alwaysPredictMean, labels)
Out[33]:
1.4796142779564334
In [34]:
MSE(cfPredictions, labels)
Out[34]:
1.6146130004291603

In this case, the accuracy of our rating prediction model was actually worse (in terms of the MSE) than just predicting the mean rating. However note again that this is just a heuristic, and could be modified to improve its predictions (e.g. by using a different similarity function other than the Jaccard similarity).