import ast
import gzip
import math
import random
from collections import defaultdict

import sklearn
from sklearn import linear_model
def parse(f):
    """Yield one record per line of the gzipped file *f*.

    Each line is expected to be a dict literal (the Goodreads dumps store
    one record per line).  ``ast.literal_eval`` replaces the original
    ``eval()``: it accepts exactly the same literal syntax but cannot
    execute arbitrary code from the (downloaded, untrusted) file.
    """
    for line in gzip.open(f):
        yield ast.literal_eval(line.decode('utf-8'))
# Download data from:
# https://drive.google.com/uc?id=1V4MLeoEiPQdocCbUHjR_7L9ZmxTufPFe
dataset = list(parse("goodreads_reviews_comics_graphic.json.gz"))
# The two lines below were pasted notebook OUTPUT (dead expressions in a
# script); kept as comments for reference:
# len(dataset) -> 542338
# dataset[1]   -> {'user_id': 'bafc2d50014200cda7cb2b6acd60cd73',
#                  'book_id': '6315584', 'rating': 4, 'review_text': "...",
#                  'date_added': 'Wed Aug 10 06:06:48 -0700 2016', ...,
#                  'n_votes': 0, 'n_comments': 0}
def feature(d):
    """Feature vector for one review: [bias, rating, n_comments, day one-hot].

    The day-of-week one-hot drops Monday (index 0) so the encoding is not
    collinear with the bias term.  Raises KeyError if 'date_added' does not
    begin with a known three-letter day abbreviation.
    """
    day_index = {"Mon": 0, "Tue": 1, "Wed": 2, "Thu": 3,
                 "Fri": 4, "Sat": 5, "Sun": 6}
    hot = day_index[d['date_added'][:3]]   # KeyError on unexpected prefix
    day_one_hot = [1 if j == hot else 0 for j in range(7)]
    return [1, d['rating'], d['n_comments']] + day_one_hot[1:]
# Predict review length (in characters) from the hand-built features.
X = [feature(d) for d in dataset]
y = [len(d['review_text']) for d in dataset]
model = sklearn.linear_model.LinearRegression()
model.fit(X, y)
# (pasted notebook output)
# -> LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
yPred = model.predict(X)
def MSE(predictions, labels):
    """Mean squared error between two equal-length sequences."""
    squared_errors = [(p - t) ** 2 for p, t in zip(predictions, labels)]
    return sum(squared_errors) / len(squared_errors)
mse = MSE(yPred, y)
# mse -> 624989.9720071978  (pasted notebook output)
# Find the 10th/90th percentile of the label distribution so that the
# extreme tails can be discarded below.
ysort = y[:]
ysort.sort()
ymin = ysort[len(ysort)//10]      # ~10th percentile
ymax = ysort[-len(ysort)//10]     # ~90th percentile
# ymin, ymax -> (37, 1252)  (pasted notebook output)
# Variant (a): drop instances whose label falls outside [ymin, ymax].
X1a, y1a = [], []
for x_, y_ in zip(X, y):
    if ymin <= y_ <= ymax:
        X1a.append(x_)
        y1a.append(y_)
# len(X1a), len(y1a) -> (435083, 435083)  (pasted notebook output)
model.fit(X1a, y1a)
# -> LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
yPred1a = model.predict(X1a)
MSE(yPred1a, y1a)
# -> 84193.77172576706  (pasted notebook output)
# Advantages: model may conform better to model assumptions (e.g. normality).
# MSE won't be dominated by samples from the tail of the distribution
# Disadvantages: doesn't give us a practical means of prediction for outlying instances
# Variant (b): regress on log(length + 1) and invert with exp() - 1.
y1b = [math.log(y_ + 1) for y_ in y]
model.fit(X, y1b)
# -> LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
yPred1b = model.predict(X)
yPred1b = [math.exp(y_) - 1 for y_ in yPred1b]
MSE(yPred1b, y)
# -> 7.680529824252287e+44  (pasted notebook output)
# why so inaccurate?
# NOTE(review): presumably the exp() back-transform amplifies errors — a few
# instances with extreme feature values (e.g. a huge n_comments) can yield a
# large prediction in log space, which exp() blows up to astronomical values
# that dominate the MSE.  TODO: confirm by inspecting max(yPred1b).
# Advantages: May better correspond to model assumptions
# Disadvantages: Requires careful hand-tuning of transformation to satisfy model assumptions
# Variant (c): turn the task into binary classification (above/below median).
median = ysort[len(y)//2]
y1c = [y_ > median for y_ in y]
model = sklearn.linear_model.LogisticRegression()
model.fit(X, y1c)
# (pasted notebook output — sklearn <0.22 warns that the default solver will
#  change to 'lbfgs'; pass solver='lbfgs' explicitly to silence it)
# -> LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
#      intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=None,
#      penalty='l2', random_state=None, solver='warn', tol=0.0001, verbose=0,
#      warm_start=False)
yPred1c = model.predict(X)
sum(yPred1c == y1c) / len(y)
# -> 0.5230704837204843  (classification accuracy; pasted notebook output)
# Advantages: Avoids the modeling assumptions of regression problems altogether
# Disadvantages: Very "coarse-grained", may not match our original goals
# Follow tensorflow code from: https://cseweb.ucsd.edu/~jmcauley/pml/code/chap3.html;
# Replace the MSE with sum of absolute values
# Note: You can insert an image (e.g. containing a hand-written solution) via edit->insert image
# Code based on http://cseweb.ucsd.edu/classes/fa20/cse258-a/code/workbook4.html
# Utility data structures
reviewsPerUser = defaultdict(list)   # user_id -> list of review dicts
reviewsPerItem = defaultdict(list)   # book_id -> list of review dicts
usersPerItem = defaultdict(set)      # U_i from class slides
itemsPerUser = defaultdict(set)      # I_u from class slides
for d in dataset:
    user, item = d['user_id'], d['book_id']
    reviewsPerUser[user].append(d)
    reviewsPerItem[item].append(d)
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)
# Global mean rating; used as the fallback prediction in all CF variants.
ratingMean = sum(d['rating'] for d in dataset) / len(dataset)
# ratingMean -> 3.778138356523054  (pasted notebook output)
def Jaccard(s1, s2):
    """Jaccard similarity |s1 ∩ s2| / |s1 ∪ s2| between two sets.

    Returns 0.0 when both sets are empty (the original raised
    ZeroDivisionError in that case).
    """
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    return numer / denom if denom else 0.0
# This function should be re-defined for each of your model variants
def predictRating(user, item):
    """Item-based CF prediction: the Jaccard-weighted mean of *user*'s
    ratings of items other than *item*, where each weight is the Jaccard
    similarity between the candidate item's and the rated item's user sets.
    Falls back to the global mean when no similar item has been rated.
    """
    weighted_sum = 0
    sim_total = 0
    for review in reviewsPerUser[user]:
        other = review['book_id']
        if other == item:
            continue
        sim = Jaccard(usersPerItem[item], usersPerItem[other])
        weighted_sum += review['rating'] * sim
        sim_total += sim
    if sim_total > 0:
        return weighted_sum / sim_total
    # User hasn't rated any similar items
    return ratingMean
# Example use:
# dataset[1] -> {'user_id': 'bafc2d50014200cda7cb2b6acd60cd73',
#                'book_id': '6315584', 'rating': 4, ...}  (pasted output)
u, i = dataset[1]['user_id'], dataset[1]['book_id']
predictRating(u, i)
# -> 4.44493246042927  (pasted notebook output)
# Evaluate on a random sample rather than the full dataset for speed.
sample = random.sample(dataset, 1000)
sampleLabels = [d['rating'] for d in sample]
# Baseline prediction
alwaysPredictMean = [ratingMean for d in sample]
# Prediction using item-to-item similarity above
cfPredictions = [predictRating(d['user_id'], d['book_id']) for d in sample]
# Baseline accuracy
MSE(alwaysPredictMean, sampleLabels)
# -> 1.2941597424294882  (pasted notebook output)
# Item-to-item similarity accuracy
MSE(cfPredictions, sampleLabels)
# -> 1.0782389104426735  (pasted notebook output)
# (user_id, book_id) -> rating lookup; duplicates resolve to the last review.
ratingDict = {(d['user_id'], d['book_id']): d['rating'] for d in dataset}
# Per-user and per-item mean ratings, used for mean-centering below.
userAverages = {u: sum(ratingDict[(u, i)] for i in items) / len(items)
                for u, items in itemsPerUser.items()}
itemAverages = {i: sum(ratingDict[(u, i)] for u in users) / len(users)
                for i, users in usersPerItem.items()}
def predictRating2(user, item):
    """Mean-centered item-based CF: predicts itemAverages[item] plus a
    Jaccard-weighted average of *user*'s rating deviations from each rated
    item's own mean.  Falls back to the global mean when no similar item
    has been rated.
    """
    weighted_sum = 0
    sim_total = 0
    for review in reviewsPerUser[user]:
        other = review['book_id']
        if other == item:
            continue
        sim = Jaccard(usersPerItem[item], usersPerItem[other])
        weighted_sum += (review['rating'] - itemAverages[other]) * sim
        sim_total += sim
    if sim_total > 0:
        return itemAverages[item] + weighted_sum / sim_total
    # User hasn't rated any similar items
    return ratingMean
# Mean-centering improves noticeably over the plain weighted average.
cfPredictions = [predictRating2(d['user_id'], d['book_id']) for d in sample]
MSE(cfPredictions, sampleLabels)
# -> 0.7991615857906483  (pasted notebook output)
def Cosine(s1, s2):
# Not a proper implementation, operates on sets so correct for interactions only
numer = len(s1.intersection(s2))
denom = math.sqrt(len(s1)) * math.sqrt(len(s2))
return numer / denom
def predictRating3(user, item):
    """Variant of predictRating2 that weights rating deviations by the
    set-based Cosine similarity instead of Jaccard.  Falls back to the
    global mean when no similar item has been rated.
    """
    weighted_sum = 0
    sim_total = 0
    for review in reviewsPerUser[user]:
        other = review['book_id']
        if other == item:
            continue
        sim = Cosine(usersPerItem[item], usersPerItem[other])
        weighted_sum += (review['rating'] - itemAverages[other]) * sim
        sim_total += sim
    if sim_total > 0:
        return itemAverages[item] + weighted_sum / sim_total
    # User hasn't rated any similar items
    return ratingMean
# Cosine performs about the same as Jaccard here.
cfPredictions = [predictRating3(d['user_id'], d['book_id']) for d in sample]
MSE(cfPredictions, sampleLabels)
# -> 0.8046933773938549  (pasted notebook output)
def predictRating4(user, item):
    """User-based CF: Jaccard(user, other-user)-weighted mean of the other
    users' ratings of *item*.  Falls back to the global mean when no
    similar user has rated the item.
    """
    weighted_sum = 0
    sim_total = 0
    for review in reviewsPerItem[item]:
        other = review['user_id']
        if other == user:
            continue
        sim = Jaccard(itemsPerUser[user], itemsPerUser[other])
        weighted_sum += review['rating'] * sim
        sim_total += sim
    if sim_total > 0:
        return weighted_sum / sim_total
    return ratingMean
# User-based CF barely beats the always-predict-mean baseline on this data.
cfPredictions = [predictRating4(d['user_id'], d['book_id']) for d in sample]
MSE(cfPredictions, sampleLabels)
# -> 1.2918759089186271  (pasted notebook output)