import gzip
import matplotlib.pyplot as plt
import numpy
import random
import scipy.sparse
import tensorflow as tf
from collections import defaultdict
from fastFM import als
from scipy.spatial import distance
Data is available at http://cseweb.ucsd.edu/~jmcauley/pml/data/. Download it and save it to your own directory.
dataDir = "/home/jmcauley/pml_data/"
Parse the Goodreads comic book data (excluding review text)
def parseData(fname):
    for l in gzip.open(fname):
        d = eval(l)
        del d['review_text'] # Discard the reviews to save memory
        d['year'] = int(d['date_added'][-4:]) # Use this for exercises
        yield d
data = list(parseData(dataDir + "goodreads_reviews_comics_graphic.json.gz"))
random.shuffle(data)
For example...
data[0]
{'book_id': '15799191', 'date_added': 'Sun Jun 30 06:12:24 -0700 2013', 'date_updated': 'Thu Jul 04 08:02:07 -0700 2013', 'n_comments': 0, 'n_votes': 0, 'rating': 4, 'read_at': 'Thu Jul 04 08:02:07 -0700 2013', 'review_id': '86b43a448463928ac5d7887b364e2fcd', 'started_at': 'Sun Jun 30 00:00:00 -0700 2013', 'user_id': '23852e2647c217deb24964aadb26be64', 'year': 2013}
Utility data structures. Most importantly, each user and item is mapped to an integer ID from 0 to nUsers-1 (respectively nItems-1)
userIDs,itemIDs = {},{}
for d in data:
    u,i = d['user_id'],d['book_id']
    if not u in userIDs: userIDs[u] = len(userIDs)
    if not i in itemIDs: itemIDs[i] = len(itemIDs)
nUsers,nItems = len(userIDs),len(itemIDs)
nUsers,nItems
(59347, 89311)
Build the factorization machine design matrix. Note that each instance is a row, and the columns encode both users and items. Other features could straightforwardly be added.
X = scipy.sparse.lil_matrix((len(data), nUsers + nItems))
for i in range(len(data)):
    user = userIDs[data[i]['user_id']]
    item = itemIDs[data[i]['book_id']]
    X[i,user] = 1 # One-hot encoding of user
    X[i,nUsers + item] = 1 # One-hot encoding of item
Target (rating) to predict for each row
y = numpy.array([d['rating'] for d in data])
Initialize the factorization machine
fm = als.FMRegression(n_iter=1000, init_stdev=0.1, rank=5, l2_reg_w=0.1, l2_reg_V=0.5)
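For intuition, the quantity the factorization machine fits for a feature vector x is w0 + sum_i w_i x_i + sum_{i<j} <V_i, V_j> x_i x_j. The sketch below is not part of the original notebook (fm_predict_dense is a hypothetical helper); it computes that prediction for a dense vector using the standard O(nK) rewriting of the pairwise term.
def fm_predict_dense(x, w0, w, V):
    # Hypothetical helper: second-order FM prediction for a dense vector x,
    # with linear weights w (length nFeatures) and factors V (nFeatures x rank)
    linear = w0 + x @ w              # bias + first-order terms
    Vx = x @ V                       # shape (rank,)
    sq = (x ** 2) @ (V ** 2)         # shape (rank,)
    pairwise = 0.5 * numpy.sum(Vx ** 2 - sq)
    return linear + pairwise
With the pure one-hot user/item encoding above, this reduces to w0 + w_u + w_i + <V_u, V_i>, i.e. a biased latent factor model. (After fitting, fastFM exposes the learned parameters as fm.w0_, fm.w_, and fm.V_; note that V_ may be stored as (rank, nFeatures), transposed relative to this sketch.)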
Split data into train and test portions
X_train,y_train = X[:400000],y[:400000]
X_test,y_test = X[400000:],y[400000:]
Train the model
fm.fit(X_train, y_train)
FMRegression(init_stdev=0.1, l2_reg=0, l2_reg_V=0.5, l2_reg_w=0.1, n_iter=1000, random_state=123, rank=5)
Extract predictions on the test set
y_pred = fm.predict(X_test)
y_pred[:10]
array([2.23866277, 4.37847526, 3.84866594, 5.03002627, 3.94993096, 4.36740166, 4.22497778, 4.30029268, 3.28282377, 4.05036905])
y_test[:10]
array([5, 4, 2, 5, 4, 5, 5, 5, 5, 5])
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)
MSE(y_pred, y_test)
1.5940755947213534
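For context (this comparison isn't in the original notebook), we can compute the MSE of a trivial baseline that always predicts the training-set mean rating:
alwaysPredictMean = numpy.mean(y_train)   # baseline prediction
MSE([alwaysPredictMean] * len(y_test), y_test)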
A simple example of adding extra features: a one-hot encoding of the review year (extracted in parseData above)
minYear = min([d['year'] for d in data])
maxYear = max([d['year'] for d in data])
nYears = maxYear - minYear + 1
minYear, maxYear, nYears
(2005, 2017, 13)
userIDs,itemIDs = {},{}
for d in data:
    u,i = d['user_id'],d['book_id']
    if not u in userIDs: userIDs[u] = len(userIDs)
    if not i in itemIDs: itemIDs[i] = len(itemIDs)
nUsers,nItems = len(userIDs),len(itemIDs)
X = scipy.sparse.lil_matrix((len(data), nUsers + nItems + nYears))
for i in range(len(data)):
    user = userIDs[data[i]['user_id']]
    item = itemIDs[data[i]['book_id']]
    year = data[i]['year'] - minYear
    X[i,user] = 1 # One-hot encoding of user
    X[i,nUsers + item] = 1 # One-hot encoding of item
    X[i,nUsers + nItems + year] = 1 # One-hot encoding of year
y = numpy.array([d['rating'] for d in data])
fm = als.FMRegression(n_iter=1000, init_stdev=0.1, rank=5, l2_reg_w=0.1, l2_reg_V=0.5)
X_train,y_train,data_train = X[:400000],y[:400000],data[:400000]
X_test,y_test,data_test = X[400000:],y[400000:],data[400000:]
fm.fit(X_train, y_train)
FMRegression(init_stdev=0.1, l2_reg=0, l2_reg_V=0.5, l2_reg_w=0.1, n_iter=1000, random_state=123, rank=5)
y_pred_with_features = fm.predict(X_test)
MSE(y_pred_with_features, y_test)
1.6068283763129323
Cold start plots. Count training instances per item (could also measure coldness per user if we had user features).
d
{'book_id': '31423340', 'date_added': 'Mon Sep 18 19:24:36 -0700 2017', 'date_updated': 'Thu Sep 21 04:54:01 -0700 2017', 'n_comments': 6, 'n_votes': 13, 'rating': 4, 'read_at': 'Thu Sep 21 15:03:21 -0700 2017', 'review_id': '4f39bbb5aeab832c794a3d2e40943949', 'started_at': 'Tue Sep 19 20:23:13 -0700 2017', 'user_id': '1d945500234cbc7a6138a4d017dbfe4b', 'year': 2017}
nTrainPerItem = defaultdict(int)
for d in data_train:
    nTrainPerItem[d['book_id']] += 1
rmsePerNtrainFeatures = defaultdict(list)
rmsePerNtrain = defaultdict(list)
for d,y,ypf,yp in zip(data_test,y_test,y_pred_with_features,y_pred):
    e2_features = (y-ypf)**2
    e2 = (y-yp)**2
    nt = nTrainPerItem[d['book_id']]
    if nt < 5:
        rmsePerNtrainFeatures[str(nt)].append(e2_features)
        rmsePerNtrain[str(nt)].append(e2)
    elif nt < 10:
        rmsePerNtrainFeatures['5-9'].append(e2_features)
        rmsePerNtrain['5-9'].append(e2)
    elif nt < 20:
        rmsePerNtrainFeatures['10-19'].append(e2_features)
        rmsePerNtrain['10-19'].append(e2)
    else:
        rmsePerNtrainFeatures['20+'].append(e2_features)
        rmsePerNtrain['20+'].append(e2)
for r in rmsePerNtrain:
    rmsePerNtrainFeatures[r] = sum(rmsePerNtrainFeatures[r]) / len(rmsePerNtrainFeatures[r])
    rmsePerNtrain[r] = sum(rmsePerNtrain[r]) / len(rmsePerNtrain[r])
rmsePerNtrain
defaultdict(list, {'0': 1.1034747614259885, '1': 1.286213343267511, '10-19': 1.8579978486101703, '2': 1.3303440529731148, '20+': 1.696268806936738, '3': 1.4912379121450905, '4': 1.4174955135661096, '5-9': 1.670982336713272})
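To judge how reliable each bucket's average is, we could also count how many test instances fall into each coldness bucket (a small addition, not in the original notebook):
bucketCounts = defaultdict(int)
for d in data_test:
    nt = nTrainPerItem[d['book_id']]
    if nt < 5: bucketCounts[str(nt)] += 1
    elif nt < 10: bucketCounts['5-9'] += 1
    elif nt < 20: bucketCounts['10-19'] += 1
    else: bucketCounts['20+'] += 1
bucketCounts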
Xlab = ['0','1','2','3','4','5-9','10-19','20+']
X = [0,1,2,3,4,5.5,7,8.5] # x-axis positions for the buckets above
YF = [rmsePerNtrainFeatures[x] for x in Xlab]
Y = [rmsePerNtrain[x] for x in Xlab]
plt.xlim(0, max(X))
plt.plot(X,YF,color='k',label='with features')
plt.plot(X,Y,color='grey',linestyle='--',label='without features')
plt.xticks(X,Xlab)
plt.xlabel("Number of training ratings per item")
plt.ylabel("MSE for such items")
plt.title("Performance versus item 'coldness'")
plt.legend(loc="best")
plt.show()
Read social (trust) data from Epinions
userIDs = {}
itemIDs = {}
interactions = []
socialTrust = defaultdict(set)
f = open(dataDir + "epinions_data/epinions.txt", 'rb')
header = f.readline()
for l in f:
    try:
        l = l.decode('utf-8')
        l = l.split()
    except Exception as e:
        continue
    i = l[0]
    u = l[1]
    if not u in userIDs: userIDs[u] = len(userIDs)
    if not i in itemIDs: itemIDs[i] = len(itemIDs)
    interactions.append((u,i))
f.close()
f = open(dataDir + "epinions_data/network_trust.txt", 'r')
for l in f:
    try:
        u,_,v = l.strip().split()
    except Exception as e:
        continue
    if u in userIDs and v in userIDs:
        socialTrust[u].add(v)
f.close()
random.shuffle(interactions)
items = list(itemIDs.keys())
BPR model. First we'll train a regular BPR model, just to assess how similar the latent representations of items consumed by trusted friends are. Later we can implement different social sampling assumptions simply by passing different samples to the same model.
class BPRbatch(tf.keras.Model):
    def __init__(self, K, lamb):
        super(BPRbatch, self).__init__()
        # Initialize variables
        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)],stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal([len(userIDs),K],stddev=0.001))
        self.gammaI = tf.Variable(tf.random.normal([len(itemIDs),K],stddev=0.001))
        # Regularization coefficient
        self.lamb = lamb

    # Prediction for a single instance
    def predict(self, u, i):
        p = self.betaI[i] + tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return p

    # Regularizer
    def reg(self):
        return self.lamb * (tf.nn.l2_loss(self.betaI) +
                            tf.nn.l2_loss(self.gammaU) +
                            tf.nn.l2_loss(self.gammaI))

    def score(self, sampleU, sampleI):
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        x_ui = beta_i + tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
        return x_ui

    def call(self, sampleU, sampleI, sampleJ):
        x_ui = self.score(sampleU, sampleI)
        x_uj = self.score(sampleU, sampleJ)
        return -tf.reduce_mean(tf.math.log(tf.math.sigmoid(x_ui - x_uj)))
def trainingStepBPR(model, interactions):
    Nsamples = 50000
    with tf.GradientTape() as tape:
        sampleU, sampleI, sampleJ = [], [], []
        for _ in range(Nsamples):
            u,i = random.choice(interactions) # positive sample
            j = random.choice(items) # negative sample
            while j in itemsPerUser[u]:
                j = random.choice(items)
            sampleU.append(userIDs[u])
            sampleI.append(itemIDs[i])
            sampleJ.append(itemIDs[j])
        loss = model(sampleU,sampleI,sampleJ)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()
First, train a regular BPR model (no social terms)
optimizer = tf.keras.optimizers.Adam(0.1)
modelBPR = BPRbatch(10, 0.00001)
nTrain = int(len(interactions) * 0.9)
nTest = len(interactions) - nTrain
interactionsTrain = interactions[:nTrain]
interactionsTest = interactions[nTrain:]
itemsPerUser = defaultdict(list)
usersPerItem = defaultdict(list)
for u,i in interactionsTrain:
    itemsPerUser[u].append(i)
    usersPerItem[i].append(u)
for i in range(100):
    obj = trainingStepBPR(modelBPR, interactions)
    if (i % 10 == 9): print("iteration " + str(i+1) + ", objective = " + str(obj))
iteration 10, objective = 0.5586551
iteration 20, objective = 0.5336623
iteration 30, objective = 0.5308751
iteration 40, objective = 0.53656584
iteration 50, objective = 0.5435766
iteration 60, objective = 0.543326
iteration 70, objective = 0.54365164
iteration 80, objective = 0.54245454
iteration 90, objective = 0.5428503
iteration 100, objective = 0.5451498
interactionsTestPerUser = defaultdict(set)
itemSet = set()
for u,i in interactionsTest:
    interactionsTestPerUser[u].add(i)
    itemSet.add(i)
def AUCu(model, u, N):
    win = 0
    if N > len(interactionsTestPerUser[u]):
        N = len(interactionsTestPerUser[u])
    positive = random.sample(interactionsTestPerUser[u],N)
    negative = random.sample(itemSet.difference(interactionsTestPerUser[u]),N)
    for i,j in zip(positive,negative):
        si = model.predict(userIDs[u], itemIDs[i]).numpy()
        sj = model.predict(userIDs[u], itemIDs[j]).numpy()
        if si > sj:
            win += 1
    return win/N
def AUC(model):
    av = []
    for u in interactionsTestPerUser:
        av.append(AUCu(model, u, 10))
    return sum(av) / len(av)
AUC(modelBPR)
0.6800902434544706
Compute similarities between items' latent representations: pairs where the second item was consumed by a trusted friend, versus randomly chosen pairs
sims = []
simFriends = []
while len(sims) < 10000:
    try:
        u,i = random.choice(interactions)
        v = random.sample(socialTrust[u],1)[0] # trust link
        j = random.sample(itemsPerUser[v],1)[0] # friend's item
        k = random.choice(items) # random item
    except Exception as e:
        continue
    s1 = 1 - distance.cosine(modelBPR.gammaI[itemIDs[i]],modelBPR.gammaI[itemIDs[k]])
    s2 = 1 - distance.cosine(modelBPR.gammaI[itemIDs[i]],modelBPR.gammaI[itemIDs[j]])
    if s1 > 1: # sanity check: cosine similarity should never exceed 1
        print("?")
        break
    sims.append(s1)
    simFriends.append(s2)
Similarity between randomly chosen pairs of items
sum(sims)/len(sims)
0.003534959910223597
Similarity between an item and one consumed by a friend
sum(simFriends)/len(simFriends)
0.04061316501272158
(similarity is not particularly high, but still significantly higher than random pairs)
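As a quick check on that claim (not part of the original notebook), a two-sample test over the two similarity lists could be run with scipy:
from scipy import stats
stats.ttest_ind(simFriends, sims, equal_var=False)   # Welch's t-test: friend pairs vs. random pairs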
Implement the social model. It reuses the model class above, just with different samples: for each user we take a positive item i, an item j consumed by a trusted friend (but not by the user), and a random negative item k, and train the model to rank i above j and j above k.
def trainingStepBPRsocial(model, interactions):
    Nsamples = 50000
    with tf.GradientTape() as tape:
        sampleU, sampleI, sampleJ = [], [], []
        while len(sampleU) < Nsamples/2:
            try:
                u,i = random.choice(interactions) # positive sample
                v = random.sample(socialTrust[u],1)[0] # trust link
                j = random.sample(itemsPerUser[v],1)[0] # friend's item
                k = random.choice(items) # negative item
                if j in itemsPerUser[u] or k in itemsPerUser[u]:
                    continue
            except Exception as e:
                continue
            sampleU.append(userIDs[u])
            sampleI.append(itemIDs[i]) # positive item...
            sampleJ.append(itemIDs[j]) # ...should rank above the friend's item
            sampleU.append(userIDs[u])
            sampleI.append(itemIDs[j]) # friend's item...
            sampleJ.append(itemIDs[k]) # ...should rank above the random negative
        loss = model(sampleU,sampleI,sampleJ)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()
optimizer = tf.keras.optimizers.Adam(0.1)
modelBPRsocial = BPRbatch(10, 0.00001)
for i in range(100):
    obj = trainingStepBPRsocial(modelBPRsocial, interactions)
    if (i % 10 == 9): print("iteration " + str(i+1) + ", objective = " + str(obj))
iteration 10, objective = 0.6443011
iteration 20, objective = 0.618395
iteration 30, objective = 0.6267633
iteration 40, objective = 0.6299376
iteration 50, objective = 0.6279181
iteration 60, objective = 0.6186671
iteration 70, objective = 0.6207659
iteration 80, objective = 0.6173624
iteration 90, objective = 0.61636496
iteration 100, objective = 0.60944057
AUC(modelBPRsocial)
0.6322810869785714