import datetime
import dateutil.parser
import gzip
import numpy
import matplotlib.pyplot as plt
import random
import scipy.sparse
import sklearn
import tensorflow as tf
import time
from collections import defaultdict
from fastFM import als
from sklearn import linear_model
Data is available at http://cseweb.ucsd.edu/~jmcauley/pml/data/. Download and save to your own directory.
dataDir = "/home/jmcauley/pml_data/"
Example based on Bay-Area bike-share data. Extract the time information from the events.
events = {}
for f in [dataDir + "201408_trip_data.csv", dataDir + "201402_trip_data.csv"]:
    f = open(f, 'r')
    l = f.readline() # Skip the header line
    for l in f:
        l = l.split(',')
        tripID = l[0]
        timeString = l[2]
        timeUnix =\
            time.mktime(datetime.datetime.strptime(timeString, "%m/%d/%Y %H:%M").timetuple())
        events[tripID] = [timeUnix, timeString]
Find the earliest event (so that we can sort events from the first to the last hour)
earliest = None
for event in events:
    if earliest is None or events[event][0] < earliest[0]:
        earliest = events[event]
earliestTime = earliest[0]
Count events by hour
hourly = defaultdict(int)
for event in events:
    t = events[event][0]
    hour = int(t - earliestTime) // (60*60)
    hourly[hour] += 1
Autoregressive feature representation. Here we don't include a bias term, though we could easily include one.
def feature(hour):
    previousHours = []
    # Features for the last 5 hours, one day ago, one week ago, and one year ago
    for i in [1,2,3,4,5,24,24*7,24*365]:
        previousHour = hour - i
        previousHourExists = previousHour in hourly
        if previousHourExists:
            previousHours += [hourly[previousHour]]
        else:
            previousHours += [0]
    return previousHours
X = [feature(x) for x in hourly]
y = [hourly[x] for x in hourly]
model = sklearn.linear_model.LinearRegression(fit_intercept=False)
model.fit(X, y)
theta = model.coef_
The observation one week ago is the most predictive, followed by the observation from the previous hour:
theta
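To make this easier to read off, a small optional check (not part of the original notebook) pairs each learned coefficient with the lag it corresponds to, mirroring the lag list used in feature:
# Pair each lag (in hours) with its learned coefficient; the largest
# coefficients should correspond to the one-week and one-hour lags.
lags = [1,2,3,4,5,24,24*7,24*365] # Same lag list as in feature()
for lag, coef in zip(lags, theta):
    print(str(lag) + " hours ago: " + str(coef))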
Parse ratings and timestamps from (a small fraction of) Goodreads fantasy novel data
ratingsTime = []
z = gzip.open(dataDir + "goodreads_reviews_fantasy_paranormal.json.gz")
for l in z:
    d = eval(l)
    t = dateutil.parser.parse(d['date_updated'])
    ratingsTime.append((t,d['rating']))
    if len(ratingsTime) >= 50000:
        break
Sort observations by time
ratingsTime.sort()
len(ratingsTime)
Keep track of a window (wSize) of ratings and timestamps (the raw timestamps are just used to label the plot)
wSize = 1000
x = [r[0] for r in ratingsTime] # as raw times
y = [r[1] for r in ratingsTime] # ratings
xu = [time.mktime(d.timetuple()) for d in x] # as unix times
Use a dynamic-programming approach to build the sliding window
xSum = sum(xu[:wSize])
ySum = sum(y[:wSize])
sliding = []
for i in range(wSize,len(x)-1):
    xSum += xu[i] - xu[i-wSize]
    ySum += y[i] - y[i-wSize]
    sliding.append((xSum*1.0/wSize,ySum*1.0/wSize))
X and Y coordinates for plotting
X = [a[0] for a in sliding]
Y = [a[1] for a in sliding]
plt.plot(X[::1000],Y[::1000], label=str(wSize) + " rating window", color='k')
plt.xticks([X[600], X[-350]], [x[wSize+600].strftime("%d/%m/%Y"), x[-350].strftime("%d/%m/%Y")])
plt.xlim(X[0], X[-1])
plt.ylabel("Rating")
plt.xlabel("Date")
plt.legend(loc="best",fontsize=8)
plt.title("Ratings over time (sliding windows)")
plt.show()
def parse(path):
    g = gzip.open(path, 'r')
    for l in g:
        yield eval(l)
Extract the interaction data, including the timestamps associated with each interaction
userIDs = {}
itemIDs = {}
interactions = []
interactionsPerUser = defaultdict(list)
for d in parse(dataDir + "goodreads_reviews_comics_graphic.json.gz"):
    u = d['user_id']
    i = d['book_id']
    t = d['date_added']
    r = d['rating']
    dt = dateutil.parser.parse(t)
    t = int(dt.timestamp())
    if u not in userIDs: userIDs[u] = len(userIDs)
    if i not in itemIDs: itemIDs[i] = len(itemIDs)
    interactions.append((t,u,i,r))
    interactionsPerUser[u].append((t,i,r))
Interaction with timestamp
interactions[0]
len(interactions)
Sort interactions by time (including interaction sequences for each user). Useful when building data structures that include adjacent pairs of interactions (but consider whether this is desirable if making train/test splits!).
interactions.sort()
itemIDs['dummy'] = len(itemIDs)
Build a data structure including users, items, and their previous items
interactionsWithPrevious = []
for u in interactionsPerUser:
    interactionsPerUser[u].sort()
    lastItem = 'dummy'
    for (t,i,r) in interactionsPerUser[u]:
        interactionsWithPrevious.append((t,u,i,lastItem,r))
        lastItem = i
itemsPerUser = defaultdict(set)
for _,u,i,_ in interactions:
    itemsPerUser[u].add(i)
items = list(itemIDs.keys())
Define the TensorFlow model. Similar to the models from Chapter 5, with the addition of a term associated with the previous interaction.
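Concretely, in the notation of the class below, the compatibility score FPMC assigns to user u choosing item i after previously consuming item j is x(u,i,j) = betaI[i] + gammaUI[u]·gammaIU[i] + gammaIJ[i]·gammaJI[j], and training minimizes a BPR-style objective, -log sigmoid(x(u,i,j) - x(u,k,j)), averaged over sampled negative items k.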
optimizer = tf.keras.optimizers.Adam(0.1)
FPMC class. UI and IJ are given as initialization options, allowing us to exclude certain terms (for later exercises).
class FPMC(tf.keras.Model):
    def __init__(self, K, lamb, UI = 1, IJ = 1):
        super(FPMC, self).__init__()
        # Initialize variables
        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)],stddev=0.001))
        self.gammaUI = tf.Variable(tf.random.normal([len(userIDs),K],stddev=0.001))
        self.gammaIU = tf.Variable(tf.random.normal([len(itemIDs),K],stddev=0.001))
        self.gammaIJ = tf.Variable(tf.random.normal([len(itemIDs),K],stddev=0.001))
        self.gammaJI = tf.Variable(tf.random.normal([len(itemIDs),K],stddev=0.001))
        # Regularization coefficient
        self.lamb = lamb
        # Which terms to include
        self.UI = UI
        self.IJ = IJ

    # Prediction for a single instance
    def predict(self, u, i, j):
        p = self.betaI[i] + self.UI * tf.tensordot(self.gammaUI[u], self.gammaIU[i], 1) +\
                            self.IJ * tf.tensordot(self.gammaIJ[i], self.gammaJI[j], 1)
        return p

    # Regularizer
    def reg(self):
        return self.lamb * (tf.nn.l2_loss(self.betaI) +\
                            tf.nn.l2_loss(self.gammaUI) +\
                            tf.nn.l2_loss(self.gammaIU) +\
                            tf.nn.l2_loss(self.gammaIJ) +\
                            tf.nn.l2_loss(self.gammaJI))

    def call(self, sampleU, # user
                   sampleI, # item
                   sampleJ, # previous item
                   sampleK): # negative item
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        j = tf.convert_to_tensor(sampleJ, dtype=tf.int32)
        k = tf.convert_to_tensor(sampleK, dtype=tf.int32)
        gamma_ui = tf.nn.embedding_lookup(self.gammaUI, u)
        gamma_iu = tf.nn.embedding_lookup(self.gammaIU, i)
        gamma_ij = tf.nn.embedding_lookup(self.gammaIJ, i)
        gamma_ji = tf.nn.embedding_lookup(self.gammaJI, j)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        # Score of the positive item i, given user u and previous item j
        x_uij = beta_i + self.UI * tf.reduce_sum(tf.multiply(gamma_ui, gamma_iu), 1) +\
                         self.IJ * tf.reduce_sum(tf.multiply(gamma_ij, gamma_ji), 1)
        gamma_uk = tf.nn.embedding_lookup(self.gammaUI, u)
        gamma_ku = tf.nn.embedding_lookup(self.gammaIU, k)
        gamma_kj = tf.nn.embedding_lookup(self.gammaIJ, k)
        gamma_jk = tf.nn.embedding_lookup(self.gammaJI, j)
        beta_k = tf.nn.embedding_lookup(self.betaI, k)
        # Score of the sampled negative item k
        x_ukj = beta_k + self.UI * tf.reduce_sum(tf.multiply(gamma_uk, gamma_ku), 1) +\
                         self.IJ * tf.reduce_sum(tf.multiply(gamma_kj, gamma_jk), 1)
        # BPR-style objective: log-sigmoid of the positive/negative score difference
        return -tf.reduce_mean(tf.math.log(tf.math.sigmoid(x_uij - x_ukj)))
modelFPMC = FPMC(5, 0.00001)
def trainingStep(model, interactions):
    with tf.GradientTape() as tape:
        sampleU, sampleI, sampleJ, sampleK = [], [], [], []
        for _ in range(100000):
            _,u,i,j,_ = random.choice(interactions) # positive sample
            k = random.choice(items) # negative sample
            while k in itemsPerUser[u]:
                k = random.choice(items)
            sampleU.append(userIDs[u])
            sampleI.append(itemIDs[i])
            sampleJ.append(itemIDs[j])
            sampleK.append(itemIDs[k])
        loss = model(sampleU,sampleI,sampleJ,sampleK)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()
Run 100 batches
for i in range(100):
    obj = trainingStep(modelFPMC, interactionsWithPrevious)
    if (i % 10 == 9): print("iteration " + str(i+1) + ", objective = " + str(obj))
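Once trained, the model can be used to rank candidate items for a user given their most recent item. The sketch below is not part of the original notebook; it just scores each item one at a time with predict (slow, but fine as a demonstration) and keeps the top 10.
# A minimal sketch (assuming the structures built above): rank items for one
# user, conditioned on the last item they interacted with, and skip items
# they have already seen.
u = interactionsWithPrevious[0][1]   # pick some user
j = interactionsPerUser[u][-1][1]    # their most recent item
scores = []
for i in itemIDs:
    if i == 'dummy' or i in itemsPerUser[u]: continue
    scores.append((modelFPMC.predict(userIDs[u], itemIDs[i], itemIDs[j]).numpy(), i))
scores.sort(reverse=True)
scores[:10] # Top-10 recommendations for this user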
(still using the hourly bikeshare interaction data from examples above)
xsort = list(hourly.keys())
xsort.sort()
X = [feature(x) for x in xsort]
y = [hourly[x] for x in xsort]
model = sklearn.linear_model.LinearRegression(fit_intercept=False)
model.fit(X, y)
theta = model.coef_
ypred = model.predict(X)
res = y - ypred
plt.plot(xsort, res, color='k')
plt.ylabel("residual")
plt.xlabel("hourly observation")
plt.title("Residuals (autoregressive model)")
plt.show()
(using Goodreads data from examples above)
len(interactions)
interactionsTrain = interactionsWithPrevious[:500000]
interactionsTest = interactionsWithPrevious[500000:]
The FPMC implementation we built above allows us to control which terms (user/item or item/item) are included
modelFPMC = FPMC(5, 0.001, 1, 1)
modelMF = FPMC(5, 0.001, 1, 0)
modelFMC = FPMC(5, 0.001, 0, 1)
interactionsTestPerUser = defaultdict(set)
itemSet = set()
for _,u,i,j,_ in interactionsTest:
    interactionsTestPerUser[u].add((i,j))
    itemSet.add(i)
    itemSet.add(j)
def AUCu(model, u, N):
    win = 0
    if N > len(interactionsTestPerUser[u]):
        N = len(interactionsTestPerUser[u])
    positive = random.sample(list(interactionsTestPerUser[u]), N) # convert sets to lists (required by random.sample in newer Python versions)
    negative = random.sample(list(itemSet), N)
    for (i,j),k in zip(positive,negative):
        sp = model.predict(userIDs[u], itemIDs[i], itemIDs[j]).numpy()
        sn = model.predict(userIDs[u], itemIDs[k], itemIDs[j]).numpy()
        if sp > sn:
            win += 1
    return win/N
def AUC(model):
    av = []
    for u in interactionsTestPerUser:
        av.append(AUCu(model, u, 10))
    return sum(av) / len(av)
for model,name in [(modelFPMC,"FPMC"), (modelMF,"MF"), (modelFMC,"FMC")]:
    for i in range(100):
        obj = trainingStep(model, interactionsTrain)
        if (i % 10 == 9): print("iteration " + str(i+1) + ", objective = " + str(obj))
    print(name + " AUC = " + str(AUC(model)))
FISM implementation, using a factorization machine
nItems = len(itemIDs)
nUsers = len(userIDs)
fismInter = random.sample(interactions,100000)
Factorization machine design matrix. Note that we have two sets of features (the user history, and the target item). Both are of dimension nItems.
X = scipy.sparse.lil_matrix((len(fismInter), 2*nItems))
for n in range(len(fismInter)):
    _,u,i,r = fismInter[n]
    item = itemIDs[i]
    history = itemsPerUser[u]
    for j in history:
        if i == j: continue # Exclude the target item from the history
        X[n,itemIDs[j]] = 1.0 / (len(history) - 1) # One-hot encoding, normalized by history length
    X[n,nItems + item] = 1
y = numpy.array([r for _,_,_,r in fismInter])
Fairly slow and memory-hungry (every row contains a copy of a user's history). Could possibly be implemented faster in TensorFlow.
fm = als.FMRegression(n_iter=1000, init_stdev=0.1, rank=5, l2_reg_w=0.1, l2_reg_V=0.5)
X_train,y_train = X[:90000],y[:90000]
X_test,y_test = X[90000:],y[90000:]
fm.fit(X_train, y_train)
y_pred = fm.predict(X_test)
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)
MSE(y_pred, y_test)
(still using Goodreads data)
interactions.sort()
interactionsPerItem = defaultdict(list)
for t,u,i,r in interactions:
    interactionsPerItem[i].append((t,u,r))
random.shuffle(interactions)
Regression-based approach. Just collect past K interactions (ratings) as features.
def feat(t,u,i):
    older = [r for (q,_,r) in interactionsPerItem[i] if q < t] # Collect the item's previous ratings
    f = []
    for k in range(1,6):
        try:
            f += [0,older[-k]] # Previous rating
        except Exception:
            f += [1,0] # Or a missing-value indicator if we don't have history of length k
    if len(older):
        f += [0,sum(older)/len(older)] # Add a feature for the average (going beyond just the last k)
    else:
        f += [1,0] # Missing-value indicator if no interaction history
    return f + [1]
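Each feature vector produced by feat therefore has 13 entries: five (missing-indicator, rating) pairs for the item's most recent ratings, one (missing-indicator, average-rating) pair, and a constant offset. A quick sanity check (not in the original notebook):
t,u,i,_ = interactions[0]
feat(t,u,i) # 5*(indicator, rating) pairs + (indicator, average) + constant = 13 entries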
X = [feat(t,u,i) for t,u,i,_ in interactions]
y = [r for _,_,_,r in interactions]
X_train, X_test = X[:400000],X[400000:]
y_train, y_test = y[:400000],y[400000:]
model = sklearn.linear_model.LinearRegression(fit_intercept=False)
model.fit(X_train, y_train)
theta = model.coef_
theta
y_pred = model.predict(X_test)
MSE(y_pred, y_test)
Factorization machine-based approach. Copy the same features from the model above, but also include a user term. In theory, this should allow us to learn how sensitive a particular user is to herding.
XF = scipy.sparse.lil_matrix((len(interactions), nUsers + len(X[0])))
for n in range(len(interactions)):
    _,u,i,r = interactions[n]
    user = userIDs[u]
    XF[n,user] = 1
    for j in range(len(X[n])): # Copy features from the previous model
        XF[n,nUsers+j] = X[n][j]
fm = als.FMRegression(n_iter=1000, init_stdev=0.1, rank=5, l2_reg_w=0.1, l2_reg_V=0.5)
y = numpy.array(y)
X_train,y_train = XF[:400000],y[:400000]
X_test,y_test = XF[400000:],y[400000:]
fm.fit(X_train, y_train)
y_pred = fm.predict(X_test)
MSE(y_pred, y_test)