import gzip
import math
import matplotlib.pyplot as plt
import numpy
import random
import sklearn
import string
from collections import defaultdict
from gensim.models import Word2Vec
from nltk.stem.porter import *
from sklearn import linear_model
from sklearn.manifold import TSNE
Data is available at http://cseweb.ucsd.edu/~jmcauley/pml/data/. Download and save to your own directory
dataDir = "/home/jmcauley/pml_data/"
def parseData(fname):
for l in open(fname):
yield eval(l)
data = list(parseData(dataDir + "beer_50000.json"))[:5000]
How many unique words are there?
wordCount = defaultdict(int)
for d in data:
for w in d['review/text'].split():
wordCount[w] += 1
len(wordCount)
Ignore capitalization and remove punctuation
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in data:
r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
for w in r.split():
wordCount[w] += 1
len(wordCount)
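As an aside (not part of the original code), the same punctuation stripping can be done with str.translate, which is typically faster than the character-by-character list comprehension:
punctTable = str.maketrans('', '', string.punctuation) # delete every punctuation character
data[0]['review/text'].lower().translate(punctTable)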
With stemming
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
stemmer = PorterStemmer()
for d in data:
r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
for w in r.split():
w = stemmer.stem(w)
wordCount[w] += 1
len(wordCount)
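A quick illustration (not in the original code) of what the Porter stemmer does to a few related word forms; the outputs follow the Porter rules, so the stems need not be dictionary words:
[stemmer.stem(w) for w in ['taste', 'tastes', 'tasting', 'tasted']]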
Just build our feature vector by taking the most popular words (lowercase, punctuation removed, but no stemming)
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in data:
r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
for w in r.split():
wordCount[w] += 1
counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()
words = [x[1] for x in counts[:1000]]
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)
def feature(datum):
feat = [0]*len(words)
r = ''.join([c for c in datum['review/text'].lower() if not c in punctuation])
for w in r.split():
        if w in wordSet: # use the set for fast membership tests
feat[wordId[w]] += 1
feat.append(1) # offset
return feat
Extract bag-of-words features. For a larger dataset, replace this with a sparse matrix to save memory (see examples in Chapter 6; a sparse version is sketched after the regression below)
X = [feature(d) for d in data]
y = [d['review/overall'] for d in data]
# Regularized regression
clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)
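A minimal sketch (not from the original notebook) of the sparse alternative mentioned above, assuming scipy is installed; it builds the same features as the dense version but stores them in a scipy.sparse matrix, which sklearn's Ridge accepts directly:
from scipy.sparse import lil_matrix
Xsparse = lil_matrix((len(data), len(words) + 1)) # one extra column for the offset
for row, d in enumerate(data):
    r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
    for w in r.split():
        if w in wordSet:
            Xsparse[row, wordId[w]] += 1
    Xsparse[row, len(words)] = 1 # offset
Xsparse = Xsparse.tocsr() # convert to CSR before fitting, e.g. clf.fit(Xsparse, y)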
Simple example...
sentence = "Dark red color, light beige foam"
unigrams = sentence.split()
bigrams = list(zip(unigrams[:-1], unigrams[1:]))
trigrams = list(zip(unigrams[:-2], unigrams[1:-1], unigrams[2:]))
trigrams
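A generic helper (hypothetical, not part of the original code) that builds n-grams of any length and joins them into strings, equivalent to the explicit zips used below:
def ngrams(tokens, n):
    return [' '.join(tokens[i:i+n]) for i in range(len(tokens) - n + 1)]
ngrams(unigrams, 3) # the same trigrams as above, joined into space-separated strings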
Extract n-grams up to length 5 (same dataset as example above)
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in data:
r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
ws = r.split()
ws2 = [' '.join(x) for x in list(zip(ws[:-1],ws[1:]))]
ws3 = [' '.join(x) for x in list(zip(ws[:-2],ws[1:-1],ws[2:]))]
ws4 = [' '.join(x) for x in list(zip(ws[:-3],ws[1:-2],ws[2:-1],ws[3:]))]
ws5 = [' '.join(x) for x in list(zip(ws[:-4],ws[1:-3],ws[2:-2],ws[3:-1],ws[4:]))]
for w in ws + ws2 + ws3 + ws4 + ws5:
wordCount[w] += 1
counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()
words = [x[1] for x in counts[:1000]]
A few of our 1000 most popular n-grams. Note the combination of n-grams of different lengths
words[200:210]
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)
def feature(datum):
feat = [0]*len(words)
r = ''.join([c for c in datum['review/text'].lower() if not c in punctuation])
ws = r.split()
ws2 = [' '.join(x) for x in list(zip(ws[:-1],ws[1:]))]
ws3 = [' '.join(x) for x in list(zip(ws[:-2],ws[1:-1],ws[2:]))]
ws4 = [' '.join(x) for x in list(zip(ws[:-3],ws[1:-2],ws[2:-1],ws[3:]))]
ws5 = [' '.join(x) for x in list(zip(ws[:-4],ws[1:-3],ws[2:-2],ws[3:-1],ws[4:]))]
for w in ws + ws2 + ws3 + ws4 + ws5:
        if w in wordSet: # use the set for fast membership tests
feat[wordId[w]] += 1
feat.append(1) #offset
return feat
Same as the model in the previous example, except using n-grams rather than just unigrams
X = [feature(d) for d in data]
y = [d['review/overall'] for d in data]
clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)
wordSort = list(zip(theta[:-1], words))
wordSort.sort()
Some of the most negative and positive n-grams...
wordSort[:20]
wordSort[-20:]
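The learned weight of any individual n-gram can also be looked up directly (an illustrative example; the chosen n-gram may or may not be among the 1,000 most popular terms):
weightOf = dict(zip(words, theta[:-1]))
weightOf.get('not bad') # None if this bigram did not make it into the dictionary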
dataset = []
Small set of Goodreads fantasy reviews
z = gzip.open(dataDir + "goodreads_reviews_fantasy_paranormal.json.gz")
for l in z.readlines():
dataset.append(eval(l))
if len(dataset) == 50000:
break
For example...
dataset[0]
Extract bag-of-words representations, following a similar process to the previous examples
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in dataset:
r = ''.join([c for c in d['review_text'].lower() if not c in punctuation])
for w in r.split():
wordCount[w] += 1
counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()
words = [x[1] for x in counts[:1000]]
df = defaultdict(int)
for d in dataset:
r = ''.join([c for c in d['review_text'].lower() if not c in punctuation])
for w in set(r.split()):
df[w] += 1
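A quick sanity check (illustrative, and assuming the word 'the' appears in at least one review): very common words have document frequencies close to the corpus size, and hence inverse document frequencies close to zero.
df['the'], math.log2(len(dataset) / df['the'])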
Here we extract frequencies for terms in a single specific review
rev = dataset[9] # Query review
rev
tf = defaultdict(int)
r = ''.join([c for c in rev['review_text'].lower() if not c in punctuation])
for w in r.split():
    # Note: '=' rather than '+=', i.e., a binary term frequency; other tf variants could be used instead
    tf[w] = 1
tfidf = dict(zip(words,[tf[w] * math.log2(len(dataset) / df[w]) for w in words]))
tfidfQuery = [tf[w] * math.log2(len(dataset) / df[w]) for w in words]
Find the highest tf-idf words in our example review
maxTf = [(tf[w],w) for w in words]
maxTf.sort(reverse=True)
maxTfIdf = [(tfidf[w],w) for w in words]
maxTfIdf.sort(reverse=True)
maxTfIdf[:10]
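Comparable scores can be obtained with sklearn's TfidfVectorizer, though its defaults differ from the formula above (a different tokenizer, raw rather than binary term frequencies, a smoothed natural-log idf, and l2-normalized rows), so the values will not match exactly; a sketch restricted to our 1,000-word dictionary:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(vocabulary=words)
tfidfMatrix = vectorizer.fit_transform([d['review_text'] for d in dataset])
tfidfMatrix[9] # (sparse) tf-idf vector for the query review above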
Cosine similarity
def Cosine(x1,x2):
numer = 0
norm1 = 0
norm2 = 0
for a1,a2 in zip(x1,x2):
numer += a1*a2
norm1 += a1**2
norm2 += a2**2
if norm1*norm2:
return numer / math.sqrt(norm1*norm2)
return 0
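An equivalent vectorized version using numpy (a sketch; it matches the loop-based function up to floating-point error):
def CosineNumpy(x1, x2):
    x1, x2 = numpy.array(x1), numpy.array(x2)
    denom = numpy.linalg.norm(x1) * numpy.linalg.norm(x2)
    if denom == 0:
        return 0
    return numpy.dot(x1, x2) / denom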
Find the other reviews in the corpus with the highest cosine similarity between tf-idf vectors
similarities = []
for rev2 in dataset:
tf = defaultdict(int)
r = ''.join([c for c in rev2['review_text'].lower() if not c in punctuation])
for w in r.split():
# Note = rather than +=
tf[w] = 1
tfidf2 = [tf[w] * math.log2(len(dataset) / df[w]) for w in words]
similarities.append((Cosine(tfidfQuery, tfidf2), rev2['review_text']))
similarities.sort(reverse=True)
similarities[:10]
A few utility data structures (used later)
beerStyles = {} # Style of each item
categories = set() # Set of item categories
reviewsPerUser = defaultdict(list)
beerIdToName = {} # Map an ID to the name of the product
z = open(dataDir + "beer_50000.json")
reviews = []
reviewDicts = []
for l in z:
d = eval(l)
reviews.append(d['review/text'])
beerStyles[d['beer/beerId']] = d['beer/style']
categories.add(d['beer/style'])
beerIdToName[d['beer/beerId']] = d['beer/name']
reviewsPerUser[d['user/profileName']].append((d['review/timeUnix'], d['beer/beerId']))
reviewDicts.append(d)
if len(reviews) == 50000:
break
Tokenize the reviews, so that each review becomes a list of words
reviewTokens = []
punctuation = set(string.punctuation)
for r in reviews:
r = ''.join([c for c in r.lower() if not c in punctuation])
tokens = []
for w in r.split():
tokens.append(w)
reviewTokens.append(tokens)
Example of a tokenized review
reviewTokens[0]
Fit the word2vec model
model = Word2Vec(reviewTokens,
min_count=5, # Words/items with fewer instances are discarded
                 size=10, # Model dimensionality (renamed vector_size in gensim >= 4.0)
window=3, # Window size
sg=1) # Skip-gram model
Extract word representation for a particular word
model.wv['yeast']
Find similar words to a given query
model.wv.similar_by_word("grassy")
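The scores above are cosine similarities between the learned vectors; gensim also exposes them pairwise. A quick check (both words appear in the reviews, though this assumes they survived the min_count threshold):
model.wv.similarity('grassy', 'yeast') # equal to 1 - model.wv.distance('grassy', 'yeast')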
Almost the same as word2vec, but "documents" are made up of sequences of item IDs rather than words
reviewLists = []
for u in reviewsPerUser:
rl = list(reviewsPerUser[u])
rl.sort()
reviewLists.append([x[1] for x in rl])
model10 = Word2Vec(reviewLists,
min_count=5, # Words/items with fewer instances are discarded
                   size=10, # Model dimensionality (renamed vector_size in gensim >= 4.0)
window=3, # Window size
sg=1) # Skip-gram model
beerIdToName['7360']
model10.wv.similar_by_word('7360')
for b in model10.wv.similar_by_word('7360'):
print(beerIdToName[b[0]])
Visualize the embeddings from the model above using t-SNE
X = []
beers = []
for b in beerIdToName:
try:
X.append(list(model10.wv[b]))
beers.append(b)
except Exception as e:
pass
Fit a model with just two components for the sake of visualization
X_embedded = TSNE(n_components=2).fit_transform(X)
Generate scatterplots using the embedded points (one scatter plot per category)
scatterPlotsX = defaultdict(list)
scatterPlotsY = defaultdict(list)
for xy, b in zip(X_embedded, beers):
cat = beerStyles[b]
try:
scatterPlotsX[cat].append(xy[0])
scatterPlotsY[cat].append(xy[1])
except Exception as e:
pass
Plot data from a few categories (more interesting with a larger dataset)
plt.scatter(scatterPlotsX['American Adjunct Lager'],
scatterPlotsY['American Adjunct Lager'], color='k', lw=0, label="Adjunct Lager")
plt.scatter(scatterPlotsX['American Porter'],
scatterPlotsY['American Porter'], color='grey', lw=0, label = "Porter")
plt.scatter(scatterPlotsX['Smoked Beer'],
scatterPlotsY['Smoked Beer'], color='lightgrey', lw = 0, label = "Smoked Beer")
plt.legend(loc='lower left')
plt.xticks([])
plt.yticks([])
plt.xlabel("first embedded dimension ")
plt.ylabel("second embedded dimension")
plt.title("t-SNE-based item embeddings")
plt.show()
Simple sentiment analysis pipeline
def parseData(fname):
for l in open(fname):
yield eval(l)
data = list(parseData(dataDir + "beer_50000.json"))[:5000]
random.shuffle(data)
Add a couple of "options" for the representation (in this case, whether we should convert to lower case and remove punctuation). More could be added.
def feature(datum, words, wordId, tolower=True, removePunct=True):
feat = [0]*len(words)
r = datum['review/text']
if tolower:
r = r.lower()
if removePunct:
r = ''.join([c for c in r if not c in punctuation])
for w in r.split():
        if w in wordId: # dict membership test (same keys as the word list, but faster)
feat[wordId[w]] += 1
feat.append(1) # offset
return feat
Condense the pipeline code (see Chapter 3) into a single function
def pipeline(dSize = 1000, tolower=True, removePunct=True):
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in data: # Strictly, should just use the *training* data to extract word counts
r = d['review/text']
if tolower:
r = r.lower()
if removePunct:
r = ''.join([c for c in r if not c in punctuation])
for w in r.split():
wordCount[w] += 1
counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()
words = [x[1] for x in counts[:dSize]]
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)
X = [feature(d, words, wordId, tolower, removePunct) for d in data]
y = [d['review/overall'] for d in data]
Ntrain,Nvalid,Ntest = 4000,500,500
Xtrain,Xvalid,Xtest = X[:Ntrain],X[Ntrain:Ntrain+Nvalid],X[Ntrain+Nvalid:]
ytrain,yvalid,ytest = y[:Ntrain],y[Ntrain:Ntrain+Nvalid],y[Ntrain+Nvalid:]
bestModel = None
bestVal = None
bestLamb = None
ls = [0.01, 0.1, 1, 10, 100, 1000, 10000]
errorTrain = []
errorValid = []
for l in ls:
model = sklearn.linear_model.Ridge(l)
model.fit(Xtrain, ytrain)
predictTrain = model.predict(Xtrain)
MSEtrain = sum((ytrain - predictTrain)**2)/len(ytrain)
errorTrain.append(MSEtrain)
predictValid = model.predict(Xvalid)
MSEvalid = sum((yvalid - predictValid)**2)/len(yvalid)
errorValid.append(MSEvalid)
print("l = " + str(l) + ", validation MSE = " + str(MSEvalid))
if bestVal == None or MSEvalid < bestVal:
bestVal = MSEvalid
bestModel = model
bestLamb = l
predictTest = bestModel.predict(Xtest)
MSEtest = sum((ytest - predictTest)**2)/len(ytest)
MSEtest
plt.xticks([])
plt.xlabel(r"$\lambda$")
plt.ylabel(r"error (MSE)")
plt.title(r"Validation Pipeline")
plt.xscale('log')
plt.plot(ls, errorTrain, color='k', linestyle='--', label='training error')
plt.plot(ls, errorValid, color='grey',zorder=4,label="validation error")
plt.plot([bestLamb], [MSEtest], linestyle='', marker='x', color='k', label="test error")
plt.legend(loc='best')
plt.show()
Run models with different feature representation options and dictionary sizes
pipeline(2000, False, False)
pipeline(1000, True, True)
Use item2vec to make recommendations (following code from exercises in Chapter 4)
def recScore(i, userHistory):
    historyInVocab = [w for w in userHistory if w in model10.wv]
    if len(historyInVocab) == 0:
        return 0
    # wv.distance = 1 - cosine similarity, so lower scores indicate items that are
    # more compatible with the most recent item in the user's history
    sc = model10.wv.distance(str(i), historyInVocab[-1])
    return sc
def rec(userHistory):
    historyInVocab = [w for w in userHistory if w in model10.wv]
    if len(historyInVocab) == 0:
        return [] # no in-vocabulary history, so no recommendations
    return model10.wv.most_similar(positive = historyInVocab, topn=10)
recScore(20539, reviewLists[0])
rec(reviewLists[0])
(see tf-idf retrieval examples above)
Predict the rating using item2vec item similarity scores. Adapts models from Chapter 4.
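Concretely, the prediction below is the item's average rating plus a weighted average of the user's offsets from the other items' averages: rating(u, i) ≈ itemAverage[i] + sum_j w(i, j) * (r(u, j) - itemAverage[j]) / sum_j w(i, j), where j ranges over the other items the user has reviewed. Note that the weight w(i, j) used here is model10.wv.distance (1 minus the cosine similarity) rather than the similarity itself, and that the function falls back to the global mean rating when the item has no vector or the weights sum to zero.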
ratingMean = sum([d['review/overall'] for d in reviewDicts]) / len(reviewDicts)
itemAverages = defaultdict(list)
reviewsPerUser = defaultdict(list)
for d in reviewDicts:
i = d['beer/beerId']
u = d['user/profileName']
itemAverages[i].append(d['review/overall'])
reviewsPerUser[u].append(d)
for i in itemAverages:
itemAverages[i] = sum(itemAverages[i]) / len(itemAverages[i])
def predictRating(user,item):
ratings = []
similarities = []
if not str(item) in model10.wv:
return ratingMean
    for d in reviewsPerUser[user]:
        i2 = d['beer/beerId']
        if i2 == item: continue
        if not str(i2) in model10.wv: continue # skip items the model has no vector for
        ratings.append(d['review/overall'] - itemAverages[i2])
        # Note: wv.distance (1 - cosine similarity) is used directly as the weight,
        # keeping the ratings and similarities lists aligned for the zip below
        similarities.append(model10.wv.distance(str(item), str(i2)))
if (sum(similarities) > 0):
weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
return itemAverages[item] + sum(weightedRatings) / sum(similarities)
else:
return ratingMean
u,i = reviewDicts[0]['user/profileName'],reviewDicts[0]['beer/beerId']
predictRating(u,i)
ratingMean
alwaysPredictMean = [ratingMean for _ in reviewDicts]
labels = [d['review/overall'] for d in reviewDicts]
predictions = [predictRating(d['user/profileName'],d['beer/beerId']) for d in reviewDicts]
def MSE(predictions, labels):
differences = [(x-y)**2 for x,y in zip(predictions,labels)]
return sum(differences) / len(differences)
MSE(alwaysPredictMean, labels)
MSE(predictions, labels)