import gzip
import math
import matplotlib.pyplot as plt
import numpy
import random
import sklearn
import string
from collections import defaultdict
from gensim.models import Word2Vec
from nltk.stem.porter import *
from sklearn import linear_model
from sklearn.manifold import TSNE
Data is available at http://cseweb.ucsd.edu/~jmcauley/pml/data/. Download it and save it to your own directory.
dataDir = "/home/jmcauley/pml_data/"
def parseData(fname):
    for l in open(fname):
        yield eval(l)
data = list(parseData(dataDir + "beer_50000.json"))[:5000]
How many unique words are there?
wordCount = defaultdict(int)
for d in data:
    for w in d['review/text'].split():
        wordCount[w] += 1
len(wordCount)
36225
Ignore capitalization and remove punctuation
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in data:
    r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
    for w in r.split():
        wordCount[w] += 1
len(wordCount)
19426
With stemming
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
stemmer = PorterStemmer()
for d in data:
    r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
    for w in r.split():
        w = stemmer.stem(w)
        wordCount[w] += 1
len(wordCount)
14847
Just build our feature vector by taking the most popular words (lowercase, punctuation removed, but no stemming)
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in data:
    r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
    for w in r.split():
        wordCount[w] += 1
counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()
words = [x[1] for x in counts[:1000]]
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)
def feature(datum):
    feat = [0]*len(words)
    r = ''.join([c for c in datum['review/text'].lower() if not c in punctuation])
    for w in r.split():
        if w in words:
            feat[wordId[w]] += 1
    feat.append(1) # offset
    return feat
Extract bag-of-words features. For a bigger dataset, replace this with a sparse matrix to save memory (see examples in Chapter 6, and the sketch below)
X = [feature(d) for d in data]
y = [d['review/overall'] for d in data]
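As noted above, storing these dense feature lists becomes memory-hungry on larger corpora. A minimal sketch of a sparse alternative using scipy.sparse (illustrative only; the variable name Xsparse and the lil/CSR choice are assumptions, not part of the code above):
from scipy.sparse import lil_matrix
Xsparse = lil_matrix((len(data), len(words) + 1))  # one extra column for the offset
for row, d in enumerate(data):
    r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
    for w in r.split():
        if w in wordSet:
            Xsparse[row, wordId[w]] += 1
    Xsparse[row, len(words)] = 1  # offset
Xsparse = Xsparse.tocsr()  # CSR is efficient for the arithmetic done during model fitting
scikit-learn's Ridge accepts sparse input, so clf.fit(Xsparse, y) would work unchanged.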
# Regularized regression
clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)
Simple example...
sentence = "Dark red color, light beige foam"
unigrams = sentence.split()
bigrams = list(zip(unigrams[:-1], unigrams[1:]))
trigrams = list(zip(unigrams[:-2], unigrams[1:-1], unigrams[2:]))
trigrams
[('Dark', 'red', 'color,'), ('red', 'color,', 'light'), ('color,', 'light', 'beige'), ('light', 'beige', 'foam')]
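If NLTK is available, the same n-grams can also be produced with its ngrams helper; a small optional alternative to the zip construction above:
from nltk.util import ngrams
list(ngrams(unigrams, 3))  # same trigram tuples as above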
Extract n-grams up to length 5 (same dataset as example above)
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in data:
    r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
    ws = r.split()
    ws2 = [' '.join(x) for x in list(zip(ws[:-1],ws[1:]))]
    ws3 = [' '.join(x) for x in list(zip(ws[:-2],ws[1:-1],ws[2:]))]
    ws4 = [' '.join(x) for x in list(zip(ws[:-3],ws[1:-2],ws[2:-1],ws[3:]))]
    ws5 = [' '.join(x) for x in list(zip(ws[:-4],ws[1:-3],ws[2:-2],ws[3:-1],ws[4:]))]
    for w in ws + ws2 + ws3 + ws4 + ws5:
        wordCount[w] += 1
counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()
words = [x[1] for x in counts[:1000]]
A few of our 1000 most popular n-grams. Note the combination of n-grams of different lengths
words[200:210]
['pint', 'hoppy', 'stout', 'though', 'lots', 'and the', 'malty', 'mouthfeel is', 'even', 'quickly']
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)
def feature(datum):
    feat = [0]*len(words)
    r = ''.join([c for c in datum['review/text'].lower() if not c in punctuation])
    ws = r.split()
    ws2 = [' '.join(x) for x in list(zip(ws[:-1],ws[1:]))]
    ws3 = [' '.join(x) for x in list(zip(ws[:-2],ws[1:-1],ws[2:]))]
    ws4 = [' '.join(x) for x in list(zip(ws[:-3],ws[1:-2],ws[2:-1],ws[3:]))]
    ws5 = [' '.join(x) for x in list(zip(ws[:-4],ws[1:-3],ws[2:-2],ws[3:-1],ws[4:]))]
    for w in ws + ws2 + ws3 + ws4 + ws5:
        if w in words:
            feat[wordId[w]] += 1
    feat.append(1) # offset
    return feat
Same as the model in the previous example, except using n-grams rather than just unigrams
X = [feature(d) for d in data]
y = [d['review/overall'] for d in data]
clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)
wordSort = list(zip(theta[:-1], words))
wordSort.sort()
Some of the most negative and positive n-grams...
wordSort[:20]
[(-0.39143632655636557, 'a lot of'), (-0.28212293516954884, 'a bit more'), (-0.2661141170411719, 'water'), (-0.2530229314150494, 'the background'), (-0.24105277280700568, 'corn'), (-0.21580888155062322, 'as it warms'), (-0.20470888977263463, 'hint of'), (-0.2032403642895844, 'yellow'), (-0.19776086681438282, 'little more'), (-0.19515009289478227, 'watery'), (-0.1833549486926261, 'straw'), (-0.1784682882644964, 'a hint'), (-0.1749628781772528, 'around the'), (-0.17387445409086819, 'kind'), (-0.17381459136980834, 'down the'), (-0.17231514207712764, 'lot'), (-0.16887649424485385, 'weak'), (-0.16647751584896267, 'bad'), (-0.16513650187976975, 'old'), (-0.1581538428300185, 'up to')]
wordSort[-20:]
[(0.1596617274119243, 'wonderful'), (0.16316670449503792, 'off white head'), (0.17379190414356988, 'a little more'), (0.17560247184995886, 'favorite'), (0.17674628573616408, 'not too'), (0.17724972085645002, 'i really'), (0.17919486420127723, 'easy to'), (0.18200914314264174, 'i am'), (0.18394566780123306, 'background'), (0.18752400440800757, 'touch of'), (0.19751810949893422, 'the best'), (0.19933817230459436, 'hint'), (0.21249123771083625, 'this one is'), (0.2141045147587393, 'a hint of'), (0.22043777880133708, 'not bad'), (0.2376891974695213, 'of these'), (0.23871898220027807, 'a bad'), (0.24874277211071016, 'lot of'), (0.2748653479324936, 'a lot'), (0.3121741564883301, 'bit more')]
dataset = []
Small set of Goodreads fantasy reviews
z = gzip.open(dataDir + "goodreads_reviews_fantasy_paranormal.json.gz")
for l in z.readlines():
    dataset.append(eval(l))
    if len(dataset) == 50000:
        break
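eval works here because each line happens to be a Python-compatible literal, but for JSON-lines files json.loads is the safer choice; a minimal alternative sketch (assuming each line of the file is valid JSON):
import json
dataset = []
with gzip.open(dataDir + "goodreads_reviews_fantasy_paranormal.json.gz", 'rt') as z:
    for l in z:
        dataset.append(json.loads(l))
        if len(dataset) == 50000:
            break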
For example...
dataset[0]
{'book_id': '18245960', 'date_added': 'Sun Jul 30 07:44:10 -0700 2017', 'date_updated': 'Wed Aug 30 00:00:26 -0700 2017', 'n_comments': 1, 'n_votes': 28, 'rating': 5, 'read_at': 'Sat Aug 26 12:05:52 -0700 2017', 'review_id': 'dfdbb7b0eb5a7e4c26d59a937e2e5feb', 'review_text': 'This is a special book. It started slow for about the first third, then in the middle third it started to get interesting, then the last third blew my mind. This is what I love about good science fiction - it pushes your thinking about where things can go. \n It is a 2015 Hugo winner, and translated from its original Chinese, which made it interesting in just a different way from most things I\'ve read. For instance the intermixing of Chinese revolutionary history - how they kept accusing people of being "reactionaries", etc. \n It is a book about science, and aliens. The science described in the book is impressive - its a book grounded in physics and pretty accurate as far as I could tell. Though when it got to folding protons into 8 dimensions I think he was just making stuff up - interesting to think about though. \n But what would happen if our SETI stations received a message - if we found someone was out there - and the person monitoring and answering the signal on our side was disillusioned? That part of the book was a bit dark - I would like to think human reaction to discovering alien civilization that is hostile would be more like Enders Game where we would band together. \n I did like how the book unveiled the Trisolaran culture through the game. It was a smart way to build empathy with them and also understand what they\'ve gone through across so many centuries. And who know a 3 body problem was an unsolvable math problem? But I still don\'t get who made the game - maybe that will come in the next book. \n I loved this quote: \n "In the long history of scientific progress, how many protons have been smashed apart in accelerators by physicists? How many neutrons and electrons? Probably no fewer than a hundred million. Every collision was probably the end of the civilizations and intelligences in a microcosmos. In fact, even in nature, the destruction of universes must be happening at every second--for example, through the decay of neutrons. Also, a high-energy cosmic ray entering the atmosphere may destroy thousands of such miniature universes...."', 'started_at': 'Tue Aug 15 13:23:18 -0700 2017', 'user_id': '8842281e1d1347389f2ab93d60773d4d'}
Similar process to extract bag-of-words representations as in previous examples
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in dataset:
    r = ''.join([c for c in d['review_text'].lower() if not c in punctuation])
    for w in r.split():
        wordCount[w] += 1
counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()
words = [x[1] for x in counts[:1000]]
df = defaultdict(int)
for d in dataset:
    r = ''.join([c for c in d['review_text'].lower() if not c in punctuation])
    for w in set(r.split()):
        df[w] += 1
Here we extract frequencies for terms in a single specific review
rev = dataset[9] # Query review
rev
{'book_id': '76620', 'date_added': 'Tue Sep 02 17:03:40 -0700 2008', 'date_updated': 'Wed Dec 14 12:30:43 -0800 2016', 'n_comments': 2, 'n_votes': 30, 'rating': 5, 'read_at': 'Tue May 05 00:00:00 -0700 2015', 'review_id': '9206654986a733bd753753aa0c882813', 'review_text': "I read this after hearing from a few people that it was among their all-time favorites. I was almost put off when I saw it was a story about rabbits, originally written as a tale by a father to his children - but I'm glad I wasn't. \n I found the folk tales about El-ahrairah to be very impressive. The author clearly had a vivid imagination to create so much of the rabbits culture and history. But I think this book was worth reading as it's really a story about survival, leadership, and human nature. \n Oh and Fiver rocks. And BigWig is the man.", 'started_at': 'Sun Apr 19 00:00:00 -0700 2015', 'user_id': '8842281e1d1347389f2ab93d60773d4d'}
tf = defaultdict(int)
r = ''.join([c for c in rev['review_text'].lower() if not c in punctuation])
for w in r.split():
    # Note: = rather than +=; different versions of tf could be used instead
    tf[w] = 1
tfidf = dict(zip(words,[tf[w] * math.log2(len(dataset) / df[w]) for w in words]))
tfidfQuery = [tf[w] * math.log2(len(dataset) / df[w]) for w in words]
Find the highest tf-idf words in our example review
maxTf = [(tf[w],w) for w in words]
maxTf.sort(reverse=True)
maxTfIdf = [(tfidf[w],w) for w in words]
maxTfIdf.sort(reverse=True)
maxTfIdf[:10]
[(6.475214154215886, 'nature'), (6.1604918290613755, 'tales'), (5.77675046027207, 'children'), (5.7117950184313, 'saw'), (5.316168825598678, 'history'), (5.286585714095165, 'father'), (4.98210658996402, 'worth'), (4.910068021149542, 'glad'), (4.881720019873613, 'human'), (4.731589561708275, 'tale')]
Cosine similarity
def Cosine(x1,x2):
    numer = 0
    norm1 = 0
    norm2 = 0
    for a1,a2 in zip(x1,x2):
        numer += a1*a2
        norm1 += a1**2
        norm2 += a2**2
    if norm1*norm2:
        return numer / math.sqrt(norm1*norm2)
    return 0
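Equivalently, the cosine can be computed with numpy; a small optional sketch that should match the function above up to floating-point error:
def CosineNumpy(x1, x2):
    x1, x2 = numpy.array(x1), numpy.array(x2)
    denom = numpy.linalg.norm(x1) * numpy.linalg.norm(x2)
    if denom == 0:
        return 0
    return numpy.dot(x1, x2) / denom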
Find the other reviews in the corpus with the highest cosine similarity between tf-idf vectors
similarities = []
for rev2 in dataset:
    tf = defaultdict(int)
    r = ''.join([c for c in rev2['review_text'].lower() if not c in punctuation])
    for w in r.split():
        # Note = rather than +=
        tf[w] = 1
    tfidf2 = [tf[w] * math.log2(len(dataset) / df[w]) for w in words]
    similarities.append((Cosine(tfidfQuery, tfidf2), rev2['review_text']))
similarities.sort(reverse=True)
similarities[:10]
[(1.0, "I read this after hearing from a few people that it was among their all-time favorites. I was almost put off when I saw it was a story about rabbits, originally written as a tale by a father to his children - but I'm glad I wasn't. \n I found the folk tales about El-ahrairah to be very impressive. The author clearly had a vivid imagination to create so much of the rabbits culture and history. But I think this book was worth reading as it's really a story about survival, leadership, and human nature. \n Oh and Fiver rocks. And BigWig is the man."), (0.33623828530476574, 'Need a review \n For a full explanation of the provenance and history of Arabian Nights, as well as an alternative translation, see \n https://www.theatlantic.com/entertain... \n For children, see \n Aladdin and Other Tales from the Arabian Nights, Dawood, 1960 \n https://www.goodreads.com/review/show...'), (0.2944751254505903, 'Awesome! Love this author and his story world. Found quite a few errors/typos in the text, but the story is worth reading.'), (0.2944520286327877, "At first I was iffy about this book but it intrigued by the viewpoint it was written from then before I knew it is was sucked in and couldn't put it down!! I almost cried when I saw there was 3 more books."), (0.2902791374813246, "I was delighted by this book ...the only fault is that it was too short ! What a fantastic idea; a refuge for the children who have had adventures & now cannot fit back into the identity assigned to them. How many of us are not comfortable in the families we were born to? I loved the way the different doorways were sorted; one would think that adventures shared would be a bonding moment. Rivalries will be ever present ; guess that is human nature. I don't want to describe too much & ruin the magic; this author is the heir apparent to Neil Gaiman. Im so glad she has more books to discover !"), (0.27867963900816956, "Loved this tale and look forward to reading much more of Gem Sivad's tales."), (0.2755108699963019, "I'm really liking these books. It was good to learn more about the vampire lore in these ones and I'm glad that Jane found some companionship in this book, but I wasn't liking the kidnapped children storyline."), (0.2685687782658902, 'An amazing story. The first taste of the Tolkien universe, this children tales has it all and is superbly written. A must absolutely read.'), (0.260579392737578, "I'm enjoying reading this series. I wish I would have read this before I saw the movie...but oh well."), (0.2602744050799704, "See full review for Enchanted at: https://toomanybooksnotenoughshelves.... \n Sunday Woodcutter is the seventh daughter of a seventh daughter. Because she is such a thing, she has magical properties. Her gift bestowed upon her by her Fairy Godmother Joy is that whatever she writes comes true. It has had harsh implications in the past, so Sunday just writes about her family history. She had six older sisters and three older brothers. Jack Junior died tragically thanks in large part to the royal family of Arilland. Monday was married off to a prince and they never see her anymore. Tuesday died. Wednesday is a little bit strange. Thursday married a pirate king and sends them gifts whenever she can. Friday has a pure heart and a talent with needle and thread and uses it to make clothes for the poor children. Saturday is tomboyish and spends her days with her father and brother in the wood with her axe. Sunday is very lonely, until she meets an enchanted frog who she fasts befriends. 
\n But the irony - of course - is that the frog (Grumble) is actually the crown prince Rumbold. Thanks to one of her Sunday's kisses, he becomes his human self without any remembrance of his year prior to becoming a frog. All he knows is he is desperately in love with Sunday and needs her to be his at any cost. So, like in most fairy tales, a few balls are held with all the maidens in the kingdom invited. But maybe they aren't meant to live happily ever after... \n As I've said in my previous posts, I really like fairy-tale retellings and this book seemed to be right up my alley. But...it wasn't to be.")]
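A similar retrieval can be built with scikit-learn's TfidfVectorizer and cosine_similarity. Note that its defaults (raw term counts, natural log, smoothed idf) differ from the binary tf and log2 idf used above, so the rankings may differ slightly; a rough sketch:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
corpus = [d['review_text'] for d in dataset]
vectorizer = TfidfVectorizer(max_features=1000)
Xtfidf = vectorizer.fit_transform(corpus)       # sparse (nDocs x vocabulary) matrix
sims = cosine_similarity(Xtfidf[9], Xtfidf)[0]  # similarities to the query review (dataset[9])
mostSimilar = sims.argsort()[::-1][:10]         # indices of the ten most similar reviews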
A few utility data structures (used later)
beerStyles = {} # Style of each item
categories = set() # Set of item categories
reviewsPerUser = defaultdict(list)
beerIdToName = {} # Map an ID to the name of the product
z = open(dataDir + "beer_50000.json")
reviews = []
reviewDicts = []
for l in z:
    d = eval(l)
    reviews.append(d['review/text'])
    beerStyles[d['beer/beerId']] = d['beer/style']
    categories.add(d['beer/style'])
    beerIdToName[d['beer/beerId']] = d['beer/name']
    reviewsPerUser[d['user/profileName']].append((d['review/timeUnix'], d['beer/beerId']))
    reviewDicts.append(d)
    if len(reviews) == 50000:
        break
Tokenize the reviews, so that each review becomes a list of words
reviewTokens = []
punctuation = set(string.punctuation)
for r in reviews:
    r = ''.join([c for c in r.lower() if not c in punctuation])
    tokens = []
    for w in r.split():
        tokens.append(w)
    reviewTokens.append(tokens)
Example of a tokenized review
reviewTokens[0]
['a', 'lot', 'of', 'foam', 'but', 'a', 'lot', 'in', 'the', 'smell', 'some', 'banana', 'and', 'then', 'lactic', 'and', 'tart', 'not', 'a', 'good', 'start', 'quite', 'dark', 'orange', 'in', 'color', 'with', 'a', 'lively', 'carbonation', 'now', 'visible', 'under', 'the', 'foam', 'again', 'tending', 'to', 'lactic', 'sourness', 'same', 'for', 'the', 'taste', 'with', 'some', 'yeast', 'and', 'banana']
Fit the word2vec model
model = Word2Vec(reviewTokens,
                 min_count=5, # Words/items with fewer instances are discarded
                 size=10,     # Model dimensionality (renamed 'vector_size' in gensim >= 4.0)
                 window=3,    # Window size
                 sg=1)        # Skip-gram model
Extract word representation for a particular word
model.wv['yeast']
array([ 0.2489907 , -0.08647301, -0.6366861 , 0.45150977, -0.922045 , -0.3455544 , -0.7728421 , 0.8667233 , 0.01321169, -1.0097654 ], dtype=float32)
Find similar words to a given query
model.wv.similar_by_word("grassy")
[('piny', 0.990902841091156), ('citrus', 0.9905515909194946), ('floral', 0.989111602306366), ('citrussy', 0.9866609573364258), ('citric', 0.9859722256660461), ('flowery', 0.9844235181808472), ('pine', 0.9834136962890625), ('piney', 0.9828003644943237), ('spicy', 0.9817010760307312), ('herbal', 0.9794316291809082)]
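gensim also exposes pairwise similarities directly; for example (the word pair is arbitrary):
model.wv.similarity('grassy', 'citrus')  # cosine similarity between the two word vectors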
Almost the same as word2vec, but "documents" are made up of sequences of item IDs rather than words
reviewLists = []
for u in reviewsPerUser:
    rl = list(reviewsPerUser[u])
    rl.sort()
    reviewLists.append([x[1] for x in rl])
model10 = Word2Vec(reviewLists,
                   min_count=5, # Words/items with fewer instances are discarded
                   size=10,     # Model dimensionality
                   window=3,    # Window size
                   sg=1)        # Skip-gram model
beerIdToName['7360']
'Crystal Bitter Ale'
model10.wv.similar_by_word('7360')
[('55890', 0.9990268349647522), ('10788', 0.9987408518791199), ('32374', 0.9984040260314941), ('4370', 0.9983484745025635), ('18898', 0.998065710067749), ('48047', 0.9975799918174744), ('39606', 0.9974957704544067), ('10480', 0.9967398643493652), ('36376', 0.9967268109321594), ('13009', 0.996677041053772)]
for b in model10.wv.similar_by_word('7360'):
    print(beerIdToName[b[0]])
Summerbrau Lager
Pilot Rock Porter
Puget Sound Vanilla Porter
Pinnacle Peak Pale Ale
Ferdinand Svetly Lezak
Matso's Monsoonal Blonde
Leviathan Imperial Stout
Ter Dolen Tripel
La Délivrance
Pale Ale
Visualize the embeddings from the model above using t-SNE
X = []
beers = []
for b in beerIdToName:
    try:
        X.append(list(model10.wv[b]))
        beers.append(b)
    except Exception as e:
        pass
Fit a model with just two components for the sake of visualization
X_embedded = TSNE(n_components=2).fit_transform(X)
Generate scatterplots using the embedded points (one scatter plot per category)
scatterPlotsX = defaultdict(list)
scatterPlotsY = defaultdict(list)
for xy, b in zip(X_embedded, beers): # use the 2-d t-SNE embeddings computed above
    cat = beerStyles[b]
    try:
        scatterPlotsX[cat].append(xy[0])
        scatterPlotsY[cat].append(xy[1])
    except Exception as e:
        pass
Plot data from a few categories (more interesting with a larger dataset)
plt.scatter(scatterPlotsX['American Adjunct Lager'],
scatterPlotsY['American Adjunct Lager'], color='k', lw=0, label="Adjunct Lager")
plt.scatter(scatterPlotsX['American Porter'],
scatterPlotsY['American Porter'], color='grey', lw=0, label = "Porter")
plt.scatter(scatterPlotsX['Smoked Beer'],
scatterPlotsY['Smoked Beer'], color='lightgrey', lw = 0, label = "Smoked Beer")
plt.legend(loc='lower left')
plt.xticks([])
plt.yticks([])
plt.xlabel("first embedded dimension ")
plt.ylabel("second embedded dimension")
plt.title("\emph{TSNE}-based item embeddings")
plt.show()
Simple sentiment analysis pipeline
def parseData(fname):
    for l in open(fname):
        yield eval(l)
data = list(parseData(dataDir + "beer_50000.json"))[:5000]
random.shuffle(data)
Add a couple of "options" to the representation (in this case, whether we convert to lowercase and remove punctuation). More could be added.
def feature(datum, words, wordId, tolower=True, removePunct=True):
    feat = [0]*len(words)
    r = datum['review/text']
    if tolower:
        r = r.lower()
    if removePunct:
        r = ''.join([c for c in r if not c in punctuation])
    for w in r.split():
        if w in words:
            feat[wordId[w]] += 1
    feat.append(1) # offset
    return feat
Condense the pipeline code (see Chapter 3) into a single function
def pipeline(dSize = 1000, tolower=True, removePunct=True):
    wordCount = defaultdict(int)
    punctuation = set(string.punctuation)
    for d in data: # Strictly, should just use the *training* data to extract word counts
        r = d['review/text']
        if tolower:
            r = r.lower()
        if removePunct:
            r = ''.join([c for c in r if not c in punctuation])
        for w in r.split():
            wordCount[w] += 1
    counts = [(wordCount[w], w) for w in wordCount]
    counts.sort()
    counts.reverse()
    words = [x[1] for x in counts[:dSize]]
    wordId = dict(zip(words, range(len(words))))
    wordSet = set(words)
    X = [feature(d, words, wordId, tolower, removePunct) for d in data]
    y = [d['review/overall'] for d in data]
    Ntrain,Nvalid,Ntest = 4000,500,500
    Xtrain,Xvalid,Xtest = X[:Ntrain],X[Ntrain:Ntrain+Nvalid],X[Ntrain+Nvalid:]
    ytrain,yvalid,ytest = y[:Ntrain],y[Ntrain:Ntrain+Nvalid],y[Ntrain+Nvalid:]
    bestModel = None
    bestVal = None
    bestLamb = None
    ls = [0.01, 0.1, 1, 10, 100, 1000, 10000]
    errorTrain = []
    errorValid = []
    for l in ls:
        model = sklearn.linear_model.Ridge(l)
        model.fit(Xtrain, ytrain)
        predictTrain = model.predict(Xtrain)
        MSEtrain = sum((ytrain - predictTrain)**2)/len(ytrain)
        errorTrain.append(MSEtrain)
        predictValid = model.predict(Xvalid)
        MSEvalid = sum((yvalid - predictValid)**2)/len(yvalid)
        errorValid.append(MSEvalid)
        print("l = " + str(l) + ", validation MSE = " + str(MSEvalid))
        if bestVal is None or MSEvalid < bestVal:
            bestVal = MSEvalid
            bestModel = model
            bestLamb = l
    predictTest = bestModel.predict(Xtest)
    MSEtest = sum((ytest - predictTest)**2)/len(ytest)
    plt.xticks([])
    plt.xlabel(r"$\lambda$")
    plt.ylabel(r"error (MSE)")
    plt.title(r"Validation Pipeline")
    plt.xscale('log')
    plt.plot(ls, errorTrain, color='k', linestyle='--', label='training error')
    plt.plot(ls, errorValid, color='grey',zorder=4,label="validation error")
    plt.plot([bestLamb], [MSEtest], linestyle='', marker='x', color='k', label="test error")
    plt.legend(loc='best')
    plt.show()
Run models with different feature representation options and dictionary sizes
pipeline(2000, False, False)
l = 0.01, validation MSE = 0.8124083444098931
l = 0.1, validation MSE = 0.8041193851382906
l = 1, validation MSE = 0.7387154350328736
l = 10, validation MSE = 0.5361335778700643
l = 100, validation MSE = 0.3996322733152027
l = 1000, validation MSE = 0.40242091430095334
l = 10000, validation MSE = 0.4792032394453763
pipeline(1000, True, True)
l = 0.01, validation MSE = 0.4421864459799068
l = 0.1, validation MSE = 0.441830005249123
l = 1, validation MSE = 0.4384260891567443
l = 10, validation MSE = 0.41488435972033527
l = 100, validation MSE = 0.36584997844103806
l = 1000, validation MSE = 0.3710655728890064
l = 10000, validation MSE = 0.4509858498087078
Use item2vec to make recommendations (following code from exercises in Chapter 4)
def recScore(i, userHistory):
    historyInVocab = [w for w in userHistory if w in model10.wv]
    if len(historyInVocab) == 0:
        return 0
    sc = model10.wv.distance(str(i), historyInVocab[-1])
    return sc
def rec(userHistory):
    historyInVocab = [w for w in userHistory if w in model10.wv]
    if len(historyInVocab) == 0:
        return 0
    return model10.wv.most_similar(positive = historyInVocab, topn=10)
recScore(20539, reviewLists[0])
0.20018476247787476
rec(reviewLists[0])
[('41688', 0.9940300583839417), ('44596', 0.9929537773132324), ('24082', 0.9924867749214172), ('26393', 0.9917975068092346), ('6824', 0.9912790060043335), ('8950', 0.9910286068916321), ('24473', 0.9906283020973206), ('8679', 0.9903963208198547), ('5972', 0.990136981010437), ('25918', 0.989347517490387)]
(see tf-idf retrieval examples above)
Predict the rating using item2vec item similarity scores. Adapts models from Chapter 4.
ratingMean = sum([d['review/overall'] for d in reviewDicts]) / len(reviewDicts)
itemAverages = defaultdict(list)
reviewsPerUser = defaultdict(list)
for d in reviewDicts:
    i = d['beer/beerId']
    u = d['user/profileName']
    itemAverages[i].append(d['review/overall'])
    reviewsPerUser[u].append(d)
for i in itemAverages:
    itemAverages[i] = sum(itemAverages[i]) / len(itemAverages[i])
def predictRating(user,item):
    ratings = []
    similarities = []
    if not str(item) in model10.wv:
        return ratingMean
    for d in reviewsPerUser[user]:
        i2 = d['beer/beerId']
        if i2 == item: continue
        ratings.append(d['review/overall'] - itemAverages[i2])
        if str(i2) in model10.wv:
            similarities.append(model10.wv.distance(str(item), str(i2)))
    if (sum(similarities) > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        return itemAverages[item] + sum(weightedRatings) / sum(similarities)
    else:
        return ratingMean
u,i = reviewDicts[0]['user/profileName'],reviewDicts[0]['beer/beerId']
predictRating(u,i)
3.88871
ratingMean
3.88871
alwaysPredictMean = [ratingMean for _ in reviewDicts]
labels = [d['review/overall'] for d in reviewDicts]
predictions = [predictRating(d['user/profileName'],d['beer/beerId']) for d in reviewDicts]
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)
MSE(alwaysPredictMean, labels)
0.4924295358999677
MSE(predictions, labels)
0.4199912117748809
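As an optional sanity check, the same quantities can be computed with scikit-learn's mean_squared_error, which should agree with the MSE function above:
from sklearn.metrics import mean_squared_error
mean_squared_error(labels, alwaysPredictMean), mean_squared_error(labels, predictions)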