In [1]:
import numpy
import urllib.request
import scipy.optimize
import random
from collections import defaultdict # Dictionaries with default values
import nltk
import string
from nltk.stem.porter import PorterStemmer
from sklearn import linear_model
import ast
In [2]:
def parseData(fname):
  for l in urllib.request.urlopen(fname):
    yield ast.literal_eval(l)
In [3]:
def parseDataFromFile(fname):
  for l in open(fname):
    yield ast.literal_eval(l)
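
The lines of this file appear to be Python-style dictionary literals (note the single-quoted strings in the record shown below), which is why ast.literal_eval is used rather than json.loads. For a file in strict JSON-lines format, the parser would instead look like this sketch:

import json

def parseJsonFromFile(fname):
    # One strictly-formatted JSON object per line ("JSON lines")
    for l in open(fname):
        yield json.loads(l)
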
In [4]:
#data = list(parseData("http://jmcauley.ucsd.edu/cse255/data/beer/beer_50000.json"))
data_ = list(parseDataFromFile("C:/Users/Julian McAuley/Documents/class_files/beer_50000.json"))
In [5]:
data = data_[:5000]

Count words

In [6]:
wordCount = defaultdict(int)
totalWords = 0
In [7]:
punct = string.punctuation
stemmer = PorterStemmer()
In [8]:
data[0]
Out[8]:
{'review/appearance': 2.5,
 'beer/style': 'Hefeweizen',
 'review/palate': 1.5,
 'review/taste': 1.5,
 'beer/name': 'Sausa Weizen',
 'review/timeUnix': 1234817823,
 'beer/ABV': 5.0,
 'beer/beerId': '47986',
 'beer/brewerId': '10325',
 'review/timeStruct': {'isdst': 0,
  'mday': 16,
  'hour': 20,
  'min': 57,
  'sec': 3,
  'mon': 2,
  'year': 2009,
  'yday': 47,
  'wday': 0},
 'review/overall': 1.5,
 'review/text': 'A lot of foam. But a lot.\tIn the smell some banana, and then lactic and tart. Not a good start.\tQuite dark orange in color, with a lively carbonation (now visible, under the foam).\tAgain tending to lactic sourness.\tSame for the taste. With some yeast and banana.',
 'user/profileName': 'stcules',
 'review/aroma': 2.0}
In [9]:
for d in data:
    t = d['review/text']
    t = t.lower() # lowercase the string
    t = [c for c in t if not (c in punct)] # keep only non-punctuation characters
    t = ''.join(t) # convert back to a string
    words = t.strip().split() # tokenize on whitespace
    for w in words:
        #w = stemmer.stem(w)
        totalWords += 1
        wordCount[w] += 1
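
The stemming line is commented out above; with it enabled, related inflections (e.g. 'tasty' and 'tastiness') tend to collapse to a common stem, shrinking the vocabulary. A sketch of the same counting loop with stemming, reusing data, punct, and stemmer from the cells above:

stemCount = defaultdict(int)
for d in data:
    t = ''.join(c for c in d['review/text'].lower() if not (c in punct))
    for w in t.strip().split():
        stemCount[stemmer.stem(w)] += 1 # count stems rather than raw words
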
In [10]:
totalWords
Out[10]:
636392
In [11]:
len(wordCount)
Out[11]:
19426
In [12]:
counts = [(wordCount[w], w) for w in wordCount]
In [13]:
counts[:10]
Out[13]:
[(30695, 'a'),
 (556, 'lot'),
 (15935, 'of'),
 (389, 'foam'),
 (6836, 'but'),
 (7033, 'in'),
 (27569, 'the'),
 (1771, 'smell'),
 (3946, 'some'),
 (151, 'banana')]
In [14]:
counts.sort()
counts.reverse()
In [15]:
counts[:10]
Out[15]:
[(30695, 'a'),
 (27569, 'the'),
 (19512, 'and'),
 (15935, 'of'),
 (12623, 'is'),
 (11298, 'with'),
 (9466, 'to'),
 (9068, 'this'),
 (8471, 'i'),
 (8144, 'it')]
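
The top of the list is dominated by stopwords. nltk ships an English stopword list that could filter these out; a sketch (assumes the corpus has been fetched with nltk.download('stopwords')):

from nltk.corpus import stopwords

stopwordSet = set(stopwords.words('english'))
contentCounts = [(c, w) for (c, w) in counts if not (w in stopwordSet)]
contentCounts[:10] # most frequent non-stopword terms
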
In [16]:
counts[5000:5010]
Out[16]:
[(4, 'mist'),
 (4, 'minuscule'),
 (4, 'minimum'),
 (4, 'minimally'),
 (4, 'minerally'),
 (4, 'midtaste'),
 (4, 'midpoint'),
 (4, 'michelob'),
 (4, 'mgd'),
 (4, 'message')]
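
As an aside, the count-then-sort steps above could be done equivalently with collections.Counter; a sketch, assuming the same tokenization:

from collections import Counter

wordCounter = Counter()
for d in data:
    t = ''.join(c for c in d['review/text'].lower() if not (c in punct))
    wordCounter.update(t.strip().split())

wordCounter.most_common(10) # note: returns (word, count) pairs, not (count, word)
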
In [17]:
words = [w[1] for w in counts[:1000]]
In [18]:
words[:10]
Out[18]:
['a', 'the', 'and', 'of', 'is', 'with', 'to', 'this', 'i', 'it']
In [19]:
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)
In [20]:
wordId['murky']
Out[20]:
997

Train a regressor

In [21]:
def feature(datum):
    feat = [0]*len(wordSet)
    t = datum['review/text']
    t = t.lower() # lowercase the string
    t = [c for c in t if not (c in punct)] # keep only non-punctuation characters
    t = ''.join(t) # convert back to a string
    words = t.strip().split() # tokenize on whitespace
    for w in words:
        if not (w in wordSet): continue
        feat[wordId[w]] += 1
    feat.append(1) # constant (offset) feature
    return feat
In [22]:
X = [feature(d) for d in data]
y = [d['review/overall'] for d in data]
In [23]:
X[0][:20]
Out[23]:
[4, 3, 3, 1, 0, 2, 1, 0, 0, 0, 2, 1, 0, 0, 0, 0, 1, 0, 1, 0]
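
A dense 5,000 x 1,001 matrix is manageable here, but for a larger dictionary a sparse representation saves a lot of memory, since each review contains only a few of the 1,000 words. A sketch using scipy.sparse (sklearn's models accept sparse input), reusing wordId and punct from above:

from scipy.sparse import lil_matrix

def featureMatrix(dataset):
    # rows = reviews, columns = the 1,000 dictionary words + a constant feature
    X = lil_matrix((len(dataset), len(wordId) + 1))
    for i, d in enumerate(dataset):
        t = ''.join(c for c in d['review/text'].lower() if not (c in punct))
        for w in t.strip().split():
            if w in wordId:
                X[i, wordId[w]] += 1
        X[i, len(wordId)] = 1 # constant term
    return X.tocsr() # CSR is efficient for downstream solvers
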
In [24]:
clf = linear_model.Ridge(1.0, fit_intercept=False) # objective: MSE + 1.0 * (l2 regularizer)
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)
In [25]:
theta[:20]
Out[25]:
array([-0.0016723 ,  0.00288483, -0.00804013, -0.00051687, -0.00021117,
        0.00626303, -0.00246684, -0.00060558,  0.00369268,  0.01251837,
        0.01318618, -0.00180721, -0.00229857,  0.00011039,  0.01874699,
        0.06749427, -0.03537173,  0.00259992,  0.01630909,  0.00045103])
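
A quick sanity check on the fit is the training-set mean squared error, using the predictions computed above:

residuals = numpy.array(predictions) - numpy.array(y)
mse = (residuals ** 2).mean() # training MSE (optimistic compared to held-out error)
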
In [26]:
weights = list(zip(theta, words + ['constant_feat']))
In [27]:
weights[:10]
Out[27]:
[(-0.0016723019917446696, 'a'),
 (0.0028848272443045783, 'the'),
 (-0.008040125740582861, 'and'),
 (-0.000516868948364271, 'of'),
 (-0.00021117082786571167, 'is'),
 (0.006263030371556265, 'with'),
 (-0.002466840592419176, 'to'),
 (-0.0006055839936351281, 'this'),
 (0.0036926765461769307, 'i'),
 (0.012518367641350978, 'it')]
In [28]:
weights.sort()
In [29]:
weights[-10:]
Out[29]:
[(0.1559141004754898, 'easy'),
 (0.15603351131457813, 'keeps'),
 (0.1610770254760091, 'easily'),
 (0.1692466411661966, 'drank'),
 (0.1778341243724371, 'impressed'),
 (0.17854018204569164, 'summer'),
 (0.18035500950870653, 'wonderful'),
 (0.2460877952835916, 'always'),
 (0.2481145347490139, 'exceptional'),
 (3.505465514082479, 'constant_feat')]
In [30]:
wordId['skunky']
Out[30]:
685

Train a classifier

In [31]:
y_class = [d['beer/style'] == 'Hefeweizen' for d in data]
In [32]:
from sklearn.linear_model import LogisticRegression
In [33]:
clf = LogisticRegression() # default settings; newer sklearn versions prefer an explicit solver (e.g. solver='lbfgs')
In [34]:
clf.fit(X,y_class)
C:\Users\Julian McAuley\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Out[34]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
In [35]:
theta = clf.coef_[0]
In [36]:
weights = list(zip(theta, words + ['constant_feat']))
In [37]:
weights.sort()
In [38]:
weights[:10]
Out[38]:
[(-1.561609352964642, 'constant_feat'),
 (-1.0537430587850531, 'belgian'),
 (-0.875538202607064, 'be'),
 (-0.7992006710492214, 'hops'),
 (-0.7459797910147663, 'malt'),
 (-0.7261341453520288, 'hop'),
 (-0.6981962706956231, 'pumpkin'),
 (-0.6386303936099067, 'abv'),
 (-0.5796429217788306, 'caramel'),
 (-0.5651950715176873, 'grapefruit')]
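
Since positive labels (Hefeweizens) are presumably a small minority of the 5,000 reviews, raw accuracy can look deceptively high; the balanced error rate accounts for the imbalance. A sketch, reusing clf, X, and y_class from above:

pred = clf.predict(X)
accuracy = sum(p == a for p, a in zip(pred, y_class)) / len(y_class)

TP = sum(p and a for p, a in zip(pred, y_class))
FP = sum(p and not a for p, a in zip(pred, y_class))
TN = sum((not p) and (not a) for p, a in zip(pred, y_class))
FN = sum((not p) and a for p, a in zip(pred, y_class))
BER = 0.5 * (FP / (FP + TN) + FN / (FN + TP)) # balanced error rate
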
In [39]:
# Pipeline:
# Sort words by count
# Build feature vector
# Train model
# Build mapping between words and weights
# Show values of theta