In [1]:
import numpy
import urllib.request
import scipy.optimize
import random
from sklearn.decomposition import PCA # PCA library
from sklearn import linear_model
import ast
In [2]:
def parseData(fname):
  # urlopen yields bytes in Python 3, so decode each line before evaluating
  for l in urllib.request.urlopen(fname):
    yield ast.literal_eval(l.decode())
In [3]:
def parseDataFromFile(fname):
  # each line is a Python-dict-style record, hence ast.literal_eval
  # rather than json.loads
  for l in open(fname):
    yield ast.literal_eval(l)
In [4]:
# from "http://jmcauley.ucsd.edu/cse255/data/beer/beer_50000.json"
data = list(parseDataFromFile("C:/Users/Julian McAuley/Documents/class_files/beer_50000.json"))
In [5]:
data[0]
Out[5]:
{'review/appearance': 2.5,
 'beer/style': 'Hefeweizen',
 'review/palate': 1.5,
 'review/taste': 1.5,
 'beer/name': 'Sausa Weizen',
 'review/timeUnix': 1234817823,
 'beer/ABV': 5.0,
 'beer/beerId': '47986',
 'beer/brewerId': '10325',
 'review/timeStruct': {'isdst': 0,
  'mday': 16,
  'hour': 20,
  'min': 57,
  'sec': 3,
  'mon': 2,
  'year': 2009,
  'yday': 47,
  'wday': 0},
 'review/overall': 1.5,
 'review/text': 'A lot of foam. But a lot.\tIn the smell some banana, and then lactic and tart. Not a good start.\tQuite dark orange in color, with a lively carbonation (now visible, under the foam).\tAgain tending to lactic sourness.\tSame for the taste. With some yeast and banana.',
 'user/profileName': 'stcules',
 'review/aroma': 2.0}
In [6]:
X = [[x['review/overall'], x['review/taste'], x['review/aroma'], x['review/appearance'], x['review/palate']] for x in data]
In [7]:
X[:10]
Out[7]:
[[1.5, 1.5, 2.0, 2.5, 1.5],
 [3.0, 3.0, 2.5, 3.0, 3.0],
 [3.0, 3.0, 2.5, 3.0, 3.0],
 [3.0, 3.0, 3.0, 3.5, 2.5],
 [4.0, 4.5, 4.5, 4.0, 4.0],
 [3.0, 3.5, 3.5, 3.5, 3.0],
 [3.5, 4.0, 3.5, 3.5, 4.0],
 [3.0, 3.5, 2.5, 3.5, 2.0],
 [4.0, 4.0, 3.0, 3.5, 3.5],
 [4.5, 4.0, 3.5, 5.0, 4.0]]
In [8]:
pca = PCA(n_components=5)
In [9]:
pca.fit(X)
Out[9]:
PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)
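A quick check one could add here (not in the original notebook; explained_variance_ratio_ is a standard sklearn PCA attribute) is how much of the total variance each component captures:

# Fraction of total variance explained by each principal component;
# with n_components=5 on 5-dimensional data these sum to 1.
print(pca.explained_variance_ratio_)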
In [10]:
pca.components_
Out[10]:
array([[-0.48201817, -0.51561105, -0.44450326, -0.32114146, -0.44842921],
       [-0.46600016, -0.26162178,  0.31604305,  0.78071848, -0.07066358],
       [-0.18730152,  0.11729702,  0.78013615, -0.424744  , -0.4026657 ],
       [-0.5195416 , -0.09404298,  0.11758039, -0.31877216,  0.77832615],
       [ 0.49547654, -0.80193305,  0.28301251, -0.07296345,  0.16120371]])
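The rows of pca.components_ are the principal directions (psi in the slides), and they form an orthonormal basis. A minimal sketch, added here for illustration, confirms this:

# Rows of components_ are orthonormal, so the Gram matrix is the identity.
print(numpy.allclose(pca.components_ @ pca.components_.T, numpy.eye(5)))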
In [11]:
psi = numpy.matrix(pca.components_)  # note: plain arrays are preferred over numpy.matrix in current numpy
In [12]:
psi[4]
Out[12]:
matrix([[ 0.49547654, -0.80193305,  0.28301251, -0.07296345,  0.16120371]])
In [13]:
# Predict using reduced dimensions
In [14]:
transformed = numpy.matmul(X, pca.components_.T)  # project each rating vector onto the principal directions
In [15]:
transformed
Out[15]:
array([[-3.86094779e+00,  1.38645402e+00, -2.10592996e-01,
        -3.14657264e-01,  1.65737206e-01],
       [-6.41285778e+00,  7.37406508e-01, -7.41902212e-01,
        -1.68140794e-01,  5.28825439e-02],
       [-6.41285778e+00,  7.37406508e-01, -7.41902212e-01,
        -1.68140794e-01,  5.28825439e-02],
       ...,
       [-8.64135735e+00,  1.13807171e+00, -7.06880988e-01,
         8.15831922e-03, -5.31025986e-01],
       [-9.10461806e+00,  1.06309316e+00, -4.10463670e-01,
        -1.92822287e-01, -1.41781462e-01],
       [-9.08782162e+00,  9.60903966e-01, -5.62762940e-01,
        -4.05571600e-01,  5.06923333e-01]])
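Note that this matrix product does not subtract the fitted mean, so it differs from sklearn's own pca.transform(X) by a constant offset per column. A minimal sketch of the relationship (pca.mean_ is the standard sklearn attribute holding the per-feature mean):

# pca.transform centers the data first; after centering, the two agree.
Xc = numpy.array(X) - pca.mean_
print(numpy.allclose(pca.transform(X), Xc @ pca.components_.T))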
In [16]:
# "y" in the slides: keep only the first four principal components
reduced = [x[:4] for x in transformed]
In [17]:
# ABV > 7.0
y = [d['beer/ABV'] > 7 for d in data]
In [18]:
sum(y)
Out[18]:
23314
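So 23,314 of the 50,000 reviews (about 47%) have ABV above 7; the two classes are roughly balanced:

sum(y) / len(y)  # 0.46628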
In [19]:
mod = linear_model.LogisticRegression()
In [20]:
mod.fit(X, y)
C:\Users\Julian McAuley\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Out[20]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
In [21]:
correct = mod.predict(X) == y
In [22]:
sum(correct) / len(correct)
Out[22]:
0.68822
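For context (an added aside, not in the original notebook): always predicting the majority class (False) would already be right about 53% of the time, so the model does beat the trivial baseline.

max(sum(y), len(y) - sum(y)) / len(y)  # 0.53372, majority-class baseline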
In [23]:
mod.fit(reduced, y)
C:\Users\Julian McAuley\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Out[23]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
In [24]:
correct = mod.predict(reduced) == y
In [25]:
sum(correct) / len(correct)
Out[25]:
0.67562
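Using only four principal components costs little accuracy (0.67562 vs. 0.68822 with all five rating dimensions). A minimal sketch, not from the original notebook, of sweeping the number of retained components to see this trade-off (training accuracy on the same data and model as above):

for k in range(1, 6):
    Xk = transformed[:, :k]  # keep the first k principal components
    mod_k = linear_model.LogisticRegression(solver='lbfgs')
    mod_k.fit(Xk, y)
    print(k, sum(mod_k.predict(Xk) == y) / len(y))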