import ast
import random
import urllib
import urllib.request

import numpy
import scipy.optimize
from sklearn import linear_model
from sklearn.decomposition import PCA  # PCA library
def parseData(fname):
    """Yield one parsed Python literal (one JSON-like record) per line of the URL *fname*.

    Fix: ``urllib.urlopen`` is Python 2 only; in Python 3 it lives in
    ``urllib.request``.  ``urlopen`` yields ``bytes`` lines, so decode
    before handing them to ``ast.literal_eval``.
    """
    for line in urllib.request.urlopen(fname):
        yield ast.literal_eval(line.decode("utf-8"))
def parseDataFromFile(fname):
    """Yield one parsed Python literal (one JSON-like record) per line of file *fname*.

    Fix: the original opened the file without ever closing it; the
    context manager guarantees the handle is released even if the
    consumer abandons the generator early.
    """
    with open(fname) as f:
        for line in f:
            yield ast.literal_eval(line)
# Dataset: http://jmcauley.ucsd.edu/cse255/data/beer/beer_50000.json
data = list(parseDataFromFile("C:/Users/Julian McAuley/Documents/class_files/beer_50000.json"))
data[0]  # NOTE(review): notebook residue -- a bare expression prints nothing in a script

# Feature matrix: the five rating facets (1-5 stars) of each review.
X = [[x['review/overall'], x['review/taste'], x['review/aroma'],
      x['review/appearance'], x['review/palate']] for x in data]
X[:10]

# Full-rank PCA: 5 components for the 5 rating features.
pca = PCA(n_components=5)
pca.fit(X)
pca.components_

# Fix: numpy.matrix is deprecated; a plain 2-D ndarray behaves the same here.
psi = numpy.asarray(pca.components_)
psi[4]  # last (lowest-variance) principal direction

# Project the data onto the principal directions ("y" in the slides):
# each row of pca.components_ is one direction, so project with components_.T.
transformed = numpy.matmul(X, pca.components_.T)
transformed

# Dimensionality reduction: keep only the first 4 of the 5 components.
reduced = [x[:4] for x in transformed]

# Binary label: does the beer have ABV > 7.0?
y = [d['beer/ABV'] > 7 for d in data]
sum(y)  # number of positive examples

# Baseline: logistic regression on the full 5-d rating features.
mod = linear_model.LogisticRegression()
mod.fit(X, y)
correct = mod.predict(X) == y
sum(correct) / len(correct)  # training accuracy on the full features

# Same classifier on the 4-d PCA-reduced features, for comparison.
mod.fit(reduced, y)
correct = mod.predict(reduced) == y
sum(correct) / len(correct)  # training accuracy on the reduced features