import numpy
import urllib
import scipy.optimize
import random
from collections import defaultdict # Dictionaries with default values
import nltk
import string
from nltk.stem.porter import *
from sklearn import linear_model
import ast
def parseData(fname):
    """Yield one Python-literal record per line fetched from a URL.

    Fix: `urllib.urlopen` is Python 2 only; in Python 3 the function lives
    in `urllib.request`.  `urlopen` also yields *bytes* lines, so decode
    before handing them to `ast.literal_eval`.
    """
    import urllib.request  # local import: py3 home of urlopen
    for l in urllib.request.urlopen(fname):
        yield ast.literal_eval(l.decode('utf-8'))
def parseDataFromFile(fname):
    """Yield one Python-literal record per line of a local file.

    Fix: the original left the file handle open until garbage collection
    (a leak if the consumer abandons the generator); the context manager
    guarantees it is closed.
    """
    with open(fname) as f:
        for l in f:
            yield ast.literal_eval(l)
#data = list(parseData("http://jmcauley.ucsd.edu/cse255/data/beer/beer_50000.json"))
# Load the 50k-review dump from a local JSON-lines file and keep the first
# 5000 records for speed.  NOTE(review): hard-coded absolute path — adjust
# for your machine (or switch back to the URL variant above).
data_ = list(parseDataFromFile("C:/Users/Julian McAuley/Documents/class_files/beer_50000.json"))
data = data_[:5000]
# Global word-frequency state, filled by the counting loop below.
wordCount = defaultdict(int)  # word -> total occurrences across all reviews
totalWords = 0                # total token count (with repetition)
punct = string.punctuation    # characters stripped before tokenizing
stemmer = PorterStemmer()     # available but unused (stemming line is commented out below)
data[0]  # notebook-style peek at the first record; no effect when run as a script
# Tokenize every review — lowercase, strip punctuation, split on whitespace —
# and accumulate the global word / token counts.
for review in data:
    text = review['review/text'].lower()
    text = ''.join(ch for ch in text if ch not in punct)
    tokens = text.strip().split()
    totalWords += len(tokens)
    for token in tokens:
        wordCount[token] += 1
# Rank words by frequency, most common first, and keep the top 1000 as the
# model vocabulary.
counts = [(n, w) for w, n in wordCount.items()]
counts.sort(reverse=True)
words = [w for _, w in counts[:1000]]
# Index and fast-membership structures over the vocabulary.
wordId = {w: i for i, w in enumerate(words)}
wordSet = set(words)
wordId['murky']  # notebook peek; raises KeyError if 'murky' missed the top 1000
def feature(datum):
    """Bag-of-words count vector over the 1000-word vocabulary, with a
    trailing constant (offset) feature appended."""
    vec = [0] * len(wordSet)
    text = datum['review/text'].lower()
    text = ''.join(ch for ch in text if ch not in punct)
    for tok in text.strip().split():
        if tok in wordSet:
            vec[wordId[tok]] += 1
    vec.append(1)  # constant feature stands in for the intercept
    return vec
# Design matrix and targets for ridge regression on the overall rating.
X = [feature(d) for d in data]
y = [d['review/overall'] for d in data]
# fit_intercept=False because feature() already appends a constant column.
clf = linear_model.Ridge(1.0, fit_intercept=False)  # MSE + 1.0 l2
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)
# Pair each learned weight with its word (the final entry is the offset
# term); ascending sort puts the most negative/positive words at the ends.
weights = sorted(zip(theta, words + ['constant_feat']))
wordId['skunky']  # notebook peek; raises KeyError if 'skunky' missed the top 1000
# Binary classification target: is the review of a Hefeweizen?
y_class = [d['beer/style'] == 'Hefeweizen' for d in data]
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X, y_class)
theta = clf.coef_[0]
# Ascending sort: the head of the list holds the words whose weights most
# strongly argue *against* the Hefeweizen class.
weights = sorted(zip(theta, words + ['constant_feat']))
# Pipeline:
# Sort words by count
# Build feature vector
# Train model
# Build mapping between words and weights
# Show values of theta