import gzip
from collections import defaultdict
import math
import scipy.optimize
import numpy
import string
import random
from sklearn import linear_model
def parse(f):
    for l in gzip.open(f):
        yield eval(l)
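# eval on each line works but will execute arbitrary code; since every line
# is a plain Python-literal dict, ast.literal_eval is a safer drop-in
# (a sketch, not part of the original pipeline; parse_safe is my name):
import ast
def parse_safe(f):
    for l in gzip.open(f):
        yield ast.literal_eval(l.decode('utf-8'))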
# Download data from below:
# https://cseweb.ucsd.edu/classes/fa21/cse258-b/files/
dataset = list(parse("trainRecipes.json.gz"))
len(dataset)
200000
train = dataset[:150000]
valid = dataset[150000:175000]
test = dataset[175000:]
dataset[1]
{'name': 'double delicious cookie bars', 'minutes': 40, 'contributor_id': '26865936', 'submitted': '2007-08-27', 'steps': 'preheat oven to 350f\tin 13x9-inch baking pan , melt butter in oven\tsprinkle crumbs evenly over butter\tpour milk evenly over crumbs\ttop with remaining ingredients\tpress down firmly\tbake 25-30 minutes or until lightly browned\tcool completely , chill if desired , and cut into bars', 'description': 'from "all time favorite recipes". for fun, try substituting butterscotch or white chocolate chips for the semi-sweet and/or peanut butter chips. make sure you cool it completely or the bottom will crumble!', 'ingredients': ['butter', 'graham cracker crumbs', 'sweetened condensed milk', 'semi-sweet chocolate chips', 'peanut butter chips'], 'recipe_id': '98015212'}
minYear = min([int(d['submitted'][:4]) for d in dataset])
maxYear = max([int(d['submitted'][:4]) for d in dataset])
ingredientCount = defaultdict(int)
for d in dataset:
    for i in d['ingredients']:
        ingredientCount[i] += 1
counts = [(ingredientCount[i], i) for i in ingredientCount]
counts.sort(reverse=True)
mostPopular = [x[1] for x in counts[:50]]
mostPopularInd = dict(zip(mostPopular,range(len(mostPopular))))
counts[:10]
[(74043, 'salt'), (47753, 'butter'), (38205, 'sugar'), (33826, 'onion'), (29801, 'water'), (29316, 'eggs'), (28554, 'olive oil'), (22749, 'flour'), (22420, 'milk'), (22157, 'garlic cloves')]
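# The same top-50 list can be built more idiomatically with
# collections.Counter (a sketch; ties may order differently than the
# (count, name) sort above, so mostPopular_alt keeps its own name):
from collections import Counter
ingredientCounter = Counter(i for d in dataset for i in d['ingredients'])
mostPopular_alt = [i for (i, c) in ingredientCounter.most_common(50)]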
### Question 1
def feat1a(d):
    return [len(d['steps']), len(d['ingredients'])]
def feat1b(d):
    fYear = [0] * (maxYear - minYear)
    y = int(d['submitted'][:4]) - minYear
    if y > 0:
        fYear[y - 1] = 1
    fMonth = [0] * 11
    m = int(d['submitted'][5:7]) - 1
    if m > 0:
        fMonth[m - 1] = 1
    return fYear + fMonth
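# feat1b one-hot encodes year and month, leaving the first category of each
# (minYear, January) as the all-zero baseline so the encoding is not
# collinear with the constant feature added in feat1 below.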
def feat1c(d):
    fIng = [0] * 50
    for i in d['ingredients']:
        if i in mostPopularInd:
            fIng[mostPopularInd[i]] = 1
    return fIng
def feat1(d, a=True, b=True, c=True):
    feat = [1]
    if a:
        feat += feat1a(d)
    if b:
        feat += feat1b(d)
    if c:
        feat += feat1c(d)
    return feat
def MSE(y, ypred):
    return sum([(a - b)**2 for (a, b) in zip(y, ypred)]) / len(y)
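# Equivalent vectorized form using numpy (already imported); a cross-check
# sketch, identical up to floating-point summation order:
def MSE_np(y, ypred):
    diff = numpy.array(y) - numpy.array(ypred)
    return (diff ** 2).mean()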
def experiment(a=True, b=True, c=True, mod=linear_model.LinearRegression()):
    Xtrain = [feat1(d, a, b, c) for d in train]
    ytrain = [d['minutes'] for d in train]
    Xvalid = [feat1(d, a, b, c) for d in valid]
    yvalid = [d['minutes'] for d in valid]
    Xtest = [feat1(d, a, b, c) for d in test]
    ytest = [d['minutes'] for d in test]
    mod.fit(Xtrain, ytrain)
    validPred = mod.predict(Xvalid)
    testPred = mod.predict(Xtest)
    validMSE = MSE(yvalid, validPred)
    testMSE = MSE(ytest, testPred)
    return validMSE, testMSE
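# Note: the default mod=LinearRegression() is built once, at definition
# time, and reused across calls; harmless here since fit() re-estimates
# from scratch, but passing a fresh model (as pipeline() does below) is
# the safer habit.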
experiment(a = True, b = False, c = False)
(6038.828856645362, 6169.549296366477)
experiment(a = False, b = True, c = False)
(6259.722112303221, 6396.833687711832)
experiment(a = False, b = False, c = True)
(5800.089393402364, 6000.948439855964)
### Question 2
experiment(a = True, b = True, c = True)
(5680.854075533758, 5861.253905671344)
experiment(a = False, b = True, c = True)
(5792.517969633835, 5992.663510100706)
experiment(a = True, b = False, c = True)
(5691.206767869182, 5870.115061656082)
experiment(a = True, b = True, c = False)
(6022.124892928371, 6157.7540943662)
### Question 3
def pipeline():
    for lamb in [0.001, 0.01, 0.1, 1, 10, 100, 1000]:
        mod = linear_model.Ridge(alpha=lamb)
        print("lamb = " + str(lamb))
        print(experiment(True, True, True, mod))
pipeline()
lamb = 0.001
(5680.854068641046, 5861.253934605403)
lamb = 0.01
(5680.854006645431, 5861.254194968507)
lamb = 0.1
(5680.853390511796, 5861.256794290246)
lamb = 1
(5680.847599737167, 5861.282364316499)
lamb = 10
(5680.817805211008, 5861.502338522176)
lamb = 100
(5681.206429355591, 5862.475961451593)
lamb = 1000
(5689.903073676705, 5868.098948867883)
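# Picking lambda by validation MSE programmatically (a sketch layered on
# the functions above; bestLambda is my name):
def bestLambda(lambs=[0.001, 0.01, 0.1, 1, 10, 100, 1000]):
    scores = [(experiment(True, True, True, linear_model.Ridge(alpha=l))[0], l)
              for l in lambs]
    return min(scores)[1]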
### Question 4
#(open ended)
### Question 5
def BER(predictions, y):
    TP = sum([(p and l) for (p, l) in zip(predictions, y)])
    FP = sum([(p and not l) for (p, l) in zip(predictions, y)])
    TN = sum([(not p and not l) for (p, l) in zip(predictions, y)])
    FN = sum([(not p and l) for (p, l) in zip(predictions, y)])
    TPR = TP / (TP + FN)
    TNR = TN / (TN + FP)
    ber = 1 - 1/2 * (TPR + TNR)
    return ber
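# Cross-check: balanced accuracy is (TPR + TNR) / 2, so BER is its
# complement (a sketch, assuming sklearn >= 0.20 for
# sklearn.metrics.balanced_accuracy_score):
from sklearn.metrics import balanced_accuracy_score
def BER_sklearn(predictions, y):
    return 1 - balanced_accuracy_score(y, predictions)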
def feat2(d, dict_size, mostPopularInd):
    fIng = [0] * dict_size
    for i in d['ingredients']:
        if i == 'butter':
            continue
        if i in mostPopularInd:
            fIng[mostPopularInd[i]] = 1
    return fIng
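# Skipping 'butter' in feat2 matters: the label below is exactly whether
# 'butter' appears in the ingredient list, so keeping it as a feature
# would leak the label into the input.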
# Redefines experiment() for the classification task: predict whether a
# recipe contains butter.
def experiment(reg=1, dict_size=50):
    mostPopular = [x[1] for x in counts[:dict_size]]
    mostPopularInd = dict(zip(mostPopular, range(len(mostPopular))))
    Xtrain = [feat2(d, dict_size, mostPopularInd) for d in train]
    ytrain = ['butter' in d['ingredients'] for d in train]
    Xvalid = [feat2(d, dict_size, mostPopularInd) for d in valid]
    yvalid = ['butter' in d['ingredients'] for d in valid]
    Xtest = [feat2(d, dict_size, mostPopularInd) for d in test]
    ytest = ['butter' in d['ingredients'] for d in test]
    mod = linear_model.LogisticRegression(C=reg, class_weight='balanced', solver='lbfgs')
    mod.fit(Xtrain, ytrain)
    validPred = mod.predict(Xvalid)
    testPred = mod.predict(Xtest)
    validBER = BER(validPred, yvalid)
    testBER = BER(testPred, ytest)
    return validBER, testBER
experiment(1, 50)
(0.2894992495605814, 0.28898437523315856)
### Question 6
def pipeline():
    for C in [0.01, 1, 100]:
        for dsize in [50, 100, 500]:
            print("C = " + str(C) + " dsize = " + str(dsize))
            print(experiment(reg=C, dict_size=dsize))
pipeline()
C = 0.01 dsize = 50
(0.29064380312920035, 0.2896532604677795)
C = 0.01 dsize = 100
(0.26484628915995734, 0.2665027575416885)
C = 0.01 dsize = 500
(0.23055365206358203, 0.2303930628502353)
C = 1 dsize = 50
(0.2894992495605814, 0.28898437523315856)
C = 1 dsize = 100
(0.2645598522496887, 0.2663601266876059)
C = 1 dsize = 500
/usr/lib/python3/dist-packages/sklearn/linear_model/logistic.py:758: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.
  "of iterations.", ConvergenceWarning)
(0.2251436258445605, 0.22554565672439542)
C = 100 dsize = 50
(0.28955638943956696, 0.28895820818270845)
C = 100 dsize = 100
(0.2645598522496887, 0.26627526585936434)
C = 100 dsize = 500
/usr/lib/python3/dist-packages/sklearn/linear_model/logistic.py:758: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.
  "of iterations.", ConvergenceWarning)
(0.22498945001865567, 0.22558527147151275)
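# The ConvergenceWarnings above come from lbfgs hitting its default
# iteration cap (max_iter=100 in this sklearn version). The usual fix is
# to raise it, e.g. (a sketch, not re-run here):
# mod = linear_model.LogisticRegression(C=reg, class_weight='balanced',
#                                       solver='lbfgs', max_iter=1000)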
### Question 7
#(open ended)
# Utility data structures
ingsPerItem = defaultdict(set)
itemsPerIng = defaultdict(set)
for d in dataset:
    r = d['recipe_id']
    for i in d['ingredients']:
        ingsPerItem[r].add(i)
        itemsPerIng[i].add(r)
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    return numer / denom
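# e.g. Jaccard({'a','b','c'}, {'b','c','d'}) == 2/4 == 0.5. The denominator
# is zero only when both sets are empty, which cannot happen in the loops
# below: every set stored in ingsPerItem / itemsPerIng is non-empty by
# construction.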
def mostSimilar(i, N):
    similarities = []
    ings = ingsPerItem[i]
    for i2 in ingsPerItem:
        if i2 == i: continue
        sim = Jaccard(ings, ingsPerItem[i2])
        similarities.append((sim, i2))
    similarities.sort(reverse=True)
    return similarities[:N]
mostSimilar('06432987', 5)
[(0.4166666666666667, '68523854'), (0.38461538461538464, '12679596'), (0.36363636363636365, '79675099'), (0.36363636363636365, '56301588'), (0.35714285714285715, '87359281')]
# Same idea in the other direction: most similar ingredients, measured by
# the recipes they appear in.
def mostSimilar(i, N):
    similarities = []
    items = itemsPerIng[i]
    for i2 in itemsPerIng:
        if i2 == i: continue
        sim = Jaccard(items, itemsPerIng[i2])
        similarities.append((sim, i2))
    similarities.sort(reverse=True)
    return similarities[:N]
mostSimilar('butter', 5)
[(0.22315311514274808, 'salt'), (0.2056685424969639, 'flour'), (0.19100394157199166, 'eggs'), (0.17882420717656095, 'sugar'), (0.17040052045973944, 'milk')]