import ast
import gzip
import math
import random
import string
from collections import defaultdict

import numpy
import scipy.optimize
from sklearn import linear_model
def parse(f):
    """Yield one record per line from a gzipped file of Python-literal dicts.

    Each line of the file is the repr of a dict (single-quoted strings, so
    not strict JSON).  Uses ast.literal_eval instead of eval: the lines are
    plain data literals, and literal_eval cannot execute arbitrary code,
    whereas eval on downloaded data is a code-injection risk.
    """
    for line in gzip.open(f):
        yield ast.literal_eval(line.decode("utf-8"))
# Download data from below:
# https://cseweb.ucsd.edu/classes/fa21/cse258-b/files/
# Load the full recipe dataset into memory (one dict per recipe).
dataset = list(parse("trainRecipes.json.gz"))
len(dataset)
# Notebook output of the cell above: the dataset has 200,000 recipes.
200000
# 75% / 12.5% / 12.5% train / validation / test split, taken in file order.
train = dataset[:150000]
valid = dataset[150000:175000]
test = dataset[175000:]
# Inspect one sample record to see the available fields.
dataset[1]
# Notebook output: a single recipe dict (name, minutes, contributor_id,
# submitted, steps, description, ingredients, recipe_id).
{'name': 'double delicious cookie bars', 'minutes': 40, 'contributor_id': '26865936', 'submitted': '2007-08-27', 'steps': 'preheat oven to 350f\tin 13x9-inch baking pan , melt butter in oven\tsprinkle crumbs evenly over butter\tpour milk evenly over crumbs\ttop with remaining ingredients\tpress down firmly\tbake 25-30 minutes or until lightly browned\tcool completely , chill if desired , and cut into bars', 'description': 'from "all time favorite recipes". for fun, try substituting butterscotch or white chocolate chips for the semi-sweet and/or peanut butter chips. make sure you cool it completely or the bottom will crumble!', 'ingredients': ['butter', 'graham cracker crumbs', 'sweetened condensed milk', 'semi-sweet chocolate chips', 'peanut butter chips'], 'recipe_id': '98015212'}
### Question 1
def feat1a(d):
def feat1b(d):
def feat1c(d):
def feat(d, a=True, b=True, c=True):
    """Build the feature vector for recipe d by concatenating the selected
    feature groups.

    The a/b/c flags "select" which groups (feat1a/feat1b/feat1c) are
    included, which makes the ablation experiments in Questions 1 and 2 easy.
    # assumes each feat1* returns a list of numbers -- TODO confirm
    """
    # Hint: for Questions 1 and 2, might be useful to set up a function like this
    # which allows you to "select" which features are included
    vector = []
    if a:
        vector.extend(feat1a(d))
    if b:
        vector.extend(feat1b(d))
    if c:
        vector.extend(feat1c(d))
    return vector
def MSE(y, ypred):
    """Return the mean squared error between labels y and predictions ypred.

    y, ypred: parallel sequences of numbers.
    Raises ZeroDivisionError if y is empty.
    """
    # Can use library if you prefer (e.g. sklearn.metrics.mean_squared_error);
    # implemented directly here to avoid the extra dependency.
    return sum((yi - pi) ** 2 for yi, pi in zip(y, ypred)) / len(y)
def experiment(mod, a=True, b=True, c=True):
    """Fit mod on the training set using the selected feature groups and
    return its MSE on the validation set.

    NOTE(review): the original stub listed `mod` after the defaulted
    parameters, which is a SyntaxError; it is moved to the front here,
    keeping the a/b/c defaults intact.
    NOTE(review): assumes the regression target is d['minutes'] -- confirm.
    """
    # Hint: might be useful to write this function which extracts features and
    # computes the performance of a particular model on those features
    X_train = [feat(d, a, b, c) for d in train]
    y_train = [d['minutes'] for d in train]
    mod.fit(X_train, y_train)
    X_valid = [feat(d, a, b, c) for d in valid]
    y_valid = [d['minutes'] for d in valid]
    return MSE(y_valid, mod.predict(X_valid))
### Question 2
### Question 3
def pipeline():
    """Question 3: sweep ridge-regression regularization strengths and
    return the list of (lambda, validation MSE) pairs.

    NOTE(review): `pipeline` is redefined at Question 6 below; in a flat
    script the later definition shadows this one -- rename one of them if
    both are needed at once.
    """
    results = []
    for lamb in [0.001, 0.01, 0.1, 1, 10, 100, 1000]:
        mod = linear_model.Ridge(alpha=lamb)
        results.append((lamb, experiment(mod)))
    return results
### Question 4
#(open ended)
### Question 5
def BER(predictions, y):
    """Balanced Error Rate: the average of the false-positive rate and the
    false-negative rate (0.0 = perfect, 1.0 = always wrong).

    predictions, y: parallel sequences of booleans (predicted / true labels).
    A rate whose denominator is zero (no positives, or no negatives, in the
    data) contributes 0.0 rather than raising ZeroDivisionError.
    """
    # Implement following this logic or otherwise
    TP = sum(p and l for p, l in zip(predictions, y))
    FP = sum(p and not l for p, l in zip(predictions, y))
    TN = sum(not p and not l for p, l in zip(predictions, y))
    FN = sum(not p and l for p, l in zip(predictions, y))
    fpr = FP / (FP + TN) if FP + TN else 0.0
    fnr = FN / (FN + TP) if FN + TP else 0.0
    return 0.5 * (fpr + fnr)
def feat2(d, dict_size, mostPopularInd):
    """One-hot encode recipe d's ingredients over the dict_size most
    popular ingredients.

    mostPopularInd maps ingredient name -> vector index.  'butter' is
    skipped, presumably because it is the label being predicted in this
    question -- confirm against the assignment text.
    """
    seen = set(d['ingredients']) - {'butter'}
    vec = [0] * dict_size
    for ing in seen:
        if ing in mostPopularInd:
            vec[mostPopularInd[ing]] = 1
    return vec
def experiment(reg: float = 1, dict_size: int = 50):
    """Run one Question 5 experiment (stub -- feature extraction not filled in).

    Intended flow per the hints: build feat2 features with a dict_size-sized
    one-hot ingredient encoding, then fit the logistic regression below with
    inverse-regularization strength `reg` and evaluate it (presumably with
    BER -- confirm).

    NOTE(review): this redefines `experiment` from Question 1; in a flat
    script the later definition shadows the earlier one -- rename one of
    them if both are needed at once.
    """
    # Hint: run an experiment with a particular regularization strength, and a particular one-hot encoding size
    # extract features...
    # (etc.)
    # class_weight='balanced' reweights the classes, which matters when the
    # evaluation metric is balanced (BER) rather than raw accuracy.
    mod = linear_model.LogisticRegression(C=reg, class_weight='balanced', solver = 'lbfgs')
    # (etc.)
### Question 6
def pipeline():
    """Question 6: grid-search over regularization strength C and one-hot
    dictionary size, returning {(C, dsize): experiment result}.

    NOTE(review): this redefines `pipeline` from Question 3; in a flat
    script the later definition shadows the earlier one -- rename one of
    them if both are needed at once.
    """
    results = {}
    for C in [0.01, 1, 100]:
        for dsize in [50, 100, 500]:
            # Example values, can pick any others...
            results[(C, dsize)] = experiment(reg=C, dict_size=dsize)
    return results
### Question 7
#(open ended)
### Question 8
# Utility data structures
# Map each recipe to its ingredient set, and each ingredient to the set of
# recipes that use it (needed for the Jaccard-similarity questions below).
ingsPerItem = defaultdict(set)
itemsPerIng = defaultdict(set)
for recipe in dataset:
    rid = recipe['recipe_id']
    for ing in recipe['ingredients']:
        ingsPerItem[rid].add(ing)
        itemsPerIng[ing].add(rid)
def Jaccard(s1, s2):
def mostSimilar8(i, N):
### Question 9
def mostSimilar9(i, N):
### Question 10
#(open ended)