import time
import datetime
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy
import urllib
import scipy.optimize
events = {}
# Data is from:
# https://s3.amazonaws.com/babs-open-data/babs_open_data_year_1.zip
# Extract the time information from the events
for f in [
# "201508_trip_data.csv",
# "201608_trip_data.csv",
"201408_babs_open_data/201408_trip_data.csv",
"201402_babs_open_data/201402_trip_data.csv"
]:
f = open(f, 'r')
l = f.readline()
for l in f:
l = l.split(',')
tripID = l[0]
timeString = l[2]
timeUnix = time.mktime(datetime.datetime.strptime(timeString, "%m/%d/%Y %H:%M").timetuple())
events[tripID] = [timeUnix, timeString]
# Find the earliest event
earliest = None
for event in events:
if earliest == None or events[event][0] < earliest[0]:
earliest = events[event]
earliestTime = earliest[0]
hourly = defaultdict(int)
# Count events by hour
for event in events:
t = events[event][0]
hour = int(t - earliestTime) // (60*60)
hourly[hour] += 1
f = open("hourly.json", 'w')
f.write(str(dict(hourly)) + '\n')
hourly = eval(open("hourly.json").read())
# Observations sorted by hour
hourlySorted = []
for h in hourly:
hourlySorted.append((h,hourly[h]))
hourlySorted.sort()
X = [x[0] for x in hourlySorted]
Y = [x[1] for x in hourlySorted]
# Plot the raw observation data
plt.plot(X,Y)
plt.show()
# Plot using a sliding window
sliding = []
wSize = 24*7
tSum = sum([x[0] for x in hourlySorted[:wSize]])
rSum = sum([x[1] for x in hourlySorted[:wSize]])
for i in range(wSize,len(hourlySorted)-1):
tSum += hourlySorted[i][0] - hourlySorted[i-wSize][0]
rSum += hourlySorted[i][1] - hourlySorted[i-wSize][1]
sliding.append((tSum*1.0/wSize,rSum*1.0/wSize))
X = [x[0] for x in sliding]
Y = [x[1] for x in sliding]
plt.plot(X,Y)
plt.show()
# Autoregressive features
def feature(hour):
previousHours = []
for i in [1,2,3,4,5,24,24*7,24*7*365]:
previousHour = hour - i
previousHourExists = previousHour in hourly
if previousHourExists:
# Use the feature if it doesn't exist
previousHours += [0, hourly[previousHour]]
else:
# Otherwise add a "missing value" indicator
previousHours += [1, 0]
return previousHours
X = [feature(x) for x in hourly]
y = [hourly[x] for x in hourly]
theta,residuals,rank,s = numpy.linalg.lstsq(X, y)
theta