%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os
import glob
import geopy
from geopy.distance import vincenty
from pandas.plotting import scatter_matrix
import seaborn as sns
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from IPython.display import Image
from IPython.core.display import HTML
from IPython.display import HTML
from pylab import rcParams
import itertools
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import acf, pacf, adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, r2_score
import warnings
# Silence library warnings (pandas/statsmodels emit many deprecation notices).
warnings.filterwarnings("ignore")
# Enable plotly offline mode so charts render inside the notebook.
init_notebook_mode(connected=True)
# Inject a jQuery toggle that hides/shows all input cells in the rendered
# notebook. Purely cosmetic; has no effect on the analysis.
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit"
value="Click here to toggle on/off the raw code." style="background: #76B3FA; border-radius: 100px; padding: 15px 40px; color: #fff; text-decoration: none; font-size: 1.45em; margin: 0 15px;"></form>''')
Biking is fun and a green way to move around the city! I love commuting to work by bike and going on bike rides with my friends on weekends.
With this analysis, I'm trying to answer if weather impacts bike-share habits and how.
I started with data preprocessing: cleaning and merging the datasets. Then I analyzed the data and correlation between weather and bike-share rides with EDA and correlation analysis. As the last step, I investigated if the weather is a good predictor for bike share rides. I built a time series model to predict bike-share trips based on historical data (SARIMA) and compare it with a model which considers weather (SARIMAX).
STEPS:
What is not in the scope of this project:
# Load NOAA daily/hourly weather observations for San Francisco.
wx = pd.read_csv('tmp/NOAA_SF_weather.csv')
# Checking count of hourly data points
wx['DATE']= pd.to_datetime(wx['DATE'])
#wx.DATE.groupby(wx['DATE'].dt.date).count().hist(bins=30, color='skyblue')
# Most dates have only one row, so columns and rows associated with hourly
# data are dropped below.
# Dropping hourly columns:
wx.drop(columns=['HOURLYPrecip', 'HOURLYAltimeterSetting'], inplace=True)
# Dropping rows with hourly data (NA) and removing the timestamp component
wx.dropna(inplace=True)
wx['DATE'] = wx.DATE.dt.date
# Removing a couple of other columns to narrow the scope
wx.drop(columns=['DAILYDeptFromNormalAverageTemp','DAILYHeatingDegreeDays','DAILYCoolingDegreeDays'], inplace=True)
# Daylight duration feature.
# NOTE(review): if DAILYSunrise/DAILYSunset are HHMM-encoded integers
# (e.g. 1830 for 6:30 pm), plain subtraction is NOT minutes — confirm the
# column encoding in the NOAA export before trusting the units.
wx['daylight'] = wx['DAILYSunset'] - wx['DAILYSunrise']
# Day-over-day change in average temperature; the first row has no previous
# day, so its NaN is replaced with 0.
wx['daily_temp_diff'] = wx['DAILYAverageDryBulbTemp'] - wx['DAILYAverageDryBulbTemp'].shift(1)
wx['daily_temp_diff'].fillna(0, inplace = True)
# Renaming columns to short snake_case names
wx.rename(columns= {'DATE': 'date', 'DAILYMaximumDryBulbTemp':'max_temp', 'DAILYMinimumDryBulbTemp': 'min_temp',
'DAILYAverageDryBulbTemp': 'avg_temp', 'DAILYSunrise': 'sunrise', 'DAILYSunset': 'sunset',
'DAILYPrecip': 'precip'}, inplace=True)
# Checking the descriptive statistics. Nothing looks suspicious.
wx.describe()
# Persist the cleaned daily weather table for later merging.
wx.to_csv('wx.csv', index=False)
# Bike-share trip files for 2017 and 2018, matching the weather data range.
# Combine all per-period CSVs into a single file.
all_bike_files = [i for i in glob.glob('*[0-9]*.*')]
combined_csv = pd.concat([pd.read_csv(f) for f in all_bike_files])
combined_csv.to_csv("bike_shares.csv", index=False, encoding='utf-8-sig')
bike = pd.read_csv('bike_shares.csv')
print('df shape:')
bike.shape
# Trip date (drop the time-of-day component).
bike['date']= pd.to_datetime(bike['start_time']).dt.date
# Trip distance in miles between start and end stations.
# FIX: geopy.distance.vincenty was deprecated and removed in geopy 2.0;
# geodesic is the recommended drop-in replacement (same call signature,
# WGS-84 ellipsoid by default).
from geopy.distance import geodesic
bike['distance'] = bike.apply(lambda x: geodesic((x['start_station_latitude'], x['start_station_longitude']),
(x['end_station_latitude'], x['end_station_longitude'])).miles, axis=1)
# One-hot encode user_type into Customer/Subscriber indicator columns.
df_user = pd.get_dummies(bike['user_type'])
bike = pd.concat([bike, df_user], axis=1)
# Drop columns that are out of scope for the daily weather analysis:
bike.drop(columns=['bike_share_for_all_trip', 'end_station_id', 'end_station_latitude',
'end_station_longitude', 'end_station_name', 'end_time', 'start_station_id', 'start_station_latitude',
'start_station_longitude', 'start_station_name', 'member_birth_year', 'start_time', 'member_gender'], inplace=True)
# Reorder the remaining columns (the original also computed
# bike.columns.tolist() first, but immediately overwrote it — removed).
cols = ['date', 'bike_id',
'duration_sec', 'distance', 'user_type', 'Customer', 'Subscriber']
bike = bike[cols]
# Checking missing values
bike.isnull().sum()
bike.describe()
# Left-join daily weather onto each trip, then drop dates with no weather.
bike_wx = pd.merge(bike, wx, how='left', on='date')
bike_wx.dropna(subset=['max_temp'], how='all', inplace = True)
bike_wx.to_csv('bike_wx.csv', index=False)
# Transform the data to daily frequency for two reasons:
# 1) the weather data set contains only daily data
# 2) it is more efficient to model correlation and time series forecasts
#    on aggregated data.
# Aggregating bike data by date and creating features.
# NOTE(review): dict-of-dict aggregation (column -> {new_name: func}) was
# deprecated in pandas 0.25 and removed in 1.0. It also produces MultiIndex
# columns, and the manual positional renaming below depends on that exact
# layout — if pandas is upgraded, this cell and the renaming must change
# together (named aggregation would flatten the columns).
bike_gr = bike.groupby(by='date', as_index=False).agg({'bike_id': {'count_shares': 'count'},
'duration_sec': {'avg_duration':'mean'},
'distance': {'avg_distance': 'mean'},
'Customer': {'count_customer': 'sum'},
'Subscriber': {'count_subscriber': 'sum'}})
# Inner join with daily weather: keeps only dates present in both tables.
bt = pd.merge(bike_gr, wx, how='inner', on='date')
# Drop the join key, then rebuild flat column names positionally:
# the first six become the bike features, the rest keep their weather names.
bt.drop(columns=['date'], inplace=True)
cols = bt.columns.tolist()
cols = ['date', 'count_shares', 'avg_duration', 'avg_distance', 'count_customer', 'count_subscriber'] + cols[6:]
bt.columns = cols
bt.drop(columns=['sunrise', 'sunset'], inplace=True)
# Total ride time per day (seconds) = mean duration * number of rides.
bt['total_dur'] = bt.avg_duration * bt.count_shares
# Move total_dur right after the other bike metrics.
cols = bt.columns.tolist()
cols = cols[0:4] +cols[-1:] + cols[4:-1]
bt = bt[cols]
# Persist, then re-read both tables so later cells can start from the CSVs
# (note: after the re-read, 'date' is a plain string column).
bt.to_csv('bike_sharing_weather_daily.csv', index=False)
bt = pd.read_csv('bike_sharing_weather_daily.csv')
bike_wx = pd.read_csv('bike_wx.csv')
To understand the type of data we are dealing with I used histograms of each numerical attribute.
# Distribution of every numeric attribute of interest.
numeric_cols = [
    'avg_distance', 'avg_duration', 'total_dur', 'count_shares',
    'avg_temp', 'daylight', 'daily_temp_diff', 'precip',
]
bt[numeric_cols].hist(bins=50, figsize=(20, 15), color='skyblue')
plt.show()
# Share of days with no recorded precipitation: one minus the ratio of
# rainy-day row counts to total row counts (first column of the ratio).
rainy_share = (bt[bt.precip > 0].count() / bt.count()).iloc[0]
no_rain = 1 - rainy_share
no_rain
A few things in these histograms:
Next, I plotted and analyzed weather factors such as Temperature, Precipitation, Daylight, and their impact on bike-share rides as well as take a look on differences in the behavior of Subscribers vs. Customers in context of weather conditions.
I categorized weather conditions splitting it into Cold (< 53F), Good (between 53F and 77F) and Hot (>77F).
def wx_feel(row):
    """Categorize a row's average temperature into a subjective weather feel.

    Args:
        row: mapping (e.g. DataFrame row) with an 'avg_temp' key holding the
            daily average temperature in Fahrenheit.

    Returns:
        'cold' for avg_temp < 53, 'good' for 53 <= avg_temp <= 77,
        'hot' for avg_temp > 77. The thresholds are admittedly subjective.
    """
    temp = row['avg_temp']
    if temp < 53:
        return 'cold'
    # The first branch already guarantees temp >= 53, so the original's
    # `temp >= 53 and temp <= 77` check was redundant.
    if temp <= 77:
        return 'good'
    return 'hot'
# Tag every trip with the day's weather-feel category and persist it.
bike_wx['wx_feel'] = bike_wx.apply(wx_feel, axis=1)
bike_wx.to_csv('bike_wx.csv', index=False)
# Number of distinct days in the data set.
days = list(bike_wx.groupby('date').nunique().count())[0]
# Average daily ride counts per weather-feel, split by rain / no rain.
# NOTE(review): both series are divided by the TOTAL number of days, not by
# the number of rainy (resp. dry) days — verify that is the intended
# average before comparing the bars.
rain_ct = list(bike_wx[bike_wx.precip>0].groupby('wx_feel').bike_id.count()/days)
no_rain_ct= list(bike_wx[bike_wx.precip<=0].groupby('wx_feel').bike_id.count()/days)
#avg = bike_wx.bike_id.count()/ days
# Defining a color palette reused by all plotly charts below
light_blue = 'rgb(142, 212, 229)'
pink = 'rgb(231, 84, 128)'
green = 'rgb(199, 204, 118)'
grey = 'rgb(169,169,169)'
yellow = 'rgb(248, 222, 126)'
# Stacked-bar chart: average daily ride count per weather-feel category,
# one bar group for rainy days and one for dry days.
trace1 = go.Bar(
x=['Cold & π§', 'Good & π§', 'Hot & π§'],
y=rain_ct, marker = dict(color=light_blue),
name='Rain')
trace2 = go.Bar(
x=['Cold & π€', 'Good & π€', 'Hot & π€'],
y=no_rain_ct,
name='No Rain', marker = dict(color=yellow))
# trace3 = go.Scatter(
# x=['Cold & π§', 'Good & π§','Cold & π€', 'Good & π€', 'Hot & π€'],
# y=[avg]*5,
# name='Overall average', marker = dict(color=pink))
data = [trace1, trace2]
layout = go.Layout(
barmode='stack', title = 'Daily average number of bike share rides by weather and precipitation', xaxis=dict(title='Weather condition'),
yaxis=dict(
title='Average number of bike share rides'),)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename = "Weather")
Looking at the average number of bike-share rides by weather condition we can conclude that people use bike-share more often when the weather is good.
# Average ride duration (converted from seconds to minutes) per
# weather-feel category, split into rainy vs. dry days.
rain_avg_dur = list(bike_wx[bike_wx.precip>0].groupby('wx_feel').duration_sec.mean()/60)
no_rain_avg_dur = list(bike_wx[bike_wx.precip<=0].groupby('wx_feel').duration_sec.mean()/60)
#(np.mean(np.array(no_rain_avg_dur)) - np.mean(np.array(rain_avg_dur)))/np.mean(np.array(rain_avg_dur))
trace1 = go.Bar(
x=['Cold & π§', 'Good & π§', 'Hot & π§'],
y=rain_avg_dur, marker = dict(color=light_blue),
name='Rain')
trace2 = go.Bar(
x=['Cold & π€', 'Good & π€', 'Hot & π€'],
y=no_rain_avg_dur,
name='No Rain', marker = dict(color=yellow))
data = [trace1, trace2]
layout = go.Layout(
barmode='stack', title = 'Average duration of a bike share ride (in minutes) by weather and precipitation', xaxis=dict(title='Weather condition'),
yaxis=dict(
title='Average duration of a bike share ride'),)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename = "Weather")
Temperature variation on rainy days does not impact ride duration; on the other hand, hot days with no rain see longer rides.
For temperature variability I'm analyzing the weekly changes (weekends included).
# Weekly sampling: every 7th distinct date (offset by 5 so samples fall on
# the same weekday), then week-over-week relative changes are compared.
# NOTE(review): bt['date'] comes from a CSV re-read, so both sides of the
# isin() below must be the same type (both strings here) — confirm if the
# load order of earlier cells ever changes.
dates = [i[0] for i in bike_wx.groupby('date')]
dates_w = dates[5:]
dates_w = [dates_w[i] for i in range(0, len(dates_w), 7)]
weekly = bt[bt['date'].isin(dates_w)]
# Week-over-week fractional change of temperature, ride count and duration;
# the first element (no previous week) is dropped.
temp_diff = list((weekly.avg_temp-weekly.avg_temp.shift(1))/ weekly.avg_temp.shift(1))[1:]
rides_diff = list((weekly.count_shares-weekly.count_shares.shift(1))/ weekly.count_shares.shift(1))[1:]
avg_dur_diff = list((weekly.avg_duration-weekly.avg_duration.shift(1))/ weekly.avg_duration.shift(1))[1:]
trace1 = go.Scatter(x=dates_w[1:],
y=temp_diff, name='Delta in temperature', marker = dict(color=pink))
trace2 = go.Bar(
x=dates_w[1:],
y=rides_diff, name='Number of rides', marker = dict(color=yellow))
trace3 = go.Bar(
x=dates_w[1:],
y=avg_dur_diff, name='Average ride duration', marker = dict(color=light_blue))
data = [trace1, trace2, trace3]
layout = go.Layout(legend=dict(x=1.1, y=1), title = 'Impact of temperature drop/rise on increase/decrease in Bike Share rides',
xaxis=dict(title='Time'), yaxis=dict(title='in %', titlefont=dict(),tickfont=dict()))
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename = "Weather")
This data is not very helpful. You can see that in most cases the number of rides and the average ride duration follow the weather variability pattern. But there are also other factors at play — for example, the peak in average ride duration falls on December 25th (Christmas Day).
I assume that when it's getting dark early then, people bike less, let's take a look.
# Daily aggregates plotted together: daylight (minutes, per the earlier
# feature), ride count, and mean ride duration (seconds) for each day.
daylight = [i[0] for i in bike_wx.groupby('date').agg({'daylight':'mean'}).values]
rides = [i[0] for i in bike_wx.groupby('date').agg({'bike_id':'count'}).values]
avg_dur = [i[0] for i in bike_wx.groupby('date').agg({'duration_sec':'mean'}).values]
dates = [i[0] for i in bike_wx.groupby('date')]
trace1 = go.Scatter(x=dates, y=daylight, name="Daylight", marker = dict(color=pink))
trace2 = go.Scatter(x=dates, y=rides, name="Number of rides", marker = dict(color=yellow))
trace3 = go.Scatter(x=dates, y=avg_dur, name="Average ride duration", marker = dict(color=light_blue))
data = [trace1, trace2, trace3]
layout = go.Layout(legend=dict(x=1.1, y=1), title = 'Impact of daylight duration on Number of rides and Average ride duration',
xaxis=dict(title='Time'), yaxis=dict(title='Individual Measurements of Each Factor*', titlefont=dict(),tickfont=dict()))
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename = "Weather")
*Daylight: time in minutes
*Number of rides: count of rides
*Average ride duration: time in sec
We can clearly see that daylight duration is low in winter and the pattern of a number of rides and average ride duration follows this trend. Another interesting observation is that both the average ride duration and the number of trips increases on weekends.
Let's think about the monetization of bike share. The current Bay Wheels' business model is as follows:
So, based on that, we can assume that the longer the rides, the more revenue the company has. That's why I'm going to compare the Subscribers' vs. Customers' ride duration.
# 2700 sec is 45 min — the cutoff between "short" rides (covered by a
# subscription) and "long" rides.
def _ride_share(user_type, rainy, long_ride):
    """Fraction of `user_type`'s trips matching the rain and length filters.

    Args:
        user_type: 'Subscriber' or 'Customer'.
        rainy: True -> precip > 0; False -> precip <= 0.
        long_ride: True -> duration > 2700 s; False -> duration <= 2700 s.

    Returns:
        Matching trip count divided by the user type's total trip count.
    """
    user_trips = bike_wx[bike_wx.user_type == user_type]
    length_mask = user_trips.duration_sec > 2700 if long_ride else user_trips.duration_sec <= 2700
    rain_mask = user_trips.precip > 0 if rainy else user_trips.precip <= 0
    return user_trips[length_mask & rain_mask].bike_id.count() / user_trips.bike_id.count()

# The eight combinations the original wrote out as near-identical
# copy-pasted expressions:
sub_long_rain = _ride_share('Subscriber', rainy=True, long_ride=True)
sub_short_rain = _ride_share('Subscriber', rainy=True, long_ride=False)
sub_long_norain = _ride_share('Subscriber', rainy=False, long_ride=True)
sub_short_norain = _ride_share('Subscriber', rainy=False, long_ride=False)
cus_long_rain = _ride_share('Customer', rainy=True, long_ride=True)
cus_short_rain = _ride_share('Customer', rainy=True, long_ride=False)
cus_long_norain = _ride_share('Customer', rainy=False, long_ride=True)
cus_short_norain = _ride_share('Customer', rainy=False, long_ride=False)
# Stacked bars of the ride-share fractions, rain vs. no rain.
trace1 = go.Bar(
x=['Subscriber <= 45min', 'Subscriber > 45min', 'Customer <= 45min', 'Customer > 45min'],
y=[sub_short_rain, sub_long_rain, cus_short_rain, cus_long_rain], marker = dict(color=light_blue),
name='Rain')
trace2 = go.Bar(
x=['Subscriber <= 45min', 'Subscriber > 45min', 'Customer <= 45min', 'Customer > 45min'],
y=[sub_short_norain, sub_long_norain, cus_short_norain, cus_long_norain],
name='NoRain', marker = dict(color=yellow))
data = [trace1, trace2]
layout = go.Layout(
barmode='stack', title = '% of bike share rides for subscribers vs customers by weather',
xaxis=dict(title='Subscriber vs. Customer'),
yaxis=dict(
title='% of ride number'),)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename = "Bike Shares")
def _avg_ride_minutes(user_type, rainy, long_ride):
    """Mean ride duration in minutes for trips matching the three filters.

    Args:
        user_type: 'Subscriber' or 'Customer'.
        rainy: True -> precip > 0; False -> precip <= 0.
        long_ride: True -> duration > 2700 s; False -> duration <= 2700 s.
    """
    user_trips = bike_wx[bike_wx.user_type == user_type]
    length_mask = user_trips.duration_sec > 2700 if long_ride else user_trips.duration_sec <= 2700
    rain_mask = user_trips.precip > 0 if rainy else user_trips.precip <= 0
    return user_trips[length_mask & rain_mask].duration_sec.mean() / 60

# Same eight combinations as above, but averaging duration instead of
# counting rides (the original repeated the filter expression eight times).
sub_long_rain = _avg_ride_minutes('Subscriber', rainy=True, long_ride=True)
sub_short_rain = _avg_ride_minutes('Subscriber', rainy=True, long_ride=False)
sub_long_norain = _avg_ride_minutes('Subscriber', rainy=False, long_ride=True)
sub_short_norain = _avg_ride_minutes('Subscriber', rainy=False, long_ride=False)
cus_long_rain = _avg_ride_minutes('Customer', rainy=True, long_ride=True)
cus_short_rain = _avg_ride_minutes('Customer', rainy=True, long_ride=False)
cus_long_norain = _avg_ride_minutes('Customer', rainy=False, long_ride=True)
cus_short_norain = _avg_ride_minutes('Customer', rainy=False, long_ride=False)
trace1 = go.Bar(
x=['Subscriber <= 45min', 'Subscriber > 45min', 'Customer <= 45min', 'Customer > 45min'],
y=[sub_short_rain, sub_long_rain, cus_short_rain, cus_long_rain], marker = dict(color=light_blue),
name='Rain')
trace2 = go.Bar(
x=['Subscriber <= 45min', 'Subscriber > 45min', 'Customer <= 45min', 'Customer > 45min'],
y=[sub_short_norain, sub_long_norain, cus_short_norain, cus_long_norain],
name='No Rain', marker = dict(color=yellow))
data = [trace1, trace2]
layout = go.Layout(
barmode='stack', title = 'Comparing subscribers vs. customers behaviour by weather and length of a ride',
xaxis=dict(title='Subscriber vs. Customer'),
yaxis=dict(
title='Average duration'),)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename = "Bike Shares")
There is no big difference between Subscriber and Customer with regards to weather/precipitation. The Subscribers mostly take shorter rides, which are included in their subscription plan.
# Tag each day in the daily table with its weather-feel category.
# FIX: the original chained the assignment twice in one statement
# (bt['wx_feel'] = bt['wx_feel'] = ...), a harmless copy-paste slip.
bt['wx_feel'] = bt.apply(wx_feel, axis=1)
bt.to_csv('bike_sharing_weather_daily.csv', index=False)
# Customer / subscriber share of total rides per weather-feel category.
# Good
cust_g = bt[bt.wx_feel == 'good'].count_customer.sum() / bt[bt.wx_feel == 'good'].count_shares.sum()
sub_g = bt[bt.wx_feel == 'good'].count_subscriber.sum() / bt[bt.wx_feel == 'good'].count_shares.sum()
# Cold
cust_c = bt[bt.wx_feel == 'cold'].count_customer.sum() / bt[bt.wx_feel == 'cold'].count_shares.sum()
sub_c = bt[bt.wx_feel == 'cold'].count_subscriber.sum() / bt[bt.wx_feel == 'cold'].count_shares.sum()
# Hot
cust_h = bt[bt.wx_feel == 'hot'].count_customer.sum() / bt[bt.wx_feel == 'hot'].count_shares.sum()
sub_h = bt[bt.wx_feel == 'hot'].count_subscriber.sum() / bt[bt.wx_feel == 'hot'].count_shares.sum()
trace1 = go.Bar(
x=['Cold', 'Good', 'Hot'],
y=[cust_c, cust_g, cust_h], marker = dict(color=grey),
name='Customer')
trace2 = go.Bar(
x=['Cold', 'Good', 'Hot'],
y=[sub_c, sub_g, sub_h],
name='Subscriber', marker = dict(color=pink))
data = [trace1, trace2]
layout = go.Layout(
barmode='stack', title = 'Comparing customers vs. subscribers behaviour by weather',
xaxis=dict(title='Weather condition'),
yaxis=dict(
title='% of total bike share rides'),)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename = "Bike Shares")
Proportionally more customers use bike-share when weather temperature is good or hot.
I'm computing Pearson's correlations between every pair.
# Pearson correlation of the daily ride count with the weather features.
corr_matrix = bt[['count_shares', 'avg_temp', 'daylight', 'daily_temp_diff', 'precip']].corr()
corr_matrix['count_shares'].sort_values(ascending=False)
The correlation between weather and count is low. The highest for weather feature is a negative correlation with precipitation. Let's check the average duration and total duration.
# Pearson correlation of the average ride duration with the same features.
corr_matrix = bt[['avg_duration', 'avg_temp', 'daylight', 'daily_temp_diff', 'precip']].corr()
corr_matrix['avg_duration'].sort_values(ascending=False)
The correlation for the duration of the ride is much higher compared to the number of bike rides. That means that the number of bike-share a day is not as impacted by the weather as much as the duration of the trip. My conclusion here is that if it's a rainy or cold day, people avoid long bike rides.
# Scatter + fitted regression line of average ride duration against each
# weather factor (drawn onto the same implicit axes, as before).
for weather_col in ('avg_temp', 'daylight', 'precip'):
    sns.regplot(x="avg_duration", y=weather_col, data=bt)
To vizualize the correlation, I'm plotting pandas scatter_matrix.
# Pairwise scatter matrix with regression fits for the key variables.
attr = ['avg_duration', 'count_shares', 'avg_temp', 'daylight', 'precip']
sns.set(style="ticks", color_codes=True)
sns.pairplot(bt[attr], kind="reg")
plt.show()
With the time series I'm going to check if weather is a good predictors of daily bike share rides.
I constructed a SARIMA and a SARIMAX model and compared their results to find out if the weather improves prediction accuracy. I focused below on predicting the number of bike-share rides per day, but similar models could be applied to predict the daily duration. Both metrics are important for bike-share business. The number of trips prediction can be used for planning the capacity. The duration would be helpful to forecast the company's revenue.
Let's plot time series of daily number of bike share rides over one year period of time:
# Daily number of bike-share rides over the full observation period.
bt.plot(x='date', y='count_shares', figsize=(15,6))
plt.xlabel('Date')
# FIX: the plotted series is the ride count, not a duration — the original
# y-axis label said 'Duration'.
plt.ylabel('Number of rides')
plt.title('Time Series of bike share rides by day')
The time-series has seasonality pattern: the rides go down in wintertime and seems like there are peaks during weekends. There is also an upward trend.
I'm going to decompose the data to see: trend, seasonality, and noise.
# Time-series frame: date index plus count_shares and one more column by
# position. NOTE(review): iloc column 10 depends on the exact column order
# written to the daily CSV earlier — verify it selects the intended weather
# column (presumably precip, which later cells read as bt_ts.precip).
bt_ts = bt.iloc[:, [0, 1, 10]].set_index('date')
# Selecting 7 for weekly seasonality decomposition.
# NOTE(review): the `freq` keyword was renamed to `period` in newer
# statsmodels releases; update this call when upgrading.
decomposed = sm.tsa.seasonal_decompose(bt_ts.count_shares, freq=7)
rcParams['figure.figsize'] = 15, 6
decomposed.plot()
plt.show()
With the frequency of 7 days, we see that there is a seasonality pattern in the data. The upward trend is also very prominent.
I'm going to apply one of the common methods for time series modeling, SARIMA stands for Seasonal Autoregressive Integrated Moving Average. SARIMA model has the following orders/parameters (p, d, q) p - parameter for AR portion and incorporates past values d - is an integrated part of the model, which shows the amount of differencing applied to the model q - parameter of MA of the model.
As we have seasonal ARIMA, there will be another set of parameters notated as (P,D,Q) which describes the same association as p, d, q but correspond with a seasonal component of the model. The goal is to select optimal orders. It could be done via a likelihood ratio test or comparison of goodness-of-fit metrics. I will use the iterative method and compare goodness-of-fit. Before doing that, I'm going to test the stationarity of the data since we need the time series to be stationary (having no trend). I will use the ADF test (Augmented-Dickey Fuller test).
def adf_test(data):
    """Run the Augmented Dickey-Fuller stationarity test on `data`.

    Plots the series together with its 7-day rolling mean and standard
    deviation, then prints the ADF statistic, p-value and critical values.
    The ADF null hypothesis is that the series has a unit root (i.e. is
    NOT stationary), so a small p-value means stationarity.

    Args:
        data: pandas Series (date-indexed) to test.
    """
    rolling_mean = data.rolling(window=7, center=False).mean()
    rolling_std = data.rolling(window=7, center=False).std()
    plt.figure(figsize=(15, 6))
    plt.plot(data, color='steelblue', label='Bike share rides', alpha=.7)
    plt.plot(rolling_mean, color='deeppink', label='Rolling Mean', alpha=.7)
    plt.plot(rolling_std, color='darkgrey', label='Rolling Std')
    # Hide x tick labels (the dense date index is unreadable).
    # FIX: the original built `labels = [] * len(bt_ts)` — which is always
    # just [] — and also reached for the global bt_ts instead of `data`.
    plt.xticks([], [])
    plt.legend(loc='best')
    plt.show()
    result = adfuller(data)
    print('ADF Statistic: %f' % result[0])
    print('p-value: %f' % result[1])
    print('Critical Values:')
    for key, value in result[4].items():
        print('\t%s: %.3f' % (key, value))
adf_test(bt_ts.count_shares)
Based on the p-value, we fail to reject the null hypothesis of the ADF test — which is that the series has a unit root (i.e., is non-stationary) — so we conclude that the time series is not stationary. I will need to apply differencing.
# Difference the time series with lag=1 to remove the trend; the first row
# is dropped because differencing leaves it NaN.
bt_ts1 = bt_ts.diff(1).iloc[1::]
# Re-run the ADF test on the differenced count_shares column.
adf_test(bt_ts1.iloc[:,0])
Great — the p-value is smaller than alpha = 0.05, so we reject the unit-root null hypothesis and conclude that the differenced time series is stationary. I will use d=1 for my SARIMA model.
With the following test, I'm going to determine parameters using an interactive method and goodness-of-fit AIC - Akaike information criterion, an estimator of the relative quality of statistical models.
def best_param(data):
    """Grid-search SARIMA orders by AIC and report the best combination.

    Tries every (p, d, q) x (P, D, Q, 7) combination with each order in
    {0, 1} and keeps the fit with the lowest AIC (lower = better fit,
    penalized for model complexity). Seasonal period is fixed at 7 days.

    Args:
        data: pandas Series of the time series to model.

    Returns:
        (order, seasonal_order) tuple of the best combination, e.g.
        ((0, 1, 1), (1, 1, 1, 7)). Also prints it. (The original ended
        with `return print(...)`, which always returned None.)
    """
    orders = range(0, 2)
    pdq = list(itertools.product(orders, orders, orders))
    PDQ = [(p, d, q, 7) for p, d, q in pdq]
    aic_by_params = {}
    for order in pdq:
        for seasonal_order in PDQ:
            try:
                mod = sm.tsa.statespace.SARIMAX(data,
                                                order=order,
                                                seasonal_order=seasonal_order,
                                                enforce_stationarity=True,
                                                enforce_invertibility=True)
                aic_by_params[(order, seasonal_order)] = mod.fit().aic
            except Exception:
                # Some combinations fail to converge or are ill-defined;
                # skip them instead of aborting the whole search.
                continue
    best = min(aic_by_params, key=aic_by_params.get)
    print(f'SARIMA: {best[0]}x{best[1]}')
    return best
best_param(bt_ts.count_shares)
# I will slightly deviate from the above search result and select 0 for
# both p and q for simplicity.
model_1 = sm.tsa.statespace.SARIMAX(bt_ts.count_shares,
order=(0, 1, 0),
seasonal_order=(1, 1, 1, 7),
enforce_stationarity=False,
enforce_invertibility=False)
# FIX: fit once and reuse the result — the original called model_1.fit()
# a second time just to draw the diagnostics, refitting the whole model.
fit_1 = model_1.fit()
print(fit_1.summary().tables[1])
# Diagnostics to check if there is anything unusual in the residuals.
fit_1.plot_diagnostics(figsize=(16, 8))
plt.show()
It is not ideal, but the diagnostics suggest that the residuals are distributed almost normally.
Since we have data on weather variables collected for the same frequency as the response series and these weather data as explored above is correlated with the response I'm going to exploit that relationship by using multivariate time series model. Weather influences the bike share rides, but not the other way round that means that the weather is an exogenous factor, so I'm using SARIMAX (X - stands for exogenous).
# SARIMAX: same orders as model_1, with daily precipitation as the
# exogenous regressor.
model_2 = sm.tsa.statespace.SARIMAX(bt_ts.count_shares,
order=(0, 1, 0),
seasonal_order=(1, 1, 1, 7),
enforce_stationarity=False,
enforce_invertibility=False,
exog=bt_ts.precip)
# FIX: fit once and reuse — the original refit the model for diagnostics.
fit_2 = model_2.fit()
print(fit_2.summary().tables[1])
fit_2.plot_diagnostics(figsize=(16, 8))
plt.show()
Now let's test these models in terms of their predictive power and compare performance metrics RMSE and R2.
First, I'm going to train test split the data. Since it's a time series model, I will cut off the latest portion of the data for test purposes.
print(f'I will set a side 10% of the data: {round(bt_ts.shape[0]*0.1)} samples')
# Chronological split: the last ~10% of days are held out as the test set.
train = bt_ts['2017-06-28':'2018-04-20']
test = bt_ts['2018-04-21':]
First, I'm fitting SARIMA
# FIX: the original called model_1.fit(train.count_shares), which passes
# the training series as `start_params` — fit() takes no data argument.
# NOTE(review): model_1 was built on the FULL series, so the "test window"
# predictions below are in-sample; rebuilding the model on `train` only
# would be the stricter evaluation.
result_1 = model_1.fit()
# Showing roughly the last 3 months for better visualization:
pred_1 = result_1.get_prediction(start='2018-04-21', dynamic=False)
pred_ci = pred_1.conf_int()
pred = pred_1.predicted_mean.to_frame()
pred_plus_ci = pd.merge(pred_ci, pred, how='left', on='date')
pred_plus_ci.rename(columns={0: 'pred'}, inplace=True)
# Observed values for the last 90 days, joined with prediction + CI bands.
tr = bt_ts.iloc[-90:,0].to_frame()
pred_to_plot = pd.merge(tr, pred_plus_ci, how='left', on='date')
ax = pred_to_plot.count_shares.plot(label='Observed', color='dodgerblue')
pred_to_plot.pred.plot(ax=ax, label='Forecast', alpha=.7, figsize=(18, 6), color='deeppink')
# Shade the confidence interval (columns 1 and 2 are the CI bounds).
ax.fill_between(pred_to_plot.index,
pred_to_plot.iloc[:, 1],
pred_to_plot.iloc[:, 2], color='darkgrey', alpha=.2)
xticks = pred_to_plot.index
ax.xaxis.set_ticks(xticks)
ax.set_xticklabels(xticks, alpha=1, rotation=45)
ax.set_xticks(ax.get_xticks()[::5])
ax.set_xlabel('Date')
ax.set_ylabel('Bike share rides')
plt.legend()
plt.show()
print('Results of SARIMA:')
print('------------------')
mse = mean_squared_error(test.count_shares, pred_1.predicted_mean)
# FIX: `mse ** 0.5` — the original used np.sqrt, but numpy was never
# imported in this notebook, so that line raised a NameError.
print('The RMSE of our forecasts is {}'.format(round(mse ** 0.5, 2)))
r2 = r2_score(test.count_shares, pred_1.predicted_mean)
print('The R2 of our forecasts is {}'.format(round(r2,2)))
Let's fit and evaluate SARIMAX model.
# SARIMAX evaluation over the same window. Same caveat as the SARIMA cell:
# fit() takes no data argument (the original passed train.count_shares as
# `start_params`), and the model was built on the full series, so the
# "test window" predictions are in-sample.
result_2 = model_2.fit()
# Showing roughly the last 3 months for better visualization:
pred_2 = result_2.get_prediction(start='2018-04-21', dynamic=False)
pred_ci = pred_2.conf_int()
pred = pred_2.predicted_mean.to_frame()
pred_plus_ci = pd.merge(pred_ci, pred, how='left', on='date')
pred_plus_ci.rename(columns={0: 'pred'}, inplace=True)
tr = bt_ts.iloc[-90:,0].to_frame()
pred_to_plot = pd.merge(tr, pred_plus_ci, how='left', on='date')
ax = pred_to_plot.count_shares.plot(label='Observed', color='dodgerblue')
pred_to_plot.pred.plot(ax=ax, label='Forecast', alpha=.7, figsize=(18, 6), color='deeppink')
ax.fill_between(pred_to_plot.index,
pred_to_plot.iloc[:, 1],
pred_to_plot.iloc[:, 2], color='darkgrey', alpha=.2)
xticks = pred_to_plot.index
ax.xaxis.set_ticks(xticks)
ax.set_xticklabels(xticks, alpha=1, rotation=45)
ax.set_xticks(ax.get_xticks()[::5])
ax.set_xlabel('Date')
ax.set_ylabel('Bike share rides')
plt.legend()
# FIX: savefig must run before show() — show() clears the current figure,
# so the original saved an empty image.
plt.savefig('ts_SARIMAX.png')
plt.show()
print('Results of SARIMAX:')
print('-------------------')
mse = mean_squared_error(test.count_shares, pred_2.predicted_mean)
# FIX: `mse ** 0.5` — numpy (np) was never imported; the original line
# raised a NameError.
print('The RMSE of our forecasts is {}'.format(round(mse ** 0.5, 2)))
r2 = r2_score(test.count_shares, pred_2.predicted_mean)
print('The R2 of our forecasts is {}'.format(round(r2,2)))
Great — the SARIMAX model reduced RMSE and increased R2, which shows that including weather features in the forecast improves predictions.
To conclude, the weather does impact bike-share behavior. I could show that with EDA and correlation analysis as well as with time series forecasting. Including weather features improves model predictions.
As a next step, it would be interesting to try to improve the SARIMAX model by 1) tuning the hyperparameters and 2) adding more exogenous features.
Another option would be trying more advanced modeling techniques, e.g., a Random Forest Regressor or Neural Network models.
In terms of data analysis, we could consider an hourly bike-ride forecast with corresponding hourly weather data.
A weather data set is provided by NOAA https://www.ncdc.noaa.gov/data-access/land-based-station-data and the bike-share data set is from Lyft: https://www.lyft.com/bikes/bay-wheels/system-data. The time period: from 2017-06-28 to 2018-05-23.