Predicting attendances next year
Contents
Predicting attendances next year#
Overview#
This notebook contains the code to train the MGSR on data from 2018 and forecast ED demand for 2019.
The performance of the MGSR is assessed using the mean absolute percentage error (MAPE).
#turn warnings off to keep notebook tidy
import warnings
warnings.filterwarnings('ignore')
Import libraries#
import os
import pandas as pd
import numpy as np
import pickle as pkl
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error as mape
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
Import data#
dta = pd.read_csv('https://raw.githubusercontent.com/CharlotteJames/ed-forecast/main/data/master_scaled_new_pop.csv',
index_col=0)
dta.columns = ['_'.join([c.split('/')[0],c.split('/')[-1]])
if '/' in c else c for c in dta.columns]
dta.head()
ccg | month | 111_111_offered | 111_111_answered | amb_sys_made | amb_sys_answered | gp_appt_available | ae_attendances_attendances | population | People | Places | Lives | year | %>65 | %<15 | N>65 | N<15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 00Q | Jan | 406.655830 | 308.945095 | 310.561801 | 234.716187 | 4568.019766 | 1179.855246 | 14.8942 | 97.2 | 99.7 | 94.4 | 2018 | 14.46402 | 21.763505 | 2.1543 | 3.2415 |
1 | 00Q | Feb | 349.933603 | 256.872981 | 261.756435 | 205.298797 | 3910.918344 | 1075.452189 | 14.8942 | 97.2 | 99.7 | 94.4 | 2018 | 14.46402 | 21.763505 | 2.1543 | 3.2415 |
2 | 00Q | Mar | 413.247659 | 300.690725 | 303.676215 | 234.716187 | 4051.778545 | 1210.874032 | 14.8942 | 97.2 | 99.7 | 94.4 | 2018 | 14.46402 | 21.763505 | 2.1543 | 3.2415 |
3 | 00Q | Apr | 349.608595 | 278.140171 | 264.973181 | 203.677924 | 3974.433001 | 1186.166427 | 14.8942 | 97.2 | 99.7 | 94.4 | 2018 | 14.46402 | 21.763505 | 2.1543 | 3.2415 |
4 | 00Q | May | 361.100544 | 284.419492 | 294.361403 | 227.926437 | 4232.385761 | 1299.297713 | 14.8942 | 97.2 | 99.7 | 94.4 | 2018 | 14.46402 | 21.763505 | 2.1543 | 3.2415 |
dta.shape
(1618, 17)
Function to group data#
def group_data(data, features):
features = ['population','%>65',
'People', 'Places',
'Lives']
#ensure no identical points in train and test
grouped = pd.DataFrame()
for pop, group in data.groupby('population'):
#if len(group.lives.unique())>1:
#print('multiple CCG with same population')
ccg_year = pd.Series(dtype='float64')
for f in features:
ccg_year[f] = group[f].unique()[0]
ccg_year['ae_attendances_attendances']\
= group.ae_attendances_attendances.mean()
grouped = grouped.append(ccg_year, ignore_index=True)
return grouped
def fit_capacity(dta, features, model):
y = dta['ae_attendances_attendances']
X = dta[features]
model.fit(X,y)
return model
def fit_final(dta, rf1, rf2, m1_features, m2_features):
final = LinearRegression()
#train capactiy model
rf1 = fit_capacity(dta, m1_features, rf1)
#predict monthly attendances
y_pred_1 = rf1.predict(dta[m1_features])
grouped = group_data(dta, m2_features)
y = grouped['ae_attendances_attendances']
X = grouped[m2_features]
rf2.fit(X, y)
y_pred_2 = rf2.predict(dta[m2_features])
X_f = np.vstack([y_pred_1, y_pred_2]).T
y_f = dta['ae_attendances_attendances']
final.fit(X_f,y_f)
print('Combined training score:',final.score(X_f,y_f))
return rf1,rf2, final
#capacity utility
capacity_features = ['gp_appt_available',
'111_111_offered', 'amb_sys_answered']
# '111_111_answered', 'amb_sys_made']
pophealth_features = ['population','%>65',
'People', 'Places', 'Lives']
Split data#
train = dta.loc[dta.year==2018]
test = dta.loc[dta.year==2019]
Fit to 2018#
#capacity model
rf1 = RandomForestRegressor(max_depth=5, n_estimators=6, random_state=0)
#population health model
rf2 = RandomForestRegressor(max_depth=7, n_estimators=4, random_state=0)
m1_features = capacity_features
m2_features = pophealth_features
rf1,rf2,final = fit_final(train, rf1, rf2, m1_features, m2_features)
Combined training score: 0.8739643763156378
Predict on 2019#
def stacked_predict(X, models, m1_features, m2_features):
rf1,rf2,final = models
y_pred_1 = rf1.predict(X[m1_features])
y_pred_2 = rf2.predict(X[m2_features])
X_f = np.vstack([y_pred_1, y_pred_2]).T
preds = final.predict(X_f)
return preds
preds = stacked_predict(test, [rf1,rf2,final], m1_features, m2_features)
MAPE#
mape(test.ae_attendances_attendances.values,preds)
0.25631315230223467
Plot#
test['preds'] = preds
fig,ax = plt.subplots(figsize=(8,6))
plt.plot(test.ae_attendances_attendances.values, 'r--', label = 'True')
plt.plot(test.preds.values, 'g--', label = 'Predicted')
plt.ylabel('Monthly ED attendances per 10,000 people', fontsize=14)
plt.legend(loc='best', fontsize=12)
plt.show()
![_images/fit_predict_future_29_0.png](_images/fit_predict_future_29_0.png)
Combine by month for total#
res = pd.DataFrame()
months = ['Jan','Feb','Mar','Apr','May','Jun',\
'Jul','Aug','Sep','Oct','Nov','Dec']
res['True'] = test.ae_attendances_attendances.values * test.population.values
res['Month'] = test.month.values
res['Pred'] = preds * test.population.values
true, pred = [],[]
for month in months:
true.append(np.mean(res.loc[res.Month==month]['True'].values, axis=0))
pred.append(np.mean(res.loc[res.Month==month]['Pred'].values, axis=0))
MAPE#
mape(true, pred)
0.034803448980532836
Plot#
fig,ax = plt.subplots(figsize=(8,5))
plt.plot(months,true, 'r--', label = 'True')
plt.plot(months,pred, 'g--', label = 'Predicted')
plt.legend(loc='best', fontsize=12)
plt.ylabel('Mean monthly ED attendances per CCG', fontsize=14)
plt.xlabel('Month', fontsize=14)
plt.savefig('2019_forecast_mean.png')
plt.show()
![_images/fit_predict_future_35_0.png](_images/fit_predict_future_35_0.png)
mape(np.concatenate((true[:2],[true[-1]])), np.concatenate((pred[:2],[pred[-1]])))
0.021009352150538733
Figure for paper#
fig,ax_list = plt.subplots(1,2,figsize=(16,6))
ax=ax_list[0]
ax.plot(test.ae_attendances_attendances.values, 'r--', label = 'True')
ax.plot(test.preds.values, 'g--', label = 'Predicted')
ax.set_ylabel('Monthly ED attendances per 10,000 people', fontsize=14)
ax.legend(loc='best', fontsize=12)
ax.xaxis.set_ticklabels([])
ax.text(0.1, 0.9, 'a', horizontalalignment='center',
verticalalignment='center', fontweight='bold',
fontsize=16,transform=ax.transAxes)
ax=ax_list[1]
ax.plot(months,true, 'r--', label = 'True')
ax.plot(months,pred, 'g--', label = 'Predicted')
ax.legend(loc='upper right', fontsize=12)
ax.set_ylabel('Mean monthly ED attendances per CCG', fontsize=14)
ax.set_xlabel('Month', fontsize=14)
ax.text(0.1, 0.9, 'b', horizontalalignment='center',
verticalalignment='center', fontweight='bold',
fontsize=16,transform=ax.transAxes)
plt.tight_layout()
plt.savefig('2019_forecast.png')
plt.show()
![_images/fit_predict_future_38_0.png](_images/fit_predict_future_38_0.png)