Tree-based models
Contents
Tree-based models#
Overview#
This notebook contains an initial exploration of tree-based regressions to predict monthly ED demand.
As the variables population, people, places and lives only vary annually, they cannot be included in the model due to data leakage between the training and test sets.
For all models, variables used include:
Service capacity (111, GP, Ambulance)
Service utility (111, Ambulance)
#turn warnings off to keep notebook tidy
import warnings
warnings.filterwarnings('ignore')
Import libraries#
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedKFold
Import data#
# Load the monthly ED dataset; the index is stored in the first CSV column
dta = pd.read_csv(
    'https://raw.githubusercontent.com/CharlotteJames/ed-forecast/main/data/master_scaled_new.csv',
    index_col=0,
)


def _flatten_name(col):
    # 'first/.../last' -> 'first_last'; names without '/' pass through
    if '/' in col:
        pieces = col.split('/')
        return pieces[0] + '_' + pieces[-1]
    return col


dta.columns = [_flatten_name(c) for c in dta.columns]

# Number of distinct CCGs in the data
dta.ccg.unique().shape
(74,)
Add random feature#
# Append a reproducible random column; useful as a baseline when
# inspecting feature importances below
rng = np.random.RandomState(0)
dta['rand1'] = rng.rand(len(dta))
dta.shape
(1618, 14)
Fitting function#
def fit_model(dta, model, features):
    """Cross-validate *model* on *dta* to predict monthly ED attendances.

    Parameters
    ----------
    dta : pandas.DataFrame
        Must contain the target column 'ae_attendances_attendances'
        plus every column named in *features*.
    model : sklearn regressor
        Unfitted estimator; cross_validate clones it for each fold.
    features : list of str
        Predictor column names.

    Returns
    -------
    dict
        Output of sklearn.model_selection.cross_validate: train/test
        scores and (via return_estimator=True) the fitted estimator
        from each of the 25 folds.
    """
    y = dta['ae_attendances_attendances']
    X = dta[features]

    # 5-fold CV repeated 5 times (25 fits) with a fixed seed so the
    # score/importance distributions reported below are reproducible.
    cv_model = cross_validate(
        model, X, y,
        cv=RepeatedKFold(n_splits=5, n_repeats=5, random_state=0),
        return_estimator=True,
        return_train_score=True,
        n_jobs=2,
    )

    # Fix: the original also refit `model` on the full data here and bound
    # it to an unused local (`clf`); that fit's result was never used, so
    # the redundant work has been removed.
    return cv_model
Model Comparison#
Random Forest#
# Random forest with default hyper-parameters
features = [
    'gp_appt_available',
    '111_111_offered', 'amb_sys_answered',
    '111_111_answered', 'amb_sys_made',
]
model = RandomForestRegressor()
results = fit_model(dta, model, features)
Performance#
# Distribution of train/test scores across the 25 CV fits
res = pd.DataFrame({
    'test_score': results['test_score'],
    'train_score': results['train_score'],
})
res.describe()
test_score | train_score | |
---|---|---|
count | 25.000000 | 25.000000 |
mean | 0.337418 | 0.907160 |
std | 0.076642 | 0.003945 |
min | 0.171471 | 0.900363 |
25% | 0.262441 | 0.905089 |
50% | 0.348798 | 0.906921 |
75% | 0.390744 | 0.909421 |
max | 0.490176 | 0.914131 |
Coefficients#
# Feature importances from each fold's fitted estimator
coefs = pd.DataFrame(
    [est.feature_importances_ for est in results['estimator']],
    columns=features,
)
coefs.describe()
gp_appt_available | 111_111_offered | amb_sys_answered | 111_111_answered | amb_sys_made | |
---|---|---|---|---|---|
count | 25.000000 | 25.000000 | 25.000000 | 25.000000 | 25.000000 |
mean | 0.357002 | 0.123849 | 0.202960 | 0.075165 | 0.241024 |
std | 0.007679 | 0.009848 | 0.022245 | 0.003739 | 0.023569 |
min | 0.339813 | 0.110233 | 0.165946 | 0.068933 | 0.201287 |
25% | 0.353878 | 0.116783 | 0.184720 | 0.072909 | 0.227691 |
50% | 0.357553 | 0.122698 | 0.200751 | 0.075617 | 0.243067 |
75% | 0.360190 | 0.127030 | 0.216460 | 0.077222 | 0.254322 |
max | 0.374283 | 0.152080 | 0.244838 | 0.082096 | 0.280150 |
Extra Trees#
# Extra (extremely randomised) trees with default hyper-parameters
features = [
    'gp_appt_available',
    '111_111_offered', 'amb_sys_answered',
    '111_111_answered', 'amb_sys_made',
]
model = ExtraTreesRegressor()
results = fit_model(dta, model, features)
Performance#
# Distribution of train/test scores across the 25 CV fits
res = pd.DataFrame({
    'test_score': results['test_score'],
    'train_score': results['train_score'],
})
res.describe()
test_score | train_score | |
---|---|---|
count | 25.000000 | 25.0 |
mean | 0.197374 | 1.0 |
std | 0.119562 | 0.0 |
min | -0.047068 | 1.0 |
25% | 0.066668 | 1.0 |
50% | 0.233617 | 1.0 |
75% | 0.275441 | 1.0 |
max | 0.393394 | 1.0 |
Coefficients#
# Feature importances from each fold's fitted estimator
coefs = pd.DataFrame(
    [est.feature_importances_ for est in results['estimator']],
    columns=features,
)
coefs.describe()
gp_appt_available | 111_111_offered | amb_sys_answered | 111_111_answered | amb_sys_made | |
---|---|---|---|---|---|
count | 25.000000 | 25.000000 | 25.000000 | 25.000000 | 25.000000 |
mean | 0.395870 | 0.115984 | 0.207734 | 0.084479 | 0.195932 |
std | 0.009711 | 0.004940 | 0.008573 | 0.003465 | 0.009288 |
min | 0.374678 | 0.106690 | 0.191756 | 0.076780 | 0.175861 |
25% | 0.390342 | 0.113776 | 0.202478 | 0.082096 | 0.190601 |
50% | 0.396914 | 0.115226 | 0.207818 | 0.084087 | 0.194176 |
75% | 0.404387 | 0.119624 | 0.212132 | 0.087006 | 0.201625 |
max | 0.407839 | 0.125445 | 0.223766 | 0.091355 | 0.214036 |
Gradient Boosted Trees#
# Gradient boosted trees with default hyper-parameters
features = [
    'gp_appt_available',
    '111_111_offered', 'amb_sys_answered',
    '111_111_answered', 'amb_sys_made',
]
model = GradientBoostingRegressor()
results = fit_model(dta, model, features)
Performance#
# Distribution of train/test scores across the 25 CV fits
res = pd.DataFrame({
    'test_score': results['test_score'],
    'train_score': results['train_score'],
})
res.describe()
test_score | train_score | |
---|---|---|
count | 25.000000 | 25.000000 |
mean | 0.415361 | 0.568058 |
std | 0.048391 | 0.011582 |
min | 0.315104 | 0.551823 |
25% | 0.388389 | 0.556177 |
50% | 0.406757 | 0.568343 |
75% | 0.448337 | 0.575327 |
max | 0.495351 | 0.587684 |
Coefficients#
# Feature importances from each fold's fitted estimator
coefs = pd.DataFrame(
    [est.feature_importances_ for est in results['estimator']],
    columns=features,
)
coefs.describe()
gp_appt_available | 111_111_offered | amb_sys_answered | 111_111_answered | amb_sys_made | |
---|---|---|---|---|---|
count | 25.000000 | 25.000000 | 25.000000 | 25.000000 | 25.000000 |
mean | 0.189201 | 0.152739 | 0.274784 | 0.030169 | 0.353107 |
std | 0.011778 | 0.016073 | 0.027098 | 0.005649 | 0.028217 |
min | 0.170143 | 0.129167 | 0.216100 | 0.022042 | 0.296229 |
25% | 0.179190 | 0.140159 | 0.262216 | 0.026786 | 0.339826 |
50% | 0.188742 | 0.155633 | 0.275872 | 0.028190 | 0.349366 |
75% | 0.196874 | 0.160312 | 0.292362 | 0.032325 | 0.374658 |
max | 0.211001 | 0.199813 | 0.314280 | 0.045884 | 0.407943 |
ADA Boost#
# AdaBoost with default hyper-parameters
features = [
    'gp_appt_available',
    '111_111_offered', 'amb_sys_answered',
    '111_111_answered', 'amb_sys_made',
]
model = AdaBoostRegressor()
results = fit_model(dta, model, features)
Performance#
# Distribution of train/test scores across the 25 CV fits
res = pd.DataFrame({
    'test_score': results['test_score'],
    'train_score': results['train_score'],
})
res.describe()
test_score | train_score | |
---|---|---|
count | 25.000000 | 25.000000 |
mean | 0.353207 | 0.398218 |
std | 0.062387 | 0.020489 |
min | 0.214903 | 0.348453 |
25% | 0.305553 | 0.386198 |
50% | 0.338738 | 0.399499 |
75% | 0.396985 | 0.414336 |
max | 0.482296 | 0.440337 |
Coefficients#
# Feature importances from each fold's fitted estimator
coefs = pd.DataFrame(
    [est.feature_importances_ for est in results['estimator']],
    columns=features,
)
coefs.describe()
gp_appt_available | 111_111_offered | amb_sys_answered | 111_111_answered | amb_sys_made | |
---|---|---|---|---|---|
count | 25.000000 | 25.000000 | 25.000000 | 25.000000 | 25.000000 |
mean | 0.131013 | 0.136810 | 0.341245 | 0.007087 | 0.383844 |
std | 0.018367 | 0.051606 | 0.059225 | 0.007437 | 0.079703 |
min | 0.094617 | 0.069596 | 0.253128 | 0.000000 | 0.208886 |
25% | 0.114694 | 0.111798 | 0.292980 | 0.001893 | 0.319977 |
50% | 0.133880 | 0.125812 | 0.333916 | 0.004246 | 0.403523 |
75% | 0.141702 | 0.161672 | 0.382629 | 0.010357 | 0.438641 |
max | 0.172065 | 0.268240 | 0.496221 | 0.026677 | 0.505979 |
Summary#
Extra Trees does not perform well
Random forest with default parameters is overfitting to the training data
Gradient boosted trees performs best