Tree-based models#

Overview#

This notebook contains an initial exploration of tree-based regressions to predict monthly ED demand.

Because the variables population, people, places and lives vary only annually, they cannot be included in the model: including them would cause data leakage between the training and test sets.

For all models, variables used include:

  • Service capacity (111, GP, Ambulance)

  • Service utility (111, Ambulance)

# Turn warnings off to keep the notebook output tidy.
# NOTE: no category is given, so every warning category is silenced for
# the rest of the session, not just the noisy ones.
import warnings
warnings.filterwarnings('ignore')

Import libraries#

import os
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedKFold

Import data#

# Read the dataset straight from GitHub; the first CSV column is the index.
data_url = 'https://raw.githubusercontent.com/CharlotteJames/ed-forecast/main/data/master_scaled_new.csv'
dta = pd.read_csv(data_url, index_col=0)

# Collapse '/'-separated column names to '<first part>_<last part>';
# names without a '/' are kept as they are.
dta.columns = [
    (name.partition('/')[0] + '_' + name.rpartition('/')[2])
    if '/' in name else name
    for name in dta.columns
]

# Shape of the array of distinct values in the ccg column.
dta.ccg.unique().shape
(74,)

Add random feature#

# Add a column of uniform random numbers, one value per row of dta.
# The generator is seeded so the column is the same on every run.
rng = np.random.RandomState(0)
rand_var = rng.rand(len(dta))
dta['rand1'] = rand_var

dta.shape
(1618, 14)

Fitting function#

def fit_model(dta, model, features, target='ae_attendances_attendances'):
    """Cross-validate ``model`` on ``dta`` and return the results.

    Parameters
    ----------
    dta : pandas.DataFrame
        Table holding both the predictor columns and the target column.
    model : estimator
        Estimator accepted by ``sklearn.model_selection.cross_validate``.
        It is also fitted in place on all rows of ``dta`` (see Notes).
    features : list of str
        Names of the columns of ``dta`` used as predictors.
    target : str, optional
        Name of the column of ``dta`` to predict. Defaults to
        ``'ae_attendances_attendances'``.

    Returns
    -------
    dict
        The value returned by ``cross_validate``. Because
        ``return_estimator`` and ``return_train_score`` are both enabled,
        it includes the per-split ``'test_score'`` and ``'train_score'``
        and the fitted ``'estimator'`` objects.

    Notes
    -----
    After cross-validation ``model`` itself is fitted on the full data.
    That fit does not affect the returned results; it is kept so that
    callers holding a reference to ``model`` still receive a fitted
    estimator.
    """
    y = dta[target]
    X = dta[features]

    # 5 folds repeated 5 times gives 25 train/test splits; the splitter is
    # seeded so the same splits are produced on every call.
    splitter = RepeatedKFold(n_splits=5, n_repeats=5, random_state=0)

    # Cross-validate to get the spread of scores and keep each fitted
    # estimator for later inspection.
    cv_model = cross_validate(
        model, X, y,
        cv=splitter,
        return_estimator=True,
        return_train_score=True,
        n_jobs=2,
    )

    # Fit the caller's estimator on all rows (side effect on ``model``).
    model.fit(X, y)

    return cv_model

Model Comparison#

Random Forest#

# Random forest with its default hyper-parameters.
model = RandomForestRegressor()

# Service capacity and service utility columns used as predictors.
features = [
    'gp_appt_available',
    '111_111_offered',
    'amb_sys_answered',
    '111_111_answered',
    'amb_sys_made',
]

# Cross-validate the model on the selected features.
results = fit_model(dta, model, features)

Performance#

# One row per cross-validation split: score on the held-out fold and
# score on the training folds.
res = pd.DataFrame({
    'test_score': results['test_score'],
    'train_score': results['train_score'],
})

res.describe()
test_score train_score
count 25.000000 25.000000
mean 0.337418 0.907160
std 0.076642 0.003945
min 0.171471 0.900363
25% 0.262441 0.905089
50% 0.348798 0.906921
75% 0.390744 0.909421
max 0.490176 0.914131

Coefficients#

# One row per estimator fitted during cross-validation, one column per
# feature, holding that estimator's feature_importances_.
importances = [est.feature_importances_ for est in results['estimator']]
coefs = pd.DataFrame(importances, columns=features)

coefs.describe()
gp_appt_available 111_111_offered amb_sys_answered 111_111_answered amb_sys_made
count 25.000000 25.000000 25.000000 25.000000 25.000000
mean 0.357002 0.123849 0.202960 0.075165 0.241024
std 0.007679 0.009848 0.022245 0.003739 0.023569
min 0.339813 0.110233 0.165946 0.068933 0.201287
25% 0.353878 0.116783 0.184720 0.072909 0.227691
50% 0.357553 0.122698 0.200751 0.075617 0.243067
75% 0.360190 0.127030 0.216460 0.077222 0.254322
max 0.374283 0.152080 0.244838 0.082096 0.280150

Extra Trees#

# Extra-trees ensemble with its default hyper-parameters.
model = ExtraTreesRegressor()

# Service capacity and service utility columns used as predictors.
features = [
    'gp_appt_available',
    '111_111_offered',
    'amb_sys_answered',
    '111_111_answered',
    'amb_sys_made',
]

# Cross-validate the model on the selected features.
results = fit_model(dta, model, features)

Performance#

# One row per cross-validation split: score on the held-out fold and
# score on the training folds.
res = pd.DataFrame({
    'test_score': results['test_score'],
    'train_score': results['train_score'],
})

res.describe()
test_score train_score
count 25.000000 25.0
mean 0.197374 1.0
std 0.119562 0.0
min -0.047068 1.0
25% 0.066668 1.0
50% 0.233617 1.0
75% 0.275441 1.0
max 0.393394 1.0

Coefficients#

# One row per estimator fitted during cross-validation, one column per
# feature, holding that estimator's feature_importances_.
importances = [est.feature_importances_ for est in results['estimator']]
coefs = pd.DataFrame(importances, columns=features)

coefs.describe()
gp_appt_available 111_111_offered amb_sys_answered 111_111_answered amb_sys_made
count 25.000000 25.000000 25.000000 25.000000 25.000000
mean 0.395870 0.115984 0.207734 0.084479 0.195932
std 0.009711 0.004940 0.008573 0.003465 0.009288
min 0.374678 0.106690 0.191756 0.076780 0.175861
25% 0.390342 0.113776 0.202478 0.082096 0.190601
50% 0.396914 0.115226 0.207818 0.084087 0.194176
75% 0.404387 0.119624 0.212132 0.087006 0.201625
max 0.407839 0.125445 0.223766 0.091355 0.214036

Gradient Boosted Trees#

# Gradient boosted trees with their default hyper-parameters.
model = GradientBoostingRegressor()

# Service capacity and service utility columns used as predictors.
features = [
    'gp_appt_available',
    '111_111_offered',
    'amb_sys_answered',
    '111_111_answered',
    'amb_sys_made',
]

# Cross-validate the model on the selected features.
results = fit_model(dta, model, features)

Performance#

# One row per cross-validation split: score on the held-out fold and
# score on the training folds.
res = pd.DataFrame({
    'test_score': results['test_score'],
    'train_score': results['train_score'],
})

res.describe()
test_score train_score
count 25.000000 25.000000
mean 0.415361 0.568058
std 0.048391 0.011582
min 0.315104 0.551823
25% 0.388389 0.556177
50% 0.406757 0.568343
75% 0.448337 0.575327
max 0.495351 0.587684

Coefficients#

# One row per estimator fitted during cross-validation, one column per
# feature, holding that estimator's feature_importances_.
importances = [est.feature_importances_ for est in results['estimator']]
coefs = pd.DataFrame(importances, columns=features)

coefs.describe()
gp_appt_available 111_111_offered amb_sys_answered 111_111_answered amb_sys_made
count 25.000000 25.000000 25.000000 25.000000 25.000000
mean 0.189201 0.152739 0.274784 0.030169 0.353107
std 0.011778 0.016073 0.027098 0.005649 0.028217
min 0.170143 0.129167 0.216100 0.022042 0.296229
25% 0.179190 0.140159 0.262216 0.026786 0.339826
50% 0.188742 0.155633 0.275872 0.028190 0.349366
75% 0.196874 0.160312 0.292362 0.032325 0.374658
max 0.211001 0.199813 0.314280 0.045884 0.407943

AdaBoost#

# AdaBoost regressor with its default hyper-parameters.
model = AdaBoostRegressor()

# Service capacity and service utility columns used as predictors.
features = [
    'gp_appt_available',
    '111_111_offered',
    'amb_sys_answered',
    '111_111_answered',
    'amb_sys_made',
]

# Cross-validate the model on the selected features.
results = fit_model(dta, model, features)

Performance#

# One row per cross-validation split: score on the held-out fold and
# score on the training folds.
res = pd.DataFrame({
    'test_score': results['test_score'],
    'train_score': results['train_score'],
})

res.describe()
test_score train_score
count 25.000000 25.000000
mean 0.353207 0.398218
std 0.062387 0.020489
min 0.214903 0.348453
25% 0.305553 0.386198
50% 0.338738 0.399499
75% 0.396985 0.414336
max 0.482296 0.440337

Coefficients#

# One row per estimator fitted during cross-validation, one column per
# feature, holding that estimator's feature_importances_.
importances = [est.feature_importances_ for est in results['estimator']]
coefs = pd.DataFrame(importances, columns=features)

coefs.describe()
gp_appt_available 111_111_offered amb_sys_answered 111_111_answered amb_sys_made
count 25.000000 25.000000 25.000000 25.000000 25.000000
mean 0.131013 0.136810 0.341245 0.007087 0.383844
std 0.018367 0.051606 0.059225 0.007437 0.079703
min 0.094617 0.069596 0.253128 0.000000 0.208886
25% 0.114694 0.111798 0.292980 0.001893 0.319977
50% 0.133880 0.125812 0.333916 0.004246 0.403523
75% 0.141702 0.161672 0.382629 0.010357 0.438641
max 0.172065 0.268240 0.496221 0.026677 0.505979

Summary#

  • Extra Trees does not perform well

  • Random forest with default parameters is overfitting to the training data

  • Gradient boosted trees performs best