Developing the Population Health Model#

Overview#

This notebook contains the code to develop the population health model.

Initially, three different models are compared: Linear Regression, Random Forest regression and Gradient Boosted regression.

Hyper-parameters of the best model are then fine-tuned to maximise performance on unseen data while preventing over-fitting and minimising model complexity.

# Turn warnings off to keep the notebook tidy
import warnings
warnings.filterwarnings('ignore')

Import libraries#

import os
import pandas as pd
import numpy as np
import pickle as pkl

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor


from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedKFold


import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

Import data#

dta = pd.read_csv('https://raw.githubusercontent.com/CharlotteJames/ed-forecast/main/data/master_scaled_new_pop.csv',
                  index_col=0)

# Replace '/' in column names with '_' so they are valid identifiers
dta.columns = ['_'.join([c.split('/')[0], c.split('/')[-1]])
               if '/' in c else c for c in dta.columns]
dta.shape
(1618, 17)
dta.describe()
       111_111_offered  111_111_answered  amb_sys_made  amb_sys_answered  gp_appt_available  ae_attendances_attendances   population       People       Places        Lives         year         %>65         %<15         N>65         N<15
count      1618.000000       1618.000000   1618.000000       1618.000000        1618.000000                 1618.000000  1618.000000  1618.000000  1618.000000  1618.000000  1618.000000  1618.000000  1618.000000  1618.000000  1618.000000
mean        330.444458        286.600228    455.922757        340.097999        4359.178114                  486.615273    63.312790    99.208970    99.656618   100.787827  2018.501854    18.463638    18.056074    11.219593    11.545206
std         153.095006        132.348601    353.046416        275.127125         776.297557                  275.070544    47.620833     5.817237     2.423154     5.738246     0.500151     3.794561     1.548570     7.767128     9.122457
min           0.707731          0.664178    141.067451        105.089248        1299.605358                  101.447679    11.615900    84.800000    92.500000    87.400000  2018.000000     9.306882    14.560279     2.154300     1.791500
25%         265.012202        239.420469    207.797181        149.382117        3878.647249                  302.149393    26.529400    94.600000    98.000000    97.000000  2018.000000    16.073826    16.951092     4.781900     5.036400
50%         310.420752        265.552822    269.156617        216.466007        4376.215538                  384.680285    50.270400   100.150000    99.550000   101.350000  2019.000000    18.517093    18.153428     8.342500     8.471700
75%         349.608595        300.690725    628.356628        452.675621        4829.604169                  581.037869    86.540900   103.685714   101.400000   105.533333  2019.000000    20.560113    19.123772    16.364700    14.872900
max        1281.625143       1206.970308   1698.974593       1377.465562        6937.180617                 1460.369210   210.807400   111.200000   105.200000   113.300000  2019.000000    27.403900    21.763505    36.459800    42.117100

Add random feature#

# Add a random feature as a baseline for feature importance

rng = np.random.RandomState(0)
rand_var = rng.rand(dta.shape[0])
dta['rand1'] = rand_var
dta.shape
(1618, 18)

Function to aggregate data#

def group_data(data, features):
    """Aggregate rows that share a population value, so that no
    identical data points appear in both the train and test sets."""

    rows = []

    for pop, group in data.groupby('population'):

        ccg_year = pd.Series(dtype='float64')

        # Each feature is constant within a group; keep its first value
        for f in features:
            ccg_year[f] = group[f].unique()[0]

        # Average attendances across the rows in the group
        ccg_year['ae_attendances_attendances'] \
            = group.ae_attendances_attendances.mean()

        rows.append(ccg_year)

    # DataFrame.append was removed in pandas 2.0; build from a list instead
    return pd.DataFrame(rows).reset_index(drop=True)
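As a quick illustration of what the function does (toy numbers, not real data), two rows that share a population value collapse to a single row with the mean attendance:

# Toy frame (illustrative only): two rows share one population value
toy = pd.DataFrame({'population': [50.0, 50.0],
                    'People': [99.0, 99.0],
                    'ae_attendances_attendances': [300.0, 400.0]})

group_data(toy, ['population', 'People'])
#    population  People  ae_attendances_attendances
# 0        50.0    99.0                       350.0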

Model Comparison#

Features in the dataset that measure population and population health are:

  • population

  • People

  • Places

  • Lives

  • %>65

Pair plot#

features = ['population',
            'People', 'Places',
            'Lives', '%>65','rand1']

grouped = group_data(dta, features)

fig = sns.pairplot(grouped.select_dtypes(include=np.number),
                   kind="reg",
                   plot_kws={'line_kws':{'color':'black'}, 
                            'scatter_kws': 
                             {'color':'green','alpha': 0.1}},
                   diag_kws={'color':'red'})
[Image: pair plot of the grouped features (_images/population_health_model_19_0.png)]

Linear Regression#

model = LinearRegression()

features = ['population',
            'People', 'Places',
            'Lives', '%>65','rand1']

grouped = group_data(dta, features)
    
y = grouped['ae_attendances_attendances']
X = grouped[features]

# 5-fold cross-validation, repeated 5 times = 25 train/test splits
cv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=1)

scores_train, scores_test, feats = [], [], []

for train_index, test_index in cv.split(X, y):

    model.fit(X.iloc[train_index], y.iloc[train_index])

    # .score returns R^2 for regression models
    scores_test.append(model.score(X.iloc[test_index],
                                   y.iloc[test_index]))
    scores_train.append(model.score(X.iloc[train_index],
                                    y.iloc[train_index]))

    feats.append(model.coef_)
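cross_validate is imported above but never used; the same 25 scores could be obtained in a single call. A minimal sketch, equivalent to the loop above (the default scoring for a regressor is R², matching model.score):

# Sketch: the manual CV loop expressed with sklearn's cross_validate
cv_results = cross_validate(LinearRegression(), X, y, cv=cv,
                            return_train_score=True)

pd.DataFrame({'train': cv_results['train_score'],
              'test': cv_results['test_score']}).describe()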

Performance#

results=pd.DataFrame()
results['train'] = scores_train
results['test'] = scores_test
results.describe()
           train       test
count  25.000000  25.000000
mean    0.552068   0.371351
std     0.035222   0.343497
min     0.476669  -1.013811
25%     0.528894   0.346533
50%     0.556832   0.455708
75%     0.578657   0.556435
max     0.613870   0.656112

Feature Importance#

feat_imp = pd.DataFrame()

# For the linear model, 'importance' is the fitted coefficient of each feature
for i, f in enumerate(features):

    feat_imp[f] = np.array(feats)[:, i]

feat_imp.describe()
       population     People     Places      Lives       %>65       rand1
count   25.000000  25.000000  25.000000  25.000000  25.000000   25.000000
mean    -1.435290   5.844501  15.737868 -34.576050  -9.079814  123.746553
std      0.175190   3.047085   5.660999   4.188117   3.984531   27.108885
min     -1.730007   1.434910   4.349379 -43.042262 -18.441213   88.434422
25%     -1.513520   3.678942   9.979621 -36.339932 -11.124901  102.146007
50%     -1.421376   5.312968  16.724959 -34.109459  -9.472548  121.696265
75%     -1.358518   7.613663  19.380730 -32.524713  -5.919612  136.721486
max     -0.896284  12.445317  25.457699 -25.979856  -1.672032  191.783194

Random Forest#

model = RandomForestRegressor(max_depth=4, n_estimators=2, 
                              random_state=0)

y = grouped['ae_attendances_attendances']

X = grouped[features]

cv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=1)
    
scores_train, scores_test, feats = [],[],[]
    
for train_index, test_index in cv.split(X, y):
        
    model.fit(X.iloc[train_index], y.iloc[train_index])
        
    scores_test.append(model.score(X.iloc[test_index],
                                   y.iloc[test_index]))
    scores_train.append(model.score(X.iloc[train_index],
                                    y.iloc[train_index]))
    
    feats.append(model.feature_importances_)

Performance#

results=pd.DataFrame()
results['train'] = scores_train
results['test'] = scores_test
results.describe()
           train       test
count  25.000000  25.000000
mean    0.807108   0.565794
std     0.042312   0.287908
min     0.720547  -0.598879
25%     0.776415   0.560282
50%     0.809265   0.629445
75%     0.836854   0.685060
max     0.890000   0.834171

Feature Importance#

feat_imp = pd.DataFrame()

for i,f in enumerate(features):
    
    feat_imp[f] = np.array(feats)[:,i]

feat_imp.describe()
       population     People     Places      Lives       %>65      rand1
count   25.000000  25.000000  25.000000  25.000000  25.000000  25.000000
mean     0.435902   0.066185   0.062171   0.348578   0.066302   0.020863
std      0.171014   0.094662   0.052771   0.187946   0.034714   0.021494
min      0.117128   0.000000   0.000000   0.040872   0.009537   0.000000
25%      0.343149   0.011843   0.020668   0.222201   0.041677   0.000415
50%      0.419343   0.021413   0.052408   0.349634   0.071220   0.012603
75%      0.602196   0.100430   0.093383   0.447492   0.084021   0.030384
max      0.759784   0.306692   0.223416   0.766654   0.155546   0.075973
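Impurity-based importances from tree ensembles can be inflated for features with many unique values. As a cross-check (a sketch, not part of the original analysis), permutation importance on held-out data gives a model-agnostic view; this reuses the indices from the last CV split above:

from sklearn.inspection import permutation_importance

# Sketch: permutation importance on the last train/test split, as a
# cross-check on the impurity-based importances above
model.fit(X.iloc[train_index], y.iloc[train_index])

perm = permutation_importance(model, X.iloc[test_index],
                              y.iloc[test_index],
                              n_repeats=10, random_state=0)

pd.Series(perm.importances_mean,
          index=features).sort_values(ascending=False)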

Gradient Boosted Trees#

model = GradientBoostingRegressor(max_depth=5, n_estimators=5,
                                  random_state=1)

y = grouped['ae_attendances_attendances']

X = grouped[features]

cv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=1)
    
scores_train, scores_test, feats = [],[],[]
    
for train_index, test_index in cv.split(X, y):
        
    model.fit(X.iloc[train_index], y.iloc[train_index])
        
    scores_test.append(model.score(X.iloc[test_index],
                                   y.iloc[test_index]))
    scores_train.append(model.score(X.iloc[train_index],
                                    y.iloc[train_index]))
    
    feats.append(model.feature_importances_)

Performance#

results=pd.DataFrame()
results['train'] = scores_train
results['test'] = scores_test
results.describe()
           train       test
count  25.000000  25.000000
mean    0.593587   0.393527
std     0.014388   0.148094
min     0.564452  -0.128025
25%     0.584104   0.316253
50%     0.592172   0.401325
75%     0.605975   0.493652
max     0.625392   0.566740

Feature Importance#

feat_imp = pd.DataFrame()

for i,f in enumerate(features):
    
    feat_imp[f] = np.array(feats)[:,i]

feat_imp.describe()
       population     People     Places      Lives       %>65      rand1
count   25.000000  25.000000  25.000000  25.000000  25.000000  25.000000
mean     0.432119   0.051428   0.073296   0.326313   0.097161   0.019683
std      0.196302   0.051212   0.073320   0.229912   0.060103   0.014576
min      0.099755   0.004335   0.002015   0.033923   0.014763   0.000250
25%      0.264346   0.024739   0.017823   0.126553   0.052532   0.005641
50%      0.512337   0.036215   0.048561   0.203326   0.091596   0.015922
75%      0.604362   0.052902   0.091070   0.519404   0.138055   0.031943
max      0.718238   0.200017   0.264029   0.693263   0.266472   0.049490

Summary#

Linear Regression

  • Performance varies with different splits: mean \(R^2\) = 0.37 in the test sets, minimum \(R^2\) = -1.01. A negative \(R^2\) means the model predicts worse on that fold than simply using the fold mean.

Random Forest

  • Best performance, with mean \(R^2\) = 0.57 in test data.

  • Performance is also variable: minimum \(R^2\) = -0.60.

  • Feature importance is stable: population is most important, followed by Lives, People, then Places.

  • The random feature has low importance, which validates the importance of the other features.

Gradient Boosted Trees

  • Doesn't perform as well as the Random Forest: mean \(R^2\) = 0.39 in test data.

  • Performance is also variable: minimum \(R^2\) = -0.13.

  • Feature importance is not in agreement with the Random Forest, with Places more important than People.

Hyper-parameter tuning#

The best model is the Random Forest. To ensure the model is not over-fit to the training data, we compare performance as the following parameters are varied (a grid-search sketch of the same search follows this list):

  • max_depth: the maximum size of any tree

  • n_estimators: the number of trees in the forest
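The two loops below vary one parameter at a time. For reference, a full grid over both parameters could be run with sklearn's GridSearchCV; a minimal sketch (the parameter ranges mirror the loops below; everything else assumes default behaviour):

from sklearn.model_selection import GridSearchCV

# Sketch: exhaustive search over both hyper-parameters at once,
# scored with the same repeated 5-fold CV as the manual loops
param_grid = {'max_depth': range(1, 10),
              'n_estimators': range(1, 8)}

search = GridSearchCV(RandomForestRegressor(random_state=0),
                      param_grid,
                      cv=RepeatedKFold(n_splits=5, n_repeats=5,
                                       random_state=1),
                      return_train_score=True)
search.fit(X, y)
search.best_params_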

Maximum Depth#

d = [1,2,3,4,5,6,7,8,9]

res_train,res_test = [],[]

for depth in d:
    
    model = RandomForestRegressor(max_depth=depth, 
                                  n_estimators=3, random_state=0)

    y = grouped['ae_attendances_attendances']

    X = grouped[features]

    cv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=1)

    scores_train, scores_test = [],[]

    for train_index, test_index in cv.split(X, y):

        model.fit(X.iloc[train_index], y.iloc[train_index])

        scores_test.append(model.score(X.iloc[test_index],
                                       y.iloc[test_index]))
        scores_train.append(model.score(X.iloc[train_index],
                                        y.iloc[train_index]))
        
    res_train.append(scores_train)
    res_test.append(scores_test)

Plot#

fig,ax = plt.subplots(figsize=(8,5))

plt.plot(d, np.mean(res_train, axis=1), 'b--', label='train')
plt.plot(d, np.mean(res_test, axis=1), 'r--', label='test')

plt.fill_between(d, y1=(np.mean(res_train, axis=1)-np.std(res_train, axis=1)),
                 y2=(np.mean(res_train, axis=1)+np.std(res_train, axis=1)),
                 color='b', alpha=0.2)

plt.fill_between(d, y1=(np.mean(res_test, axis=1)-np.std(res_test, axis=1)),
                 y2=(np.mean(res_test, axis=1)+np.std(res_test, axis=1)), 
                 color='r', alpha=0.2)

plt.legend(loc='best')
plt.xlabel('Maximum Tree Depth')
plt.ylabel('Model performance')

plt.show()
[Image: model performance against maximum tree depth (_images/population_health_model_48_0.png)]

A depth of 7 is optimal: beyond this there is no improvement in performance on unseen data (test, red dashed line), while performance on the training data (blue dashed line) continues to increase, suggesting over-fitting.

Number of Trees#

n = [1,2,3,4,5,6,7]

res_train,res_test = [],[]

for est in n:
    
    model = RandomForestRegressor(max_depth=7, n_estimators=est,
                                  random_state=0)

    y = grouped['ae_attendances_attendances']

    X = grouped[features]

    cv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=1)

    scores_train, scores_test = [],[]

    for train_index, test_index in cv.split(X, y):

        model.fit(X.iloc[train_index], y.iloc[train_index])

        scores_test.append(model.score(X.iloc[test_index],
                                       y.iloc[test_index]))
        scores_train.append(model.score(X.iloc[train_index],
                                        y.iloc[train_index]))
        
    res_train.append(scores_train)
    res_test.append(scores_test)

Plot#

fig,ax = plt.subplots(figsize=(8,5))

plt.plot(n, np.mean(res_train, axis=1), 'b--', label='train')
plt.plot(n, np.mean(res_test, axis=1), 'r--', label='test')

plt.fill_between(n, y1=(np.mean(res_train, axis=1)-np.std(res_train, axis=1)),
                 y2=(np.mean(res_train, axis=1)+np.std(res_train, axis=1)),
                 color='b', alpha=0.2)

plt.fill_between(n, y1=(np.mean(res_test, axis=1)-np.std(res_test, axis=1)),
                 y2=(np.mean(res_test, axis=1)+np.std(res_test, axis=1)),
                 color='r', alpha=0.2)

plt.legend(loc='best')
plt.xlabel('Number of Trees')
plt.ylabel('Model performance')

plt.show()
[Image: model performance against number of trees (_images/population_health_model_53_0.png)]

The optimal number of trees is 4, beyond which there is no improvement in the training or test set.

Final Model for paper#

Fit the Random Forest with the optimal parameters:

model = RandomForestRegressor(max_depth=7, n_estimators=4,
                              random_state=0)

y = grouped['ae_attendances_attendances']

X = grouped[features]

cv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=1)
    
scores_train, scores_test, feats = [], [], []
    
for train_index, test_index in cv.split(X, y):
        
    model.fit(X.iloc[train_index], y.iloc[train_index])
        
    scores_test.append(model.score(X.iloc[test_index],
                                   y.iloc[test_index]))
    scores_train.append(model.score(X.iloc[train_index],
                                    y.iloc[train_index]))
    
    feats.append(model.feature_importances_)

Performance#

results=pd.DataFrame()
results['train'] = scores_train
results['test'] = scores_test
results.describe()
           train       test
count  25.000000  25.000000
mean    0.911347   0.622299
std     0.021012   0.219899
min     0.866569   0.066347
25%     0.902049   0.552788
50%     0.914108   0.681962
75%     0.926453   0.793663
max     0.938922   0.874207

Feature Importance#

feat_imp = pd.DataFrame()

for i,f in enumerate(features):
    
    feat_imp[f] = np.array(feats)[:,i]

feat_imp.describe()
       population     People     Places      Lives       %>65      rand1
count   25.000000  25.000000  25.000000  25.000000  25.000000  25.000000
mean     0.432119   0.051428   0.073296   0.326313   0.097161   0.019683
std      0.196302   0.051212   0.073320   0.229912   0.060103   0.014576
min      0.099755   0.004335   0.002015   0.033923   0.014763   0.000250
25%      0.264346   0.024739   0.017823   0.126553   0.052532   0.005641
50%      0.512337   0.036215   0.048561   0.203326   0.091596   0.015922
75%      0.604362   0.052902   0.091070   0.519404   0.138055   0.031943
max      0.718238   0.200017   0.264029   0.693263   0.266472   0.049490
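Save the model#

pickle is imported as pkl at the top of the notebook but otherwise unused; a minimal sketch for persisting the tuned model (the refit on all of the grouped data and the output filename are assumptions, not part of the original notebook):

# Sketch: refit the tuned model on all grouped data and save it with
# pickle. The filename is illustrative.
final_model = RandomForestRegressor(max_depth=7, n_estimators=4,
                                    random_state=0)
final_model.fit(X, y)

with open('population_health_model.pkl', 'wb') as f:
    pkl.dump(final_model, f)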