Model performance with imputed data
Contents
Model performance with imputed data#
Overview#
This notebook demonstrates that using KNN to impute missing data leads to a drop in model performance.
Code to generate the imputed datasets can be found here
#turn warnings off to keep notebook tidy
import warnings
warnings.filterwarnings('ignore')
Import libraries#
import os
import pandas as pd
import numpy as np
import pickle as pkl
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import r2_score as r2
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
Function to group data#
def group_data(data, features):
features = ['population',
'People', 'Places',
'Lives']
#ensure no identical points in train and test
grouped = pd.DataFrame()
for pop, group in data.groupby('population'):
#if len(group.lives.unique())>1:
#print('multiple CCG with same population')
ccg_year = pd.Series(dtype='float64')
for f in features:
ccg_year[f] = group[f].unique()[0]
ccg_year['ae_attendances_attendances']\
= group.ae_attendances_attendances.mean()
grouped = grouped.append(ccg_year, ignore_index=True)
return grouped
Functions to fit MGSR#
def fit_ph(dta, features, model):
if 'ae_predicted' in dta.columns:
dta = dta.drop(['ae_predicted'], axis=1)
grouped = group_data(dta, features)
y = grouped['ae_attendances_attendances']
X = grouped[features]
# dont set random state so that function can be used in overall cv
cv = RepeatedKFold(n_splits=5, n_repeats=1, random_state=1)
results = pd.DataFrame()
for train_index, test_index in cv.split(X, y):
model.fit(X.iloc[train_index], y.iloc[train_index])
test = X.iloc[test_index].copy()
test['ae_predicted'] = model.predict(X.iloc[test_index])
results = results.append(test, ignore_index=True)
dta = dta.merge(results[['population','ae_predicted']],
left_on='population', right_on='population')
return dta
def fit_capacity(dta, features, model):
y = dta['ae_attendances_attendances']
X = dta[features]
model.fit(X,y)
return model
def fit_combined(train, rf1, m1_features, train_size=7/8):
final = LinearRegression()
#split training data into two sets
train_0, train_1 = train_test_split(train,
train_size=train_size,
random_state=29)
#train capactiy model
rf1 = fit_capacity(train_0, m1_features, rf1)
#predict monthly attendances
y_pred_1 = rf1.predict(train_1[m1_features])
#use pre-predicted average attendances
y_pred_2 = train_1['ae_predicted']
#final
X_f = np.vstack([y_pred_1, y_pred_2]).T
y_f = train_1['ae_attendances_attendances']
final.fit(X_f,y_f)
return rf1,final
def cv_combined(dta, rf1, rf2):
# splitter for cross validation
cv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=1)
scores_final, scores_rf1, scores_rf2, coefs = [],[],[],[]
k=1
capacity_features = ['gp_appt_available',
'111_111_offered', 'amb_sys_answered']
pophealth_features = ['population',
'People', 'Places', 'Lives']
dta_pred = pd.DataFrame()
#fit population health independently to avoid data leakage
dta = fit_ph(dta, pophealth_features, rf2)
for train_index, test_index in cv.split(dta):
#print(f'\n Split {k} \n')
train = dta.iloc[train_index]
test = dta.iloc[test_index]
#final models
rf1, final = fit_combined(train, rf1, capacity_features)
coefs.append(final.coef_)
#predict on test data
y_pred_cu = rf1.predict(test[capacity_features])
scores_rf1.append(rf1.score(test[capacity_features],
test['ae_attendances_attendances']))
y_pred_ph = test['ae_predicted']
scores_rf2.append(r2(test['ae_attendances_attendances'],
test['ae_predicted']))
preds = final.predict(np.vstack([y_pred_cu, y_pred_ph]).T)
scores_final.append(final.score(np.vstack([y_pred_cu, y_pred_ph]).T,
test['ae_attendances_attendances']))
test_pred = test.copy()
test_pred['predicted'] = preds
test_pred['true'] = test['ae_attendances_attendances'].values
test_pred['iter'] = [k for i in test_pred.index]
dta_pred = dta_pred.append(test_pred, ignore_index=False)
k+=1
return scores_final, scores_rf1, scores_rf2, dta_pred, coefs
def fit_(dta):
#capacity model
rf1 = RandomForestRegressor(max_depth=5, n_estimators=6, random_state=0)
#population health model
rf2 = RandomForestRegressor(max_depth=5, n_estimators=4, random_state=0)
scores_final, scores_rf1, scores_rf2, \
dta_pred, coefs = cv_combined(dta, rf1, rf2)
return scores_final, scores_rf1, scores_rf2
Performance with increasing N#
results_f, results_ph, results_c = \
pd.DataFrame(),pd.DataFrame(),pd.DataFrame()
for N in range(3,12):
print(f'Running for {N} neighbours')
dta = pd.read_csv(f'https://raw.githubusercontent.com/CharlotteJames/ed-forecast/main/data/imputed/master_imputed_{N}.csv')
scores_final, scores_rf1, scores_rf2 = fit_(dta)
results_f[N] = scores_final
results_c[N] = scores_rf1
results_ph[N] = scores_rf2
Running for 3 neighbours
Running for 4 neighbours
Running for 5 neighbours
Running for 6 neighbours
Running for 7 neighbours
Running for 8 neighbours
Running for 9 neighbours
Running for 10 neighbours
Running for 11 neighbours
results_f.describe()
3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | |
---|---|---|---|---|---|---|---|---|---|
count | 25.000000 | 25.000000 | 25.000000 | 25.000000 | 25.000000 | 25.000000 | 25.000000 | 25.000000 | 25.000000 |
mean | 0.677970 | 0.656065 | 0.652599 | 0.667776 | 0.676278 | 0.674339 | 0.675685 | 0.678412 | 0.669751 |
std | 0.027362 | 0.030733 | 0.031020 | 0.030533 | 0.033196 | 0.029792 | 0.032136 | 0.031935 | 0.032517 |
min | 0.637825 | 0.585474 | 0.566965 | 0.597345 | 0.605514 | 0.604092 | 0.603620 | 0.598016 | 0.589145 |
25% | 0.656302 | 0.641851 | 0.639891 | 0.651375 | 0.655613 | 0.656757 | 0.654456 | 0.665271 | 0.651121 |
50% | 0.677774 | 0.659111 | 0.650244 | 0.671206 | 0.683557 | 0.678927 | 0.681292 | 0.677280 | 0.673027 |
75% | 0.693470 | 0.672966 | 0.671929 | 0.686971 | 0.700101 | 0.689730 | 0.703985 | 0.702120 | 0.697082 |
max | 0.740517 | 0.712382 | 0.707859 | 0.717031 | 0.721431 | 0.721879 | 0.717853 | 0.728476 | 0.718087 |
results_c.describe()
3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | |
---|---|---|---|---|---|---|---|---|---|
count | 25.000000 | 25.000000 | 25.000000 | 25.000000 | 25.000000 | 25.000000 | 25.000000 | 25.000000 | 25.000000 |
mean | 0.367470 | 0.364291 | 0.369041 | 0.367097 | 0.366545 | 0.366735 | 0.363228 | 0.363488 | 0.363705 |
std | 0.045226 | 0.034816 | 0.031281 | 0.031746 | 0.030744 | 0.033396 | 0.030966 | 0.031947 | 0.030925 |
min | 0.272866 | 0.293281 | 0.319399 | 0.314474 | 0.317208 | 0.303608 | 0.307112 | 0.306028 | 0.300983 |
25% | 0.334477 | 0.345569 | 0.348645 | 0.346713 | 0.341673 | 0.339048 | 0.346306 | 0.343710 | 0.352902 |
50% | 0.369753 | 0.358013 | 0.362384 | 0.362563 | 0.360874 | 0.362219 | 0.360776 | 0.363622 | 0.363868 |
75% | 0.402880 | 0.392535 | 0.388410 | 0.389388 | 0.387030 | 0.395536 | 0.374270 | 0.379922 | 0.379235 |
max | 0.453787 | 0.426858 | 0.436122 | 0.427093 | 0.421189 | 0.426304 | 0.420213 | 0.420988 | 0.412676 |
results_ph.describe()
3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | |
---|---|---|---|---|---|---|---|---|---|
count | 25.000000 | 25.000000 | 25.000000 | 25.000000 | 25.000000 | 25.000000 | 25.000000 | 25.000000 | 25.000000 |
mean | 0.621935 | 0.599715 | 0.582703 | 0.612405 | 0.641546 | 0.619368 | 0.639331 | 0.645083 | 0.632755 |
std | 0.045133 | 0.046658 | 0.056971 | 0.046093 | 0.047715 | 0.043564 | 0.046994 | 0.045441 | 0.046717 |
min | 0.536850 | 0.502103 | 0.407050 | 0.507080 | 0.521780 | 0.512097 | 0.517877 | 0.519797 | 0.506491 |
25% | 0.602810 | 0.575509 | 0.567699 | 0.585493 | 0.618905 | 0.586784 | 0.616018 | 0.622951 | 0.606162 |
50% | 0.616120 | 0.599707 | 0.584805 | 0.621687 | 0.656599 | 0.621118 | 0.649596 | 0.651163 | 0.638635 |
75% | 0.656224 | 0.632352 | 0.626156 | 0.643512 | 0.682703 | 0.648872 | 0.678210 | 0.680461 | 0.671209 |
max | 0.696132 | 0.667365 | 0.662464 | 0.681470 | 0.705500 | 0.685195 | 0.702974 | 0.710714 | 0.695574 |