# Model performance with imputed data


This notebook demonstrates that using KNN to impute missing data leads to a drop in model performance.

Code to generate the imputed datasets can be found here

#turn warnings off to keep notebook tidy
import warnings

## Import libraries

import os
import pandas as pd
import numpy as np
import pickle as pkl

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import r2_score as r2

import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline'ggplot')

## Function to group data

def group_data(data, features):
    """Collapse ``data`` to one row per distinct ``population`` value.

    Rows sharing a ``population`` value are merged into a single record so
    that the same point cannot end up in both the train and the test split.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``population`` column, every column named in
        ``features`` and ``ae_attendances_attendances``.
    features : list of str
        Columns to carry over. For each group the first unique value of the
        column is kept.

    Returns
    -------
    pandas.DataFrame
        One row per population group with the ``features`` columns followed
        by ``ae_attendances_attendances`` (the group mean). Empty input gives
        an empty frame with those columns.
    """
    target = 'ae_attendances_attendances'

    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and growing a frame row by row is quadratic.
    records = []

    #ensure no identical points in train and test
    for _, group in data.groupby('population'):
        record = {f: group[f].unique()[0] for f in features}
        record[target] = group[target].mean()
        records.append(record)

    return pd.DataFrame(records, columns=[*features, target])

## Functions to fit MGSR

def fit_ph(dta, features, model):
    """Attach out-of-fold population-health predictions to ``dta``.

    The data are first collapsed to one row per population with
    ``group_data``. ``model`` is then fitted on four folds and used to
    predict the held-out fold, so every grouped row receives a prediction
    from a model that did not see it. The predictions are merged back onto
    ``dta`` by ``population`` as the column ``ae_predicted``.

    Parameters
    ----------
    dta : pandas.DataFrame
        Input rows; an existing ``ae_predicted`` column is dropped first.
    features : list of str
        Predictor columns; must include ``population`` because it is the
        merge key.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is refitted in
        place once per fold.

    Returns
    -------
    pandas.DataFrame
        ``dta`` merged (pandas default inner join) with the out-of-fold
        ``ae_predicted`` values.
    """
    if 'ae_predicted' in dta.columns:
        dta = dta.drop(['ae_predicted'], axis=1)

    grouped = group_data(dta, features)
    y = grouped['ae_attendances_attendances']
    X = grouped[features]

    # NOTE(review): the original comment said the random state is left unset
    # so the function can be reused inside an outer CV, but the code fixes
    # random_state=1. The seed is kept so results stay unchanged -- confirm
    # which behaviour is intended.
    cv = RepeatedKFold(n_splits=5, n_repeats=1, random_state=1)

    # Gather the per-fold frames and concatenate once (DataFrame.append was
    # removed in pandas 2.0).
    fold_results = []
    for train_index, test_index in cv.split(X, y):
        model.fit(X.iloc[train_index], y.iloc[train_index])
        test = X.iloc[test_index].copy()
        test['ae_predicted'] = model.predict(X.iloc[test_index])
        fold_results.append(test)
    results = pd.concat(fold_results, ignore_index=True)

    dta = dta.merge(results[['population', 'ae_predicted']],
                    left_on='population', right_on='population')
    return dta
def fit_capacity(dta, features, model):
    """Fit ``model`` to predict ``ae_attendances_attendances`` from ``features``.

    Parameters
    ----------
    dta : pandas.DataFrame
        Training rows containing ``features`` and the target column.
    features : list of str
        Predictor column names.
    model : estimator
        Object exposing ``fit(X, y)``; it is fitted in place.

    Returns
    -------
    estimator
        The same ``model`` object after fitting.
    """
    y = dta['ae_attendances_attendances']
    X = dta[features]
    model.fit(X, y)
    return model
def fit_combined(train, rf1, m1_features, train_size=7/8):
    """Fit the capacity model and the linear model that combines both predictions.

    ``train`` is split in two. The capacity model ``rf1`` is fitted on the
    first part; its predictions on the second part, together with the
    pre-computed ``ae_predicted`` column, are the two inputs of a
    ``LinearRegression`` fitted against ``ae_attendances_attendances``.

    Parameters
    ----------
    train : pandas.DataFrame
        Training rows; must contain ``m1_features``, ``ae_predicted`` and
        ``ae_attendances_attendances``.
    rf1 : estimator
        Capacity model exposing ``fit`` and ``predict``; fitted in place.
    m1_features : list of str
        Predictor columns for the capacity model.
    train_size : float, default 7/8
        Share of ``train`` used to fit ``rf1``; the rest fits the combiner.

    Returns
    -------
    tuple
        ``(rf1, final)``: the fitted capacity model and the fitted
        ``LinearRegression`` combiner.
    """
    final = LinearRegression()

    #split training data into two sets
    # NOTE(review): the arguments of this call were lost in the export; only
    # train_size is restored. If the original also fixed a random_state,
    # re-add it to make the split reproducible.
    train_0, train_1 = train_test_split(train, train_size=train_size)

    #train capacity model
    rf1 = fit_capacity(train_0, m1_features, rf1)

    #predict monthly attendances
    y_pred_1 = rf1.predict(train_1[m1_features])

    #use pre-predicted average attendances
    y_pred_2 = train_1['ae_predicted']

    # Two-column design matrix: [capacity prediction, population-health prediction]
    X_f = np.vstack([y_pred_1, y_pred_2]).T
    y_f = train_1['ae_attendances_attendances']
    final.fit(X_f, y_f)

    return rf1, final
def cv_combined(dta, rf1, rf2):
    """Cross-validate the combined model with 5-fold CV repeated 5 times.

    Out-of-fold population-health predictions are first attached to ``dta``
    with ``fit_ph``. For every split the capacity model and the linear
    combiner are fitted on the training rows with ``fit_combined`` and
    scored on the test rows.

    Parameters
    ----------
    dta : pandas.DataFrame
        Must contain the capacity and population-health feature columns
        listed below and ``ae_attendances_attendances``.
    rf1 : estimator
        Capacity model (refitted on every split).
    rf2 : estimator
        Population-health model, passed to ``fit_ph``.

    Returns
    -------
    tuple
        ``(scores_final, scores_rf1, scores_rf2, dta_pred, coefs)`` where the
        three score lists hold one R^2 value per split for the combined,
        capacity and population-health predictions, ``dta_pred`` stacks the
        test rows of every split with ``predicted``, ``true`` and ``iter``
        columns, and ``coefs`` holds the combiner coefficients per split.
    """
    # splitter for cross validation
    cv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=1)
    scores_final, scores_rf1, scores_rf2, coefs = [], [], [], []
    capacity_features = ['gp_appt_available',
            '111_111_offered', 'amb_sys_answered']

    pophealth_features = ['population',
                            'People', 'Places', 'Lives']
    target = 'ae_attendances_attendances'

    #fit population health independently to avoid data leakage
    dta = fit_ph(dta, pophealth_features, rf2)

    # Per-split frames are concatenated once at the end (DataFrame.append
    # was removed in pandas 2.0).
    fold_preds = []

    # k numbers the splits from 1 and is stored in the 'iter' column.
    for k, (train_index, test_index) in enumerate(cv.split(dta), start=1):
        train = dta.iloc[train_index]
        test = dta.iloc[test_index]
        y_test = test[target]

        #final models
        rf1, final = fit_combined(train, rf1, capacity_features)
        # NOTE(review): the statements filling coefs, scores_rf1 and
        # scores_rf2 were missing from the exported source and have been
        # reconstructed -- confirm against the original notebook.
        coefs.append(final.coef_)

        #predict on test data
        y_pred_cu = rf1.predict(test[capacity_features])
        scores_rf1.append(rf1.score(test[capacity_features], y_test))

        y_pred_ph = test['ae_predicted']
        scores_rf2.append(r2(y_test, y_pred_ph))

        X_final = np.vstack([y_pred_cu, y_pred_ph]).T
        preds = final.predict(X_final)
        scores_final.append(final.score(X_final, y_test))

        test_pred = test.copy()
        test_pred['predicted'] = preds
        test_pred['true'] = y_test.values
        test_pred['iter'] = k
        fold_preds.append(test_pred)

    dta_pred = pd.concat(fold_preds, ignore_index=False)
    return scores_final, scores_rf1, scores_rf2, dta_pred, coefs
def fit_(dta):
    """Cross-validate the combined model on ``dta`` with fixed hyper-parameters.

    Builds the two random forests (capacity and population health), hands
    them to ``cv_combined`` and returns only the three score lists.

    Returns
    -------
    tuple
        ``(scores_final, scores_rf1, scores_rf2)`` as produced by
        ``cv_combined``; its prediction frame and coefficients are discarded.
    """
    # capacity model: 6 shallow trees
    capacity_model = RandomForestRegressor(
        n_estimators=6, max_depth=5, random_state=0
    )

    # population health model: 4 shallow trees
    pophealth_model = RandomForestRegressor(
        n_estimators=4, max_depth=5, random_state=0
    )

    (final_scores, capacity_scores, pophealth_scores,
     _predictions, _coefficients) = cv_combined(dta, capacity_model, pophealth_model)

    return final_scores, capacity_scores, pophealth_scores

## Performance with increasing N

# One frame per score type; each neighbour count N becomes a column holding
# the per-split cross-validation scores for that imputed dataset.
results_f, results_ph, results_c = \
    pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

for N in range(3,12):
    print(f'Running for {N} neighbours')
    dta = pd.read_csv(f'{N}.csv')
    scores_final, scores_rf1, scores_rf2 = fit_(dta)
    results_f[N] = scores_final
    results_c[N] = scores_rf1
    results_ph[N] = scores_rf2
Running for 3 neighbours
Running for 4 neighbours
Running for 5 neighbours
Running for 6 neighbours
Running for 7 neighbours
Running for 8 neighbours
Running for 9 neighbours
Running for 10 neighbours
Running for 11 neighbours
3 4 5 6 7 8 9 10 11
count 25.000000 25.000000 25.000000 25.000000 25.000000 25.000000 25.000000 25.000000 25.000000
mean 0.677970 0.656065 0.652599 0.667776 0.676278 0.674339 0.675685 0.678412 0.669751
std 0.027362 0.030733 0.031020 0.030533 0.033196 0.029792 0.032136 0.031935 0.032517
min 0.637825 0.585474 0.566965 0.597345 0.605514 0.604092 0.603620 0.598016 0.589145
25% 0.656302 0.641851 0.639891 0.651375 0.655613 0.656757 0.654456 0.665271 0.651121
50% 0.677774 0.659111 0.650244 0.671206 0.683557 0.678927 0.681292 0.677280 0.673027
75% 0.693470 0.672966 0.671929 0.686971 0.700101 0.689730 0.703985 0.702120 0.697082
max 0.740517 0.712382 0.707859 0.717031 0.721431 0.721879 0.717853 0.728476 0.718087
3 4 5 6 7 8 9 10 11
count 25.000000 25.000000 25.000000 25.000000 25.000000 25.000000 25.000000 25.000000 25.000000
mean 0.367470 0.364291 0.369041 0.367097 0.366545 0.366735 0.363228 0.363488 0.363705
std 0.045226 0.034816 0.031281 0.031746 0.030744 0.033396 0.030966 0.031947 0.030925
min 0.272866 0.293281 0.319399 0.314474 0.317208 0.303608 0.307112 0.306028 0.300983
25% 0.334477 0.345569 0.348645 0.346713 0.341673 0.339048 0.346306 0.343710 0.352902
50% 0.369753 0.358013 0.362384 0.362563 0.360874 0.362219 0.360776 0.363622 0.363868
75% 0.402880 0.392535 0.388410 0.389388 0.387030 0.395536 0.374270 0.379922 0.379235
max 0.453787 0.426858 0.436122 0.427093 0.421189 0.426304 0.420213 0.420988 0.412676
3 4 5 6 7 8 9 10 11
count 25.000000 25.000000 25.000000 25.000000 25.000000 25.000000 25.000000 25.000000 25.000000
mean 0.621935 0.599715 0.582703 0.612405 0.641546 0.619368 0.639331 0.645083 0.632755
std 0.045133 0.046658 0.056971 0.046093 0.047715 0.043564 0.046994 0.045441 0.046717
min 0.536850 0.502103 0.407050 0.507080 0.521780 0.512097 0.517877 0.519797 0.506491
25% 0.602810 0.575509 0.567699 0.585493 0.618905 0.586784 0.616018 0.622951 0.606162
50% 0.616120 0.599707 0.584805 0.621687 0.656599 0.621118 0.649596 0.651163 0.638635
75% 0.656224 0.632352 0.626156 0.643512 0.682703 0.648872 0.678210 0.680461 0.671209
max 0.696132 0.667365 0.662464 0.681470 0.705500 0.685195 0.702974 0.710714 0.695574