# KNN imputation

import os
import pandas as pd
import numpy as np

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
# Load the scaled master dataset straight from the project's GitHub
# repository, using the CSV's first column as the row index.
master = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/CharlotteJames/ed-forecast/main/data/master_scaled_impute.csv',
    index_col=0,
)
# Preview the first few rows.
master.head()
ccg month 111_111_offered 111_111_answered amb_sys_made amb_sys_answered gp_appt_attended gp_appt_available ae_attendances_attendances population People Places Lives year
0 00N Jan 568.479362 459.899870 216.450677 148.439545 4338.335607 4738.295678 385.585466 15.0265 94.8 101.7 96.8 2018
1 00N Feb 472.022555 395.194004 204.869142 147.335865 3704.655109 4074.002595 347.519382 15.0265 94.8 101.7 96.8 2018
2 00N Mar 541.224032 453.863381 233.092813 168.121234 3907.696403 4321.498686 383.389345 15.0265 94.8 101.7 96.8 2018
3 00N Apr 506.059962 438.172926 210.990836 149.848422 3721.092736 4089.042691 396.299870 15.0265 94.8 101.7 96.8 2018
4 00N May 517.326603 452.985246 223.273261 164.595494 3848.800453 4224.337005 422.187469 15.0265 94.8 101.7 96.8 2018
# Show the frame's dimensions as (rows, columns).
master.shape
(1920, 14)
# Columns handed to the imputer: everything from the third column onward.
# The two skipped leading columns are `ccg` and `month` in the preview output.
cols = master.columns[2:]

## KNN

%%time

# Impute the feature columns with a KNNImputer for every neighbourhood size
# N in 3..11 and write one CSV per N.
output_dir = '../data/imputed'
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

for N in range(3, 12):

    knn = KNNImputer(n_neighbors=N)

    # fit_transform returns a bare array, so the row labels are lost. Rebuild
    # the frame on master's own index: the identifier columns assigned below
    # are aligned by index label, and a fresh 0..n-1 index would silently
    # produce NaN/misaligned values whenever master's index is not 0..n-1.
    master_imputed = pd.DataFrame(
        knn.fit_transform(master[cols]),
        columns=cols,
        index=master.index,
    )

    # Re-attach the identifier columns that were excluded from imputation.
    master_imputed['month'] = master['month']
    master_imputed['ccg'] = master['ccg']

    master_imputed.to_csv(f'{output_dir}/master_imputed_{N}.csv')
CPU times: user 2.71 s, sys: 972 ms, total: 3.68 s
Wall time: 829 ms