{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "740a2d15",
   "metadata": {},
   "source": [
    "# KNN imputation\n",
    "\n",
    "Fill the missing values in `master_scaled_impute.csv` with scikit-learn's `KNNImputer`, saving one imputed copy of the data for each neighbour count `N` from 3 to 11."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "bfae286d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "from sklearn.experimental import enable_iterative_imputer\n",
    "from sklearn.impute import IterativeImputer\n",
    "from sklearn.impute import KNNImputer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "f65f7221",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Source of the dataset to impute, kept as a named constant so it is easy to find and change.\n",
    "MASTER_CSV_URL = (\n",
    "    'https://raw.githubusercontent.com/CharlotteJames/ed-forecast/'\n",
    "    'main/data/master_scaled_impute.csv'\n",
    ")\n",
    "\n",
    "# Use the first CSV column as the row index rather than as a data column.\n",
    "master = pd.read_csv(MASTER_CSV_URL, index_col=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "6ab9ea01",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ccgmonth111_111_offered111_111_answeredamb_sys_madeamb_sys_answeredgp_appt_attendedgp_appt_availableae_attendances_attendancespopulationPeoplePlacesLivesyear
000NJan568.479362459.899870216.450677148.4395454338.3356074738.295678385.58546615.026594.8101.796.82018
100NFeb472.022555395.194004204.869142147.3358653704.6551094074.002595347.51938215.026594.8101.796.82018
200NMar541.224032453.863381233.092813168.1212343907.6964034321.498686383.38934515.026594.8101.796.82018
300NApr506.059962438.172926210.990836149.8484223721.0927364089.042691396.29987015.026594.8101.796.82018
400NMay517.326603452.985246223.273261164.5954943848.8004534224.337005422.18746915.026594.8101.796.82018
\n", "
" ],
      "text/plain": [
       " ccg month 111_111_offered 111_111_answered amb_sys_made \\\n",
       "0 00N Jan 568.479362 459.899870 216.450677 \n",
       "1 00N Feb 472.022555 395.194004 204.869142 \n",
       "2 00N Mar 541.224032 453.863381 233.092813 \n",
       "3 00N Apr 506.059962 438.172926 210.990836 \n",
       "4 00N May 517.326603 452.985246 223.273261 \n",
       "\n",
       " amb_sys_answered gp_appt_attended gp_appt_available \\\n",
       "0 148.439545 4338.335607 4738.295678 \n",
       "1 147.335865 3704.655109 4074.002595 \n",
       "2 168.121234 3907.696403 4321.498686 \n",
       "3 149.848422 3721.092736 4089.042691 \n",
       "4 164.595494 3848.800453 4224.337005 \n",
       "\n",
       " ae_attendances_attendances population People Places Lives year \n",
       "0 385.585466 15.0265 94.8 101.7 96.8 2018 \n",
       "1 347.519382 15.0265 94.8 101.7 96.8 2018 \n",
       "2 383.389345 15.0265 94.8 101.7 96.8 2018 \n",
       "3 396.299870 15.0265 94.8 101.7 96.8 2018 \n",
       "4 422.187469 15.0265 94.8 101.7 96.8 2018 "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "master.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "837248df",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1920, 14)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "master.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "d23fd377",
   "metadata": {},
   "outputs": [],
   "source": [
    "cols = master.columns[2:]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0a899816",
   "metadata": {},
   "source": [
    "## KNN"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "59548750",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 2.27 s, sys: 796 ms, total: 3.07 s\n",
      "Wall time: 775 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# Every run writes into this folder; create it up front so the cell also\n",
    "# works on a fresh checkout where the folder does not exist yet.\n",
    "IMPUTED_DIR = '../data/imputed'\n",
    "os.makedirs(IMPUTED_DIR, exist_ok=True)\n",
    "\n",
    "# The columns handed to the imputer do not depend on N, so select them once.\n",
    "features = master[cols]\n",
    "\n",
    "# One imputed dataset per neighbour count N = 3..11.\n",
    "for N in range(3, 12):\n",
    "\n",
    "    knn = KNNImputer(n_neighbors=N)\n",
    "\n",
    "    # Build the frame on master's own index. The column assignments below\n",
    "    # align on index labels, so a default RangeIndex would misplace (or\n",
    "    # NaN-fill) 'month' and 'ccg' whenever master's index is not 0..n-1.\n",
    "    master_imputed = pd.DataFrame(knn.fit_transform(features),\n",
    "                                  columns=cols, index=master.index)\n",
    "\n",
    "    # Re-attach the first two columns of master, which are excluded from cols.\n",
    "    master_imputed['month'] = master['month']\n",
    "    master_imputed['ccg'] = master['ccg']\n",
    "\n",
    "    master_imputed.to_csv(f'{IMPUTED_DIR}/master_imputed_{N}.csv')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}