Linear Models#
Overview#
This notebook contains an initial exploration of different linear regression models for predicting monthly ED demand.

All models use the following variables:

- Service capacity (111, GP, Ambulance)
- Service utility (111, Ambulance)
- Population
- Population Health (People, Places, Lives)
# Turn warnings off to keep the notebook tidy
import warnings
warnings.filterwarnings('ignore')
Import libraries#
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoLarsIC
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lars
from sklearn.linear_model import OrthogonalMatchingPursuit as OMP
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import RFE
from sklearn.feature_selection import mutual_info_regression
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
Import data#
dta = pd.read_csv('https://raw.githubusercontent.com/CharlotteJames/ed-forecast/main/data/master_scaled_new.csv',
index_col=0)
# Flatten slash-separated column names ('a/b' becomes 'a_b')
dta.columns = ['_'.join([c.split('/')[0], c.split('/')[-1]])
               if '/' in c else c for c in dta.columns]
dta.ccg.unique().shape
(74,)
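For reference, the rename above joins the part before the first slash to the part after the last slash; a minimal illustration (the slashed input names are assumptions, inferred from the renamed columns that appear later in this notebook):

# Illustration only: assumed raw names; names without '/' are unchanged
examples = ['111/111_offered', 'amb_sys/answered', 'population']
print(['_'.join([c.split('/')[0], c.split('/')[-1]]) if '/' in c else c
       for c in examples])
# ['111_111_offered', 'amb_sys_answered', 'population']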
Add random feature#
# Add a random noise feature as a baseline: informative features
# should rank above it in any feature-selection step
rng = np.random.RandomState(0)
rand_var = rng.rand(dta.shape[0])
dta['rand1'] = rand_var
dta.shape
(1618, 14)
Train test split#
train, test = train_test_split(dta, random_state=29)
y = dta['ae_attendances_attendances']
X = dta.drop(['ae_attendances_attendances','ccg','month'], axis=1)
y_train = train['ae_attendances_attendances'].values
X_train = train.drop(['ae_attendances_attendances','ccg','month'], axis=1)
y_test = test['ae_attendances_attendances'].values
X_test = test.drop(['ae_attendances_attendances','ccg','month'], axis=1)
Ridge regression#
No scaling#
model = Ridge()
#model = make_pipeline(StandardScaler(), RidgeCV())
model.fit(X_train, y_train)
print(f'model score on training data: {model.score(X_train, y_train)}')
print(f'model score on testing data: {model.score(X_test, y_test)}')
model score on training data: 0.5440194513191048
model score on testing data: 0.5336104269604207
coefs = pd.DataFrame(
model.coef_,
columns=['Coefficients'], index=X_train.columns
)
coefs.plot(kind='barh', figsize=(9, 7))
plt.title('Ridge model')
plt.axvline(x=0, color='.5')
plt.subplots_adjust(left=.3)
![Bar chart of Ridge coefficients, unscaled features](_images/model_comparison_19_0.png)
Without scaling the features, it is not possible to compare their importance using the model coefficients: each coefficient depends on the units of its feature.
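As a rough workaround (a sketch, not part of the original analysis), the unscaled coefficients can be made comparable by multiplying each by the standard deviation of its feature, which approximates the coefficients of the standardised fit below:

# Sketch: rescale each unscaled coefficient by its feature's standard
# deviation to obtain unit-free, comparable values
coefs_comparable = pd.DataFrame(
    model.coef_ * X_train.std(axis=0).values,
    columns=['Coefficient x feature std'], index=X_train.columns
)
coefs_comparable.plot(kind='barh', figsize=(9, 7))
plt.title('Ridge model, coefficients rescaled by feature std')
plt.axvline(x=0, color='.5')
plt.subplots_adjust(left=.3)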
Standardise features#
model = make_pipeline(StandardScaler(), Ridge())
model.fit(X_train, y_train)
print(f'model score on training data: {model.score(X_train, y_train)}')
print(f'model score on testing data: {model.score(X_test, y_test)}')
model score on training data: 0.543812323234377
model score on testing data: 0.5336184653406383
coefs = pd.DataFrame(
model[1].coef_,
columns=['Coefficients'], index=X_train.columns
)
coefs.plot(kind='barh', figsize=(9, 7))
plt.title('Ridge model')
plt.axvline(x=0, color='.5')
plt.subplots_adjust(left=.3)
![Bar chart of Ridge coefficients, standardised features](_images/model_comparison_23_0.png)
Feature selection#
selector = RFECV(model[1], step=1, cv=5)
selector = selector.fit(X_train, y_train)
support = pd.DataFrame(
selector.support_,
columns=['Support'], index=X_train.columns
)
support
| Feature | Support |
|---|---|
| 111_111_offered | True |
| 111_111_answered | True |
| amb_sys_made | True |
| amb_sys_answered | True |
| gp_appt_available | False |
| population | True |
| People | True |
| Places | True |
| Lives | True |
| year | True |
| rand1 | True |
The random feature should not be supported; its selection here suggests that recursive feature elimination cannot be relied on to exclude uninformative features.
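A complementary sanity check (a sketch, not part of the original analysis) is permutation importance on the held-out test set, which should drive a pure-noise feature such as `rand1` towards zero:

# Sketch: permutation importance of the fitted pipeline on the test set;
# a noise feature should have mean importance close to zero
from sklearn.inspection import permutation_importance

perm = permutation_importance(model, X_test, y_test,
                              n_repeats=10, random_state=0)
perm_df = pd.DataFrame(
    perm.importances_mean,
    columns=['Permutation importance'], index=X_test.columns
)
perm_df.sort_values(by='Permutation importance', ascending=False)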
With cross-validation#
cv_model = cross_validate(
model, X,y,
cv=RepeatedKFold(n_splits=5, n_repeats=5, random_state=0),
return_estimator=True, n_jobs=2
)
coefs = pd.DataFrame(
[model[1].coef_
for model in cv_model['estimator']],
columns=X.columns
)
plt.figure(figsize=(9, 7))
sns.boxplot(data=coefs, orient='h', color='cyan', saturation=0.5)
plt.axvline(x=0, color='.5')
plt.xlabel('Coefficient importance')
plt.title('Coefficient importance and its variability')
plt.subplots_adjust(left=.3)
![Box plot of Ridge coefficient variability across CV folds](_images/model_comparison_30_0.png)
Lasso#
Standardise Features#
model = make_pipeline(StandardScaler(), Lasso(alpha=.015, max_iter=100000))
model.fit(X_train, y_train)
print(f'model score on training data: {model.score(X_train, y_train)}')
print(f'model score on testing data: {model.score(X_test, y_test)}')
model score on training data: 0.5440180818857432
model score on testing data: 0.5335852489275236
coefs = pd.DataFrame(
model[1].coef_,
columns=['Coefficients'], index=X_train.columns
)
coefs.plot(kind='barh', figsize=(9, 7))
plt.title('Lasso model, alpha=0.015')
plt.axvline(x=0, color='.5')
plt.subplots_adjust(left=.3)
![Bar chart of Lasso coefficients, standardised features](_images/model_comparison_34_0.png)
Feature selection#
selector = RFECV(model[1], step=1, cv=5)
selector = selector.fit(X_train, y_train)
support = pd.DataFrame(
selector.support_,
columns=['Support'], index=X_train.columns
)
support
| Feature | Support |
|---|---|
| 111_111_offered | True |
| 111_111_answered | True |
| amb_sys_made | True |
| amb_sys_answered | True |
| gp_appt_available | False |
| population | True |
| People | True |
| Places | True |
| Lives | True |
| year | True |
| rand1 | True |
Cross validate#
cv_model = cross_validate(
model, X,y,
cv=RepeatedKFold(n_splits=5, n_repeats=5, random_state=0),
return_estimator=True, n_jobs=2
)
coefs = pd.DataFrame(
[model[1].coef_
for model in cv_model['estimator']],
columns=X.columns
)
plt.figure(figsize=(9, 7))
sns.boxplot(data=coefs, orient='h', color='cyan', saturation=0.5)
plt.axvline(x=0, color='.5')
plt.xlabel('Coefficient importance')
plt.title('Coefficient importance and its variability')
plt.subplots_adjust(left=.3)
![Box plot of Lasso coefficient variability across CV folds](_images/model_comparison_40_0.png)
OLS#
Standardise Features#
model = make_pipeline(StandardScaler(), LinearRegression())
model.fit(X_train, y_train)
print(f'model score on training data: {model.score(X_train, y_train)}')
print(f'model score on testing data: {model.score(X_test, y_test)}')
model score on training data: 0.5440195223174749
model score on testing data: 0.5336193991596547
coefs = pd.DataFrame(
model[1].coef_,
columns=['Coefficients'], index=X_train.columns
)
coefs.plot(kind='barh', figsize=(9, 7))
plt.title('OLS')
plt.axvline(x=0, color='.5')
plt.subplots_adjust(left=.3)
![Bar chart of OLS coefficients, standardised features](_images/model_comparison_44_0.png)
Feature selection#
selector = RFECV(model[1], step=1, cv=5)
selector = selector.fit(X_train, y_train)
support = pd.DataFrame(
selector.support_,
columns=['Support'], index=X_train.columns
)
support
| Feature | Support |
|---|---|
| 111_111_offered | True |
| 111_111_answered | True |
| amb_sys_made | True |
| amb_sys_answered | True |
| gp_appt_available | False |
| population | True |
| People | True |
| Places | True |
| Lives | True |
| year | True |
| rand1 | True |
Cross validate#
cv_model = cross_validate(
model, X,y,
cv=RepeatedKFold(n_splits=5, n_repeats=5, random_state=0),
return_estimator=True, n_jobs=2
)
coefs = pd.DataFrame(
[model[1].coef_
for model in cv_model['estimator']],
columns=X.columns
)
plt.figure(figsize=(9, 7))
sns.boxplot(data=coefs, orient='h', color='cyan', saturation=0.5)
plt.axvline(x=0, color='.5')
plt.xlabel('Coefficient importance')
#plt.title('Coefficient importance and its variability')
plt.tight_layout()
plt.savefig('ols_feature_importance.pdf')
plt.subplots_adjust(left=.3)
![Box plot of OLS coefficient variability across CV folds](_images/model_comparison_50_0.png)
ElasticNet#
Standardise Features#
model = make_pipeline(StandardScaler(),
ElasticNet(max_iter=100000))
model.fit(X_train, y_train)
print(f'model score on training data: {model.score(X_train, y_train)}')
print(f'model score on testing data: {model.score(X_test, y_test)}')
model score on training data: 0.4727309935289348
model score on testing data: 0.43772822174451687
coefs = pd.DataFrame(
model[1].coef_,
columns=['Coefficients'], index=X_train.columns
)
coefs.plot(kind='barh', figsize=(9, 7))
plt.title('ElasticNet')
plt.axvline(x=0, color='.5')
plt.subplots_adjust(left=.3)
![Bar chart of ElasticNet coefficients, standardised features](_images/model_comparison_54_0.png)
Cross validate#
cv_model = cross_validate(
model, X,y,
cv=RepeatedKFold(n_splits=5, n_repeats=5, random_state=0),
return_estimator=True, n_jobs=2
)
coefs = pd.DataFrame(
[model[1].coef_
for model in cv_model['estimator']],
columns=X.columns
)
plt.figure(figsize=(9, 7))
sns.boxplot(data=coefs, orient='h', color='cyan', saturation=0.5)
plt.axvline(x=0, color='.5')
plt.xlabel('Coefficient importance')
plt.title('Coefficient importance and its variability')
plt.subplots_adjust(left=.3)
![Box plot of ElasticNet coefficient variability across CV folds](_images/model_comparison_56_0.png)
Feature Selection#
OMP#
model = make_pipeline(StandardScaler(),
OMP(n_nonzero_coefs = 2, normalize=False))
model.fit(X_train, y_train)
print(f'model score on training data: {model.score(X_train, y_train)}')
print(f'model score on testing data: {model.score(X_test, y_test)}')
model score on training data: 0.4992217993534437
model score on testing data: 0.4823928451338607
coefs = pd.DataFrame(
model[1].coef_,
columns=['Coefficients'], index=X_train.columns
)
coefs.plot(kind='barh', figsize=(9, 7))
plt.title('Orthogonal Matching Pursuit')
plt.axvline(x=0, color='.5')
plt.subplots_adjust(left=.3)
![Bar chart of Orthogonal Matching Pursuit coefficients](_images/model_comparison_60_0.png)
Score vs number of features#
num_features = [i+1 for i in range(X.shape[1])]
results_train, results_test = pd.DataFrame(), pd.DataFrame()
for k in num_features:
    model = make_pipeline(StandardScaler(),
                          OMP(n_nonzero_coefs=k, normalize=False))
    cv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=1)
    scores_train, scores_test = [], []
    for train_index, test_index in cv.split(X, y):
        model.fit(X.iloc[train_index], y.iloc[train_index])
        scores_test.append(model.score(X.iloc[test_index], y.iloc[test_index]))
        scores_train.append(model.score(X.iloc[train_index], y.iloc[train_index]))
    results_test[k] = scores_test
    results_train[k] = scores_train
fig = plt.figure(figsize=(8,5))
plt.plot(num_features, results_test.mean(), 'r-.', label='test')
plt.plot(num_features, results_train.mean(), 'b-.', label='train')
plt.xlabel('N variables')
plt.ylabel(r'$\bar{R^2}$')
plt.legend(loc='best')
#plt.savefig('rsq_features_omp.pdf')
plt.show()
![Train and test R-squared versus number of features, OMP](_images/model_comparison_63_0.png)
LARS#
model = make_pipeline(StandardScaler(),
Lars(n_nonzero_coefs = X.shape[1], normalize=False))
model.fit(X_train, y_train)
print(f'model score on training data: {model.score(X_train, y_train)}')
print(f'model score on testing data: {model.score(X_test, y_test)}')
model score on training data: 0.5440195223174749
model score on testing data: 0.533619399159655
coefs = pd.DataFrame(
model[1].coef_,
columns=['Coefficients'], index=X_train.columns
)
coefs.plot(kind='barh', figsize=(9, 7))
plt.title('Lars')
plt.axvline(x=0, color='.5')
plt.subplots_adjust(left=.3)
![Bar chart of Lars coefficients](_images/model_comparison_66_0.png)
Score vs number of features#
num_features = [i+1 for i in range(X.shape[1])]
results_train, results_test = pd.DataFrame(), pd.DataFrame()
for k in num_features:
    model = make_pipeline(StandardScaler(),
                          Lars(n_nonzero_coefs=k, normalize=False))
    cv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=1)
    scores_train, scores_test = [], []
    for train_index, test_index in cv.split(X, y):
        model.fit(X.iloc[train_index], y.iloc[train_index])
        scores_test.append(model.score(X.iloc[test_index], y.iloc[test_index]))
        scores_train.append(model.score(X.iloc[train_index], y.iloc[train_index]))
    results_test[k] = scores_test
    results_train[k] = scores_train
fig = plt.figure(figsize=(8,5))
plt.plot(num_features, results_test.mean(), 'r-.', label='test')
plt.plot(num_features, results_train.mean(), 'b-.', label='train')
plt.xlabel('N variables')
plt.ylabel(r'$\bar{R^2}$')
plt.legend(loc='best')
#plt.savefig('rsq_features_lars.pdf')
plt.show()
![Train and test R-squared versus number of features, Lars](_images/model_comparison_69_0.png)
Mutual information#
mi = mutual_info_regression(X_train,y_train)
mi_df = pd.DataFrame(
mi,
columns=['Mutual Information'], index=X_train.columns
)
mi_df
| Feature | Mutual Information |
|---|---|
| 111_111_offered | 0.368889 |
| 111_111_answered | 0.342691 |
| amb_sys_made | 0.618272 |
| amb_sys_answered | 0.716384 |
| gp_appt_available | 0.087838 |
| population | 1.394764 |
| People | 1.275716 |
| Places | 1.387013 |
| Lives | 1.317528 |
| year | 0.032590 |
| rand1 | 0.038655 |
Cross validated#
num_features = [i+1 for i in range(X.shape[1])]
results_train, results_test = pd.DataFrame(), pd.DataFrame()
for k in num_features:
    fs = SelectKBest(score_func=mutual_info_regression, k=k)
    model = make_pipeline(fs, LinearRegression())
    cv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=1)
    scores_train, scores_test = [], []
    for train_index, test_index in cv.split(X, y):
        model.fit(X.iloc[train_index], y.iloc[train_index])
        scores_test.append(model.score(X.iloc[test_index], y.iloc[test_index]))
        scores_train.append(model.score(X.iloc[train_index], y.iloc[train_index]))
    results_test[k] = scores_test
    results_train[k] = scores_train
    print('>%d %.3f (%.3f)' % (k, np.mean(scores_test), np.std(scores_test)))
>1 0.077 (0.086)
>2 0.165 (0.066)
>3 0.496 (0.036)
>4 0.501 (0.035)
>5 0.502 (0.035)
>6 0.509 (0.035)
>7 0.524 (0.034)
>8 0.531 (0.033)
>9 0.531 (0.033)
>10 0.534 (0.034)
>11 0.534 (0.035)
fig,ax = plt.subplots(figsize=(8,5))
plt.plot(num_features, results_test.mean(), 'r-.', label='test')
plt.plot(num_features, results_train.mean(), 'b-.', label='train')
plt.fill_between(num_features, y1=(results_test.mean()-results_test.std()).values,
y2=(results_test.mean()+results_test.std()).values, alpha=0.2, color='r')
plt.fill_between(num_features, y1=results_train.mean()-results_train.std(),
y2=results_train.mean()+results_train.std(), alpha=0.2, color='b')
plt.xlabel('N variables')
plt.ylabel(r'$\bar{R^2}$')
plt.legend(loc='best')
#plt.savefig('rsq_features_mi.pdf')
plt.show()
![Train and test R-squared versus number of features, mutual information selection](_images/model_comparison_75_0.png)
Average importance#
fs = SelectKBest(score_func=mutual_info_regression, k='all')
fs.fit(X,y)
mi_df = pd.DataFrame(
fs.scores_,
columns=['Scores'], index=X.columns
)
mi_df.sort_values(by='Scores', ascending=False)
| Feature | Scores |
|---|---|
| Places | 1.436297 |
| population | 1.436055 |
| Lives | 1.391061 |
| People | 1.351514 |
| amb_sys_answered | 0.692218 |
| amb_sys_made | 0.628977 |
| 111_111_offered | 0.383233 |
| 111_111_answered | 0.338584 |
| gp_appt_available | 0.096549 |
| year | 0.021728 |
| rand1 | 0.007324 |
Summary#
- All models perform comparably well, with a test \(R^2\) of ~0.53.
- The random feature is supported when using recursive feature elimination.
- When using the mutual information score, the random feature is excluded; population and the measures of population health are the most important features.
- Mutual information accounts for non-linear relationships, suggesting a non-linear model may be more appropriate (see the sketch below).
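As a quick follow-up to that last point (a sketch, not part of the original analysis; `RandomForestRegressor` is imported above but otherwise unused), a cross-validated random-forest baseline with default hyperparameters could be compared against the linear scores:

# Sketch: non-linear baseline, evaluated with the same repeated
# k-fold scheme used for the linear models above
rf = RandomForestRegressor(random_state=0)
cv_rf = cross_validate(
    rf, X, y,
    cv=RepeatedKFold(n_splits=5, n_repeats=5, random_state=0),
    n_jobs=2
)
print(f"random forest mean test R^2: {cv_rf['test_score'].mean():.3f}"
      f" (+/- {cv_rf['test_score'].std():.3f})")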