Contents

Mixed Linear Model

Contents

Mixed Linear Model#

import warnings
warnings.filterwarnings('ignore')

Import libraries#

import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from statsmodels.regression.mixed_linear_model import MixedLM
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedKFold

import statsmodels.api as sm
import scipy.stats as stats

dta = pd.read_csv('https://raw.githubusercontent.com/CharlotteJames/ed-forecast/main/data/master_scaled_new.csv',
                  index_col=0)

dta.columns = ['_'.join([c.split('/')[0],c.split('/')[-1]]) if '/' in c else c for c in dta.columns]

dta.ccg.unique().shape

(74,)

Add random feature#

# Adding random features

rng = np.random.RandomState(0)
rand_var = rng.rand(dta.shape[0])
dta['rand1'] = rand_var

dta.shape

(1618, 14)

Train test split#

train, test = train_test_split(dta,random_state=29)

y_train = train['ae_attendances_attendances'].values
X_train = train.drop(['ae_attendances_attendances','ccg','month'], axis=1)

y_test = test['ae_attendances_attendances'].values
X_test = test.drop(['ae_attendances_attendances','ccg','month'], axis=1)

Model#

Cross-validate#

cv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=1)
    
scores_train, scores_test = [],[]

y = dta['ae_attendances_attendances']
X = dta.drop(['ae_attendances_attendances','month', 'year'], axis=1)
    
for train_index, test_index in cv.split(X, y):
    
    model = MixedLM(endog=y.iloc[train_index].values, 
                    exog = X.iloc[train_index].drop(['ccg'],axis=1).values,
                    groups=X.iloc[train_index].ccg.values)

    ml_fit = model.fit()
    
    y_pred_test = model.predict(ml_fit.fe_params, 
                                 exog=X.iloc[test_index].drop(['ccg'],axis=1).values)
    y_pred_train = model.predict(ml_fit.fe_params, 
                                  exog=X.iloc[train_index].drop(['ccg'],axis=1).values)
    
    scores_test.append(r2_score(y.iloc[test_index],y_pred_test))
    scores_train.append(r2_score(y.iloc[train_index],y_pred_train))

res=pd.DataFrame()
res['test_score'] = scores_test
res['train_score'] = scores_train

res.describe()

	test_score	train_score
count	25.000000	25.000000
mean	0.092296	0.096086
std	0.052074	0.057714
min	-0.060924	-0.071581
25%	0.071561	0.082681
50%	0.099458	0.108742
75%	0.119344	0.126630
max	0.160848	0.188843

Coefficients#

model = MixedLM(endog=y, exog = X.drop(['ccg'],axis=1).values, 
                groups=X.ccg.values)


ml_fit = model.fit()

ml_fit.summary()

Model:	MixedLM	Dependent Variable:	ae_attendances_attendances
No. Observations:	1618	Method:	REML
No. Groups:	74	Scale:	2285.2604
Min. group size:	12	Log-Likelihood:	-8795.4714
Max. group size:	24	Converged:	Yes
Mean group size:	21.9

	Coef.	Std.Err.	z	P>\|z\|	[0.025	0.975]
x1	0.348	0.069	5.059	0.000	0.213	0.483
x2	-0.324	0.077	-4.190	0.000	-0.476	-0.172
x3	-0.062	0.074	-0.838	0.402	-0.206	0.083
x4	0.086	0.100	0.863	0.388	-0.110	0.283
x5	0.014	0.002	5.805	0.000	0.009	0.019
x6	-1.392	0.641	-2.170	0.030	-2.649	-0.135
x7	-5.477	1.295	-4.231	0.000	-8.014	-2.940
x8	4.230	2.723	1.554	0.120	-1.107	9.566
x9	5.912	2.998	1.972	0.049	0.037	11.787
x10	6.558	4.167	1.574	0.116	-1.609	14.724
Group Var	67340.324	262.242

Residuals#

fig = plt.figure(figsize = (8, 5))


ax = sns.distplot(ml_fit.resid, hist = False, kde_kws = {"shade" : True, "lw": 1}, fit = stats.norm)

ax.set_title("KDE Plot of Model Residuals (Blue) and Normal Distribution (Black)")
ax.set_xlabel("Residuals")
plt.show()

_images/multi_level_model_21_0.png

Q-Q Plot#

fig = plt.figure(figsize = (8, 5))
ax = fig.add_subplot(111)

sm.qqplot(ml_fit.resid, dist = stats.norm, line = 's', ax = ax)

ax.set_title("Q-Q Plot")

plt.show()

_images/multi_level_model_23_0.png

Residuals by CCG#

fig = plt.figure(figsize = (15, 5))

ax = sns.boxplot(x = ml_fit.model.groups, y = ml_fit.resid)

ax.set_title("Distribution of Residuals for ED attendances by CCG ")
ax.set_ylabel("Residuals", fontsize=12)
ax.set_xlabel("CCG", fontsize=12)
plt.xticks(rotation=70)
plt.show()

_images/multi_level_model_25_0.png