# Statsmodels introduction
# Introducing different discrete-choice models using statsmodels.
# Load Guerry's 1833 "moral statistics" of France from the R HistData package.
df = sm.datasets.get_rdataset("Guerry", "HistData")
type(df), type(df.data)  # the Dataset wrapper vs the underlying DataFrame
df = df.data
df.head()
# Keep only the columns used in the regressions below.
# NOTE: renamed from `vars` — that shadowed the `vars` builtin.
cols = ['Department', 'Lottery', 'Literacy', 'Wealth', 'Region']
df = df[cols]
df.shape
df[-5:]
df.isna().sum()
# Drop rows with missing values (Region is NaN for Corsica).
df = df.dropna()
df.shape, df.dtypes
df['Region'].value_counts()
# Build design matrices from a patsy formula; Region (categorical) is
# automatically dummy-encoded and an Intercept column is added.
y, X = patsy.dmatrices('Lottery ~ Literacy + Wealth + Region', data=df, return_type='dataframe')
y[:3], X[:3]
mod = sm.OLS(y, X)  # describe the model
res = mod.fit()     # estimate it
res.summary()
res.params
res.rsquared
# Rainbow test for linearity: returns (F statistic, p-value).
sm.stats.linear_rainbow(res)
# Partial-regression plot of Lottery on Wealth, conditioning on the others.
# (The stray `....:` IPython continuation marker was removed — it was a
# transcription artifact and a syntax error in a plain script.)
sm.graphics.plot_partregress('Lottery', 'Wealth', ['Region', 'Literacy'],
                             data=df, obs_labels=False)
# Spector & Mazzeo's program-effectiveness data as plain numpy arrays,
# with an intercept column appended at the END of the design matrix.
spector_data = sm.datasets.spector.load(as_pandas=False)
spector_data.exog = sm.add_constant(spector_data.exog, prepend=False)
print(spector_data.exog[:5,:])
print(spector_data.endog[:5])

# The same dataset once more, this time as a pandas DataFrame.
df = sm.datasets.spector.load(as_pandas=True).data
df[:5], type(df)

# Linear probability model: plain OLS of the binary outcome on the regressors.
lpm_model = sm.OLS(spector_data.endog, spector_data.exog)
lpm_res = lpm_model.fit()
print('Parameters: ', lpm_res.params[:-1])

# Refit through a patsy formula; the Intercept column now comes FIRST.
y, X = patsy.dmatrices('GRADE ~ GPA + TUCE + PSI', df)
lpm_formula_model = sm.OLS(y, X)
res2 = lpm_formula_model.fit()
res2.params[1:]
def test_eq(pred, targ):
diff = np.sum(pred - targ)
assert diff < 1e-10
# The slope coefficients must agree between the array fit (const last,
# so drop the final entry) and the formula fit (Intercept first, so
# drop the first entry).
test_eq(lpm_res.params[:-1], res2.params[1:])

# Logistic regression on the same arrays; disp=0 silences the optimizer.
logit_mod = sm.Logit(spector_data.endog, spector_data.exog)
logit_res = logit_mod.fit(disp=0)
print('Parameters: ', logit_res.params)

# The same logit, fit from the patsy design matrices.
logit_formula_model = sm.Logit(y, X)
logit_formula_res = logit_formula_model.fit()
logit_formula_res.params
test_eq(logit_res.params[:-1], logit_formula_res.params[1:])

# sm.Logit does not accept a formula string directly:
# logit3 = sm.Logit('GRADE ~ GPA + TUCE + PSI', df)

# Average marginal effects of the regressors on P(GRADE=1).
marg = logit_formula_res.get_margeff()
marg.summary()
# Probit model on the same design: identical interface, normal CDF link.
probit_model = sm.Probit(spector_data.endog, spector_data.exog)
probit_res = probit_model.fit()
print('Parameters: ', probit_res.params)
# Average marginal effects — comparable across logit/probit even though
# the raw coefficients are on different scales.
probit_margeff = probit_res.get_margeff()
print('Marginal effects: ')
print(probit_margeff.summary())
# American National Election Studies 1996: multinomial outcome (party id).
anes_data = sm.datasets.anes96.load(as_pandas=False)
anes_exog = sm.add_constant(anes_data.exog, prepend=False)
print(anes_data.exog[:5,:])
print(anes_data.endog[:5])

# Multinomial logit: one coefficient column per non-reference outcome level.
mnlogit_model = sm.MNLogit(anes_data.endog, anes_exog)
mnlogit_res = mnlogit_model.fit()
print(mnlogit_res.params)
mnlogit_res.summary()
# Fair's extramarital-affairs survey; binarize the affairs count into
# a 0/1 indicator for "had at least one affair".
dta = sm.datasets.fair.load_pandas().data
dta['affair'] = (dta['affairs'] > 0).astype(float)
print(dta.head(10))

# Formula-API logit of the affair indicator on demographic/marriage covariates.
affair_formula = ("affair ~ occupation + educ + occupation_husb"
                 "+ rate_marriage + age + yrs_married + children"
                 " + religious")
affair_mod = logit(affair_formula, dta).fit()
affair_mod.summary()
# Class balance of the outcome.
dta.shape, dta.affair.value_counts()
def acc(cfmtx):
    """Classification accuracy from a confusion matrix.

    Generalized from the original 2x2-only version: the diagonal sum is
    taken with ``np.trace``, so any k-by-k confusion matrix works. For a
    2x2 input this returns exactly ``(cfmtx[0,0] + cfmtx[1,1]) / total``,
    i.e. the original behavior.

    Args:
        cfmtx: array-like square confusion matrix (rows = actual,
            columns = predicted).

    Returns:
        float: correct predictions (diagonal) divided by total count.
    """
    cfmtx = np.asarray(cfmtx)
    return np.trace(cfmtx) / np.sum(cfmtx)
# How well are we predicting? pred_table() is the confusion matrix at
# the 0.5 threshold; acc() turns it into an accuracy score.
affair_mod.pred_table(), acc(affair_mod.pred_table())
# Average marginal effects for the affair logit.
marginal_fx = affair_mod.get_margeff()
marginal_fx.summary()
# Compare the two link functions: logistic CDF (logit) vs normal CDF (probit).
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111)
grid = np.linspace(-6, 6, 1000)
ax.plot(grid, stats.logistic.cdf(grid), 'r-', label='Logistic')
ax.plot(grid, stats.norm.cdf(grid), label='Probit')
ax.legend();
# Same comparison for the densities: logistic pdf has heavier tails than
# the normal pdf underlying the probit link.
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111)
grid = np.linspace(-6, 6, 1000)
ax.plot(grid, stats.logistic.pdf(grid), 'r-', label='Logistic')
ax.plot(grid, stats.norm.pdf(grid), label='Probit')
ax.legend();