
Discrete Choice Models

Fair's Affair data

A 1974 Redbook survey asked women (and only women) about extramarital affairs.

python
%matplotlib inline
python
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy import stats
from statsmodels.formula.api import logit
python
print(sm.datasets.fair.SOURCE)
python
print(sm.datasets.fair.NOTE)
python
dta = sm.datasets.fair.load_pandas().data
python
dta["affair"] = (dta["affairs"] > 0).astype(float)
print(dta.head(10))
python
print(dta.describe())
python
affair_mod = logit(
    "affair ~ occupation + educ + occupation_husb + rate_marriage"
    " + age + yrs_married + children + religious",
    dta,
).fit()
python
print(affair_mod.summary())

How well are we predicting?

python
affair_mod.pred_table()
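
The prediction table has actual outcomes on the rows and model predictions (at a 0.5 threshold) on the columns, so the overall fraction classified correctly is the sum of the diagonal over the total count:

python
# Fraction correctly classified: diagonal of the 2x2 table over the total.
table = affair_mod.pred_table()
print(table.diagonal().sum() / table.sum())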

The coefficients of a discrete choice model are hard to interpret directly; what we are after are the marginal effects.

python
mfx = affair_mod.get_margeff()
print(mfx.summary())
python
respondent1000 = dta.iloc[1000]
print(respondent1000)
python
resp = dict(
    zip(
        range(1, 9),
        respondent1000[
            [
                "occupation",
                "educ",
                "occupation_husb",
                "rate_marriage",
                "age",
                "yrs_married",
                "children",
                "religious",
            ]
        ].tolist(),
    )
)
# Position 0 in the design matrix is the Intercept, which is always 1.
resp.update({0: 1})
print(resp)
python
mfx = affair_mod.get_margeff(atexog=resp)
print(mfx.summary())

predict expects a DataFrame since patsy is used to select columns.

python
respondent1000 = dta.iloc[[1000]]
affair_mod.predict(respondent1000)
python
affair_mod.fittedvalues[1000]
python
affair_mod.model.cdf(affair_mod.fittedvalues[1000])
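
For a Logit model, fittedvalues holds the linear predictor (exog times the estimated parameters), so applying the model's CDF to it should reproduce the probabilities from predict. A quick consistency check (a sketch, not part of the analysis):

python
# The logistic CDF maps the linear predictor back to probabilities,
# which should match predict() on the original data.
np.allclose(affair_mod.model.cdf(affair_mod.fittedvalues), affair_mod.predict(dta))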

The "correct" model here is likely the Tobit model. We have an work in progress branch "tobit-model" on github, if anyone is interested in censored regression models.

Exercise: Logit vs Probit

python
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111)
support = np.linspace(-6, 6, 1000)
ax.plot(support, stats.logistic.cdf(support), "r-", label="Logistic")
ax.plot(support, stats.norm.cdf(support), label="Probit")
ax.legend()
python
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111)
support = np.linspace(-6, 6, 1000)
ax.plot(support, stats.logistic.pdf(support), "r-", label="Logistic")
ax.plot(support, stats.norm.pdf(support), label="Probit")
ax.legend()

Compare the estimates of the Logit Fair model above to a Probit model. Does the prediction table look better? Much difference in marginal effects?
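
A minimal sketch of the comparison, reusing the same formula with probit from statsmodels.formula.api:

python
from statsmodels.formula.api import probit

# Same specification as the Logit model above, with a probit link.
probit_mod = probit(
    "affair ~ occupation + educ + occupation_husb + rate_marriage"
    " + age + yrs_married + children + religious",
    dta,
).fit()
print(probit_mod.pred_table())
print(probit_mod.get_margeff().summary())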

Generalized Linear Model Example

python
print(sm.datasets.star98.SOURCE)
python
print(sm.datasets.star98.DESCRLONG)
python
print(sm.datasets.star98.NOTE)
python
dta = sm.datasets.star98.load_pandas().data
print(dta.columns)
python
print(
    dta[
        ["NABOVE", "NBELOW", "LOWINC", "PERASIAN", "PERBLACK", "PERHISP", "PERMINTE"]
    ].head(10)
)
python
print(
    dta[
        ["AVYRSEXP", "AVSALK", "PERSPENK", "PTRATIO", "PCTAF", "PCTCHRT", "PCTYRRND"]
    ].head(10)
)
python
formula = "NABOVE + NBELOW ~ LOWINC + PERASIAN + PERBLACK + PERHISP + PCTCHRT "
formula += "+ PCTYRRND + PERMINTE*AVYRSEXP*AVSALK + PERSPENK*PTRATIO*PCTAF"
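
Note that in patsy, a*b expands to a + b + a:b, so each triple product above contributes all main effects and all interactions. A small illustration using a hypothetical two-column frame:

python
from patsy import dmatrix

# Hypothetical data, just to show how "*" expands into design-matrix columns.
demo = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})
print(dmatrix("a*b", demo).design_info.column_names)
# ['Intercept', 'a', 'b', 'a:b']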

Aside: Binomial distribution

Toss a six-sided die 5 times. What is the probability of exactly 2 fours?

python
stats.binom(5, 1.0 / 6).pmf(2)
python
from scipy.special import comb

comb(5, 2) * (1 / 6.0) ** 2 * (5 / 6.0) ** 3
python
from statsmodels.formula.api import glm

glm_mod = glm(formula, dta, family=sm.families.Binomial()).fit()
python
print(glm_mod.summary())

The number of trials for each observation is the row sum of the endog columns (NABOVE + NBELOW); multiplying the fitted proportions by it converts them back to counts:

python
glm_mod.model.data.orig_endog.sum(1)
python
glm_mod.fittedvalues * glm_mod.model.data.orig_endog.sum(1)

First differences: We hold all explanatory variables constant at their means and manipulate the percentage of low-income households to assess its impact on the response variables:

python
exog = glm_mod.model.data.orig_exog  # get the dataframe
python
means25 = exog.mean()
print(means25)
python
means25["LOWINC"] = exog["LOWINC"].quantile(0.25)
print(means25)
python
means75 = exog.mean()
means75["LOWINC"] = exog["LOWINC"].quantile(0.75)
print(means75)

Again, predict expects a DataFrame since patsy is used to select columns.

python
resp25 = glm_mod.predict(pd.DataFrame(means25).T)
resp75 = glm_mod.predict(pd.DataFrame(means75).T)
diff = resp75 - resp25

The interquartile first difference for the percentage of low-income households in a school district is:

python
print("%2.4f%%" % (diff[0] * 100))
python
nobs = glm_mod.nobs
y = glm_mod.model.endog
yhat = glm_mod.mu
python
from statsmodels.graphics.api import abline_plot

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, ylabel="Observed Values", xlabel="Fitted Values")
ax.scatter(yhat, y)
# Overlay the least-squares line of observed values on fitted values.
y_vs_yhat = sm.OLS(y, sm.add_constant(yhat, prepend=True)).fit()
fig = abline_plot(model_results=y_vs_yhat, ax=ax)

Plot fitted values vs Pearson residuals

Pearson residuals are defined to be

$$\frac{y - \mu}{\sqrt{\operatorname{var}(\mu)}}$$

where var is typically determined by the family. E.g., binomial variance is $np(1 - p)$
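
A toy, hand-computed example of the definition (an illustration only, not statsmodels internals), using counts so the variance is $np(1 - p)$:

python
# Pearson residual for one binomial observation, computed by hand:
# n = 20 trials, fitted probability p = 0.3, observed count y = 8.
n, p, y_count = 20, 0.3, 8
mu = n * p  # expected count
print((y_count - mu) / np.sqrt(n * p * (1 - p)))  # (y - mu) / sqrt(var(mu))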

python
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(
    111,
    title="Residual Dependence Plot",
    xlabel="Fitted Values",
    ylabel="Pearson Residuals",
)
ax.scatter(yhat, stats.zscore(glm_mod.resid_pearson))
ax.axis("tight")
ax.plot([0.0, 1.0], [0.0, 0.0], "k-")

Histogram of standardized deviance residuals with a kernel density estimate overlaid

The definition of the deviance residuals depends on the family. For the Binomial distribution this is

$$r_{dev} = \operatorname{sign}\left(Y-\mu\right)\sqrt{2n\left(Y\log\frac{Y}{\mu}+(1-Y)\log\frac{1-Y}{1-\mu}\right)}$$

They can be used to detect ill-fitting covariates.
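
A hand evaluation of the formula for a single observation (an illustration only, with Y and mu taken as proportions and n as the number of trials):

python
# Toy deviance residual for one binomial observation with n = 20 trials,
# observed proportion Y = 0.4, and fitted proportion mu = 0.3:
n, Y, mu = 20, 0.4, 0.3
r_dev = np.sign(Y - mu) * np.sqrt(
    2 * n * (Y * np.log(Y / mu) + (1 - Y) * np.log((1 - Y) / (1 - mu)))
)
print(r_dev)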

python
resid = glm_mod.resid_deviance
resid_std = stats.zscore(resid)
kde_resid = sm.nonparametric.KDEUnivariate(resid_std)
kde_resid.fit()
python
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, title="Standardized Deviance Residuals")
ax.hist(resid_std, bins=25, density=True)
ax.plot(kde_resid.support, kde_resid.density, "r")

QQ-plot of deviance residuals

python
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111)
fig = sm.graphics.qqplot(resid, line="r", ax=ax)