{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "harmful-study",
   "metadata": {},
   "source": [
    "(heckman_code)=\n",
    "\n",
    "# Heckman Code\n",
    "\n",
    "The python code generating the toy data for the figures above is given\n",
    "below. This version examines Case 4.[^fn1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "challenging-swaziland",
   "metadata": {},
   "outputs": [],
   "source": [
    "# start a connected stata17 session\n",
    "from pystata import config\n",
    "config.init('be')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "handmade-password",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>y</th>\n",
       "      <th>z</th>\n",
       "      <th>x</th>\n",
       "      <th>w</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-2.174201</td>\n",
       "      <td>1.0</td>\n",
       "      <td>-0.034267</td>\n",
       "      <td>0.068906</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-2.378358</td>\n",
       "      <td>1.0</td>\n",
       "      <td>-0.765331</td>\n",
       "      <td>0.495672</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>-0.952739</td>\n",
       "      <td>-1.519415</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-2.380697</td>\n",
       "      <td>1.0</td>\n",
       "      <td>-1.453156</td>\n",
       "      <td>-0.074578</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.254649</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.180989</td>\n",
       "      <td>0.760963</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          y    z         x         w\n",
       "0 -2.174201  1.0 -0.034267  0.068906\n",
       "1 -2.378358  1.0 -0.765331  0.495672\n",
       "2       NaN  0.0 -0.952739 -1.519415\n",
       "3 -2.380697  1.0 -1.453156 -0.074578\n",
       "4  0.254649  1.0  0.180989  0.760963"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "# true parameters\n",
    "rho_t = np.array([0.8])\n",
    "rho_x_w_t = np.array([0.8])\n",
    "gamma_t = np.array([.5,1.0])\n",
    "beta_t = np.array([-2.0,0.5])\n",
    "sigma_e_t = np.array([1.0])\n",
    "N =5000\n",
    "\n",
    "# generate toy data consistent with heckman:\n",
    "# generate potentially correlated x,w data\n",
    "mean_x_w = np.array([0,0])\n",
    "cov_x_w = np.array([[1,rho_x_w_t[0]],[rho_x_w_t[0], 1]])\n",
    "w, x = np.random.multivariate_normal(mean_x_w, cov_x_w, N).T\n",
    "\n",
    "# add constant to first position and convert to DataFrame\n",
    "w_ = pd.DataFrame(np.c_[np.ones(N),w],columns=['Constant (Selection)','Slope (Selection)'])\n",
    "x_ = pd.DataFrame(np.c_[np.ones(N),x], columns=['Constant','Slope'])\n",
    "\n",
    "# generate errors \n",
    "mean_u_eps = np.array([0,0])\n",
    "cov_u_eps = np.array([[1,rho_t[0]],[rho_t[0],sigma_e_t[0]]])\n",
    "u, epsilon = np.random.multivariate_normal(mean_u_eps, cov_u_eps, N).T\n",
    "\n",
    "# generate latent zstar\n",
    "zstar = w_.dot(gamma_t) + u\n",
    "# generate observed z (indicator=1 if zstar is positive)\n",
    "z = zstar > 0  \n",
    "\n",
    "# generate latent ystar\n",
    "ystar = x_.dot(beta_t) + epsilon\n",
    "y=ystar.copy()\n",
    "# generate observed y [if z=0, set y to NaN]\n",
    "y[~z] = np.nan\n",
    "\n",
    "stata_data = pd.DataFrame(np.c_[y,z,x,w], columns=['y','z','x','w'])\n",
    "stata_data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "superb-nashville",
   "metadata": {},
   "source": [
    "We can estimate a Two-Step Heckman Model in Python using an [unmerged\n",
    "branch](https://github.com/statsmodels/statsmodels/blob/92ea62232fd63c7b60c60bee4517ab3711d906e3/statsmodels/regression/heckman.py)\n",
    "from StatsModels (this replicates the Stata two-step results)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "smaller-turkey",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "       Heckman Regression Results      \n",
      "=======================================\n",
      "Dep. Variable:                        y\n",
      "Model:                          Heckman\n",
      "Method:                Heckman Two-Step\n",
      "Date:                  Wed, 12 May 2021\n",
      "Time:                          12:19:08\n",
      "No. Total Obs.:                    5000\n",
      "No. Censored Obs.:                 1742\n",
      "No. Uncensored Obs.:               3258\n",
      "==============================================================================\n",
      "                 coef    std err          z      P>|z|      [0.025      0.975]\n",
      "------------------------------------------------------------------------------\n",
      "Constant      -2.0018      0.035    -57.110      0.000      -2.071      -1.933\n",
      "Slope          0.5187      0.023     22.543      0.000       0.474       0.564\n",
      "========================================================================================\n",
      "                           coef    std err          z      P>|z|      [0.025      0.975]\n",
      "----------------------------------------------------------------------------------------\n",
      "Constant (Selection)     0.5567      0.022     25.017      0.000       0.513       0.600\n",
      "Slope (Selection)        0.9959      0.028     35.566      0.000       0.941       1.051\n",
      "================================================================================\n",
      "                   coef    std err          z      P>|z|      [0.025      0.975]\n",
      "--------------------------------------------------------------------------------\n",
      "IMR (Lambda)     0.8110      0.061     13.265      0.000       0.691       0.931\n",
      "=====================================\n",
      "rho:                            0.826\n",
      "sigma:                          0.982\n",
      "=====================================\n",
      "\n",
      "First table are the estimates for the regression (response) equation.\n",
      "Second table are the estimates for the selection equation.\n",
      "Third table is the estimate for the coef of the inverse Mills ratio (Heckman's Lambda).\n"
     ]
    }
   ],
   "source": [
    "import heckman as heckman\n",
    "res = heckman.Heckman(y, x_, w_).fit(method='twostep')\n",
    "print(res.summary())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "scenic-savannah",
   "metadata": {},
   "source": [
    "And in Stata, we can estimate the Full Information Maximum Likelihood\n",
    "model over the toy dataset as"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "geographic-trash",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Iteration 0:   log likelihood = -6328.3022  \n",
      "Iteration 1:   log likelihood = -6328.1976  \n",
      "Iteration 2:   log likelihood = -6328.1976  \n",
      "\n",
      "Heckman selection model                         Number of obs     =      5,000\n",
      "(regression model with sample selection)              Selected    =      3,258\n",
      "                                                      Nonselected =      1,742\n",
      "\n",
      "                                                Wald chi2(1)      =     852.59\n",
      "Log likelihood = -6328.198                      Prob > chi2       =     0.0000\n",
      "\n",
      "------------------------------------------------------------------------------\n",
      "             | Coefficient  Std. err.      z    P>|z|     [95% conf. interval]\n",
      "-------------+----------------------------------------------------------------\n",
      "y            |\n",
      "           x |   .5220726   .0178798    29.20   0.000     .4870289    .5571163\n",
      "       _cons |  -2.005327   .0219433   -91.39   0.000    -2.048335   -1.962319\n",
      "-------------+----------------------------------------------------------------\n",
      "z            |\n",
      "           w |    .986842   .0260142    37.93   0.000     .9358551    1.037829\n",
      "       _cons |   .5533477   .0218632    25.31   0.000     .5104965    .5961988\n",
      "-------------+----------------------------------------------------------------\n",
      "     /athrho |   1.189239   .0599024    19.85   0.000     1.071832    1.306646\n",
      "    /lnsigma |  -.0171968   .0157981    -1.09   0.276    -.0481604    .0137668\n",
      "-------------+----------------------------------------------------------------\n",
      "         rho |   .8303427   .0186016                      .7901506    .8634242\n",
      "       sigma |   .9829502   .0155287                      .9529809    1.013862\n",
      "      lambda |   .8161856   .0281179                      .7610754    .8712957\n",
      "------------------------------------------------------------------------------\n",
      "LR test of indep. eqns. (rho = 0): chi2(1) = 268.10       Prob > chi2 = 0.0000\n"
     ]
    }
   ],
   "source": [
    "%%stata -d stata_data\n",
    "heckman y x, select(z=w)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "maritime-mexican",
   "metadata": {},
   "source": [
    "[^fn1]: Note, we are using Stata 17 Jupyter Notebook integration using the `%%stata` magic."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}