{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "from patsy import ContrastMatrix\n", "from sklearn.model_selection import train_test_split\n", "import statsmodels.api as sm\n", "from statsmodels.graphics.mosaicplot import mosaic\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Read the raw table (presumably the UCI Adult extract -- confirm provenance)\n", "# and keep only the columns listed in `cols`.\n", "data = pd.read_csv('./data/adult.data')\n", "cols = ['workclass', 'sex', 'age', 'education_num', 'capital_gain',\n", " 'capital_loss', 'hours_per_week', 'label']\n", "# .copy() makes the subset an independent frame, so adding a column below\n", "# does not depend on pandas' chained-assignment (view vs. copy) tracking.\n", "data = data[cols].copy()\n", "# Integer code per distinct label value; with no explicit categories,\n", "# pd.Categorical numbers the sorted unique values starting at 0.\n", "data['label_code'] = pd.Categorical(data.label).codes" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| label | \n", "<=50K | \n", ">50K | \n", "
|---|---|---|
| sex | \n", "\n", " | \n", " |
| Female | \n", "9592 | \n", "1179 | \n", "
| Male | \n", "15128 | \n", "6662 | \n", "