{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# Load the data with pandas\n", "import pandas as pd\n", "\n", "\n", "# Silence the pandas chained-assignment warning globally\n", "pd.options.mode.chained_assignment = None\n", "data_path = \"./data/adult.data\"\n", "raw_data = pd.read_csv(data_path)\n", "# Keep only the columns used in this notebook.\n", "# Take an explicit copy so that columns added to data later go to an independent frame.\n", "cols = [\"age\", \"education_num\", \"capital_gain\", \"capital_loss\", \"hours_per_week\", \"label\"]\n", "data = raw_data[cols].copy()" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ageeducation_numcapital_gaincapital_losshours_per_weeklabellabel_code
039132174040<=50K0
150130013<=50K0
23890040<=50K0
35370040<=50K0
428130040<=50K0
\n", "
" ], "text/plain": [ " age education_num capital_gain capital_loss hours_per_week label \\\n", "0 39 13 2174 0 40 <=50K \n", "1 50 13 0 0 13 <=50K \n", "2 38 9 0 0 40 <=50K \n", "3 53 7 0 0 40 <=50K \n", "4 28 13 0 0 40 <=50K \n", "\n", " label_code \n", "0 0 \n", "1 0 \n", "2 0 \n", "3 0 \n", "4 0 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Encode the label column as integer category codes so it can be used numerically\n", "data.loc[:, \"label_code\"] = pd.Categorical(data[\"label\"]).codes\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Split the data into a training set and a test set\n", "from sklearn.model_selection import train_test_split\n", "\n", "\n", "# 20% of the rows go to the test set; the fixed seed keeps the split repeatable\n", "train_set, test_set = train_test_split(\n", "    data,\n", "    test_size=0.2,\n", "    random_state=2310,\n", ")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. 
Specify a solver to silence this warning.\n", " FutureWarning)\n" ] }, { "data": { "text/plain": [ "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", " intercept_scaling=1, max_iter=100, multi_class='warn',\n", " n_jobs=None, penalty='l2', random_state=None, solver='warn',\n", " tol=0.0001, verbose=0, warm_start=False)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Train the model: logistic regression on the numeric feature columns\n", "from sklearn.linear_model import LogisticRegression\n", "\n", "\n", "features = [\"age\", \"education_num\", \"capital_gain\", \"capital_loss\", \"hours_per_week\"]\n", "label = \"label_code\"\n", "model = LogisticRegression()\n", "model.fit(train_set[features], train_set[label])" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Score the test set: keep column 1 of predict_proba as the predicted probability\n", "test_set.loc[:, \"prob\"] = model.predict_proba(test_set[features])[:, 1]\n", "# Decision threshold applied to the predicted probability\n", "alpha = 0.5\n", "# Vectorised threshold (1 when prob > alpha, else 0) instead of a row-wise apply\n", "test_set.loc[:, \"pred\"] = (test_set[\"prob\"] > alpha).astype(\"int64\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "查准率: 0.728, 查全率: 0.333, f1: 0.457\n" ] } ], "source": [ "# Compute precision, recall and F1 from a 2x2 table of (true label, prediction)\n", "import numpy as np\n", "\n", "\n", "# Two bins, [0, 0.5) and [0.5, 1], separate the 0 and 1 values on each axis\n", "bins = np.array([0, 0.5, 1])\n", "# Row-major flatten of the table: rows are true labels, columns are predictions\n", "tn, fp, fn, tp = np.histogram2d(test_set[label], test_set[\"pred\"], bins=bins)[0].flatten()\n", "precision = tp / (tp + fp) # 0.728\n", "recall = tp / (tp + fn) # 0.333\n", "f1 = 2 * precision * recall / (precision + recall) # 0.457\n", "print(\"查准率: %.3f, 查全率: %.3f, f1: %.3f\" % (precision, recall, f1))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(array([0.81854282, 0.72815534]), array([0.96029978, 0.33312183]), array([0.88377295, 0.45711798]), array([4937, 1576]))\n" ] } ], "source": [ "from sklearn.metrics import precision_recall_fscore_support\n", "\n", "\n", 
"print(precision_recall_fscore_support(test_set[label], test_set[\"pred\"]))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.8301193386929114\n" ] } ], "source": [ "# Compute AUC\n", "from sklearn import metrics\n", "\n", "\n", "# ROC points from the true labels and predicted probabilities (thresholds are not used)\n", "fpr, tpr, _ = metrics.roc_curve(test_set[label], test_set[\"prob\"])\n", "# Area under the ROC curve\n", "auc = metrics.auc(fpr, tpr)\n", "print(auc)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# ROC curve\n", "import matplotlib.pyplot as plt\n", "\n", "\n", "# NOTE(review): SimHei is presumably selected so the Chinese legend text renders - confirm the font is installed\n", "plt.rcParams[\"font.sans-serif\"] = [\"SimHei\"]\n", "# One figure holding a single axes\n", "fig, ax = plt.subplots(figsize=(6, 6), dpi=80)\n", "roc_label = \"%s; %s = %0.2f\" % (\"ROC曲线\", \"曲线下面积(AUC)\", auc)\n", "ax.plot(fpr, tpr, \"k\", label=roc_label)\n", "ax.fill_between(fpr, tpr, color=\"grey\", alpha=0.6)\n", "# Dashed red diagonal from (0, 0) to (1, 1) as a reference line\n", "ax.plot([0, 1], [0, 1], \"r--\")\n", "ax.set_xlim([0, 1])\n", "ax.set_ylim([0, 1])\n", "ax.legend(shadow=True)\n", "plt.show()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.6" } }, "nbformat": 4, "nbformat_minor": 2 }