Browse Source

update comment for ch04

Gen TANG 2 years ago
parent
commit
e6c0e50630

+ 16 - 0
ch04_logit/README.md

@@ -0,0 +1,16 @@
+
+|代码|说明|
+|---|---|
+|[normal\_logit_approx.ipynb](normal_logit_approx.ipynb)| 逻辑分布与正态分布的关系 |
+|[logit_example.ipynb](logit_example.ipynb)| 展示为什么不能用线性回归模型解决分类问题 |
+|[logit_regression.ipynb](logit_regression.ipynb)| 对个人收入数据进行建模,展示逻辑回归的细节 |
+|[roc_curve.ipynb](roc_curve.ipynb)| ROC曲线与AUC |
+|[imbalanced_data.ipynb](imbalanced_data.ipynb)| 非均衡数据集对模型效果的影响以及应对方法 |
+|[multi\_logit_example.ipynb](multi_logit_example.ipynb)| 利用逻辑回归解决多元分类问题 |
+
+
+
+
+
+
+

+ 17 - 17
ch04_logit/imbalanced_data.ipynb

@@ -38,12 +38,12 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def unbalanced_data(X, Y, zeroTimes):\n",
-    "    \"\"\"\n",
-    "    通过将类别0的数据重复zeroTimes次,将均衡数据集变为非均衡数据集\n",
-    "    \"\"\"\n",
-    "    X0 = np.repeat(X[np.where(Y == 0)[0]], zeroTimes, axis=0)\n",
-    "    Y0 = np.repeat(Y[np.where(Y == 0)[0]], zeroTimes, axis=0)\n",
+    "def unbalanced_data(X, Y, zero_times):\n",
+    "    '''\n",
+    "    通过将类别0的数据重复zero_times次,将均衡数据集变为非均衡数据集\n",
+    "    '''\n",
+    "    X0 = np.repeat(X[np.where(Y == 0)[0]], zero_times, axis=0)\n",
+    "    Y0 = np.repeat(Y[np.where(Y == 0)[0]], zero_times, axis=0)\n",
     "    X1 = X[np.where(Y > 0)[0]]\n",
     "    Y1 = Y[np.where(Y > 0)[0]]\n",
     "    _X = np.append(X0, X1, axis=0)\n",
@@ -73,9 +73,9 @@
    "outputs": [],
    "source": [
     "def evaluate_model(Y, pred):\n",
-    "    \"\"\"\n",
+    "    '''\n",
     "    评估模型效果,其中包括ACC,AUC以及预测结果中类别1的个数\n",
-    "    \"\"\"\n",
+    "    '''\n",
     "    pred_positive = []\n",
     "    true_positive = []\n",
     "    aucs = []\n",
@@ -98,11 +98,11 @@
    "outputs": [],
    "source": [
     "def visualize(ratios, pred_positive, true_positive, aucs, accuracies):\n",
-    "    \"\"\"\n",
+    "    '''\n",
     "    将模型结果可视化\n",
-    "    \"\"\"\n",
+    "    '''\n",
     "    # 为在Matplotlib中显示中文,设置特殊字体\n",
-    "    plt.rcParams[\"font.sans-serif\"] = [\"SimHei\"]\n",
+    "    plt.rcParams['font.sans-serif'] = ['SimHei']\n",
     "    # 正确显示负号\n",
     "    plt.rcParams['axes.unicode_minus'] = False\n",
     "    plt.rcParams.update({'font.size': 13})\n",
@@ -110,18 +110,18 @@
     "    fig = plt.figure(figsize=(12, 6), dpi=100)\n",
     "    # 在图形框里画两幅图\n",
     "    ax = fig.add_subplot(1, 2, 1)\n",
-    "    ax.plot(ratios, pred_positive, label=\"%s\" % \"预测结果里类别1的个数\")\n",
-    "    ax.plot(ratios, true_positive, \"k--\", label=\"%s\" % \"原始数据里类别1的个数\")\n",
+    "    ax.plot(ratios, pred_positive, label='%s' % '预测结果里类别1的个数')\n",
+    "    ax.plot(ratios, true_positive, 'k--', label='%s' % '原始数据里类别1的个数')\n",
     "    ax.set_xlim([0, 0.5])\n",
     "    ax.invert_xaxis()\n",
-    "    legend = plt.legend(shadow=True, loc=\"best\")\n",
+    "    legend = plt.legend(shadow=True, loc='best')\n",
     "    ax1 = fig.add_subplot(1, 2, 2)\n",
-    "    ax1.plot(ratios, aucs, \"r\", label=\"%s\" % \"曲线下面积(AUC)\")\n",
-    "    ax1.plot(ratios, accuracies, \"k-.\", label=\"%s\" % \"准确度(ACC)\")\n",
+    "    ax1.plot(ratios, aucs, 'r', label='%s' % '曲线下面积(AUC)')\n",
+    "    ax1.plot(ratios, accuracies, 'k-.', label='%s' % '准确度(ACC)')\n",
     "    ax1.set_xlim([0, 0.5])\n",
     "    ax1.set_ylim([0.5, 1])\n",
     "    ax1.invert_xaxis()\n",
-    "    legend = plt.legend(shadow=True, loc=\"best\")\n",
+    "    legend = plt.legend(shadow=True, loc='best')\n",
     "    return fig"
    ]
   },

+ 2 - 1
ch04_logit/logit_example.ipynb

@@ -21,6 +21,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "# 生成模型数据\n",
     "size = 10000\n",
     "x = np.random.normal(size=size)\n",
     "y = (x > 0).astype(np.float64)\n",
@@ -88,7 +89,7 @@
     "xline = np.linspace(-3, 5, 100)[:, np.newaxis]\n",
     "yline = model.predict(xline)\n",
     "ax.plot(xline.ravel(), yline, 'r')\n",
-    "\n",
+    "# 第二幅图\n",
     "ax1 = fig.add_subplot(1, 2, 2)\n",
     "residual = y - model.predict(x)\n",
     "n, bins, _ = ax1.hist(residual, 40, density=1, facecolor='grey', rwidth=0.8, alpha=0.6)\n",

+ 2 - 1
ch04_logit/logit_regression.ipynb

@@ -14,7 +14,7 @@
     "import matplotlib.pyplot as plt\n",
     "%matplotlib inline\n",
     "\n",
-    "\n",
+    "# 设置数据展示的格式\n",
     "pd.set_option('display.width', 1000)\n",
     "pd.set_option('display.precision', 1)"
    ]
@@ -325,6 +325,7 @@
     }
    ],
    "source": [
+    "# 展示各变量的分布情况\n",
     "plt.rcParams.update({'font.size': 13})\n",
     "fig, ax = plt.subplots(1, 4, figsize=(20, 5), dpi=100)\n",
     "hist_data = data[['age', 'education_num', 'hours_per_week', 'label_code']]\n",

+ 1 - 0
ch04_logit/roc_curve.ipynb

@@ -207,6 +207,7 @@
     }
    ],
    "source": [
+    "# False Positive Rate\n",
     "fpr"
    ]
   },