Bladeren bron

imbalanced data

Gen TANG 1 jaar geleden
bovenliggende
commit
c6f94d0327
2 gewijzigde bestanden met toevoegingen van 167 en 0 verwijderingen
  1. 167 0
      prerequisite/logit/imbalanced_data.ipynb
  2. BIN
      prerequisite/logit/pdf/6_非均衡分类问题.pdf

+ 167 - 0
prerequisite/logit/imbalanced_data.ipynb

@@ -0,0 +1,167 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 按照逻辑回归的假设产生数据\n",
+    "import numpy as np\n",
+    "\n",
+    "\n",
+    "# 类别1的个数\n",
+    "n = 1000\n",
+    "# 类别0的倍数(相比于类别1)\n",
+    "zero_times = 20\n",
+    "np.random.seed(4060)\n",
+    "cov = [[1, 0], [0, 1]]\n",
+    "X = np.random.multivariate_normal([0, 0], cov, 2 * n * zero_times)\n",
+    "beta = np.array([1, -1]).reshape(2, 1)\n",
+    "## 产生随机扰动项\n",
+    "error = np.random.logistic(size=2 * n * zero_times).reshape(-1, 1)\n",
+    "Y = (np.dot(X, beta) + error > 0) + 0"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Y = 1的个数:1000\n",
+      "Y = 0的个数:20000\n"
+     ]
+    }
+   ],
+   "source": [
+    "# 产生非均衡数据集\n",
+    "X1 = X[np.where(Y == 1)[0]][:n]\n",
+    "Y1 = Y[np.where(Y == 1)[0]][:n]\n",
+    "X0 = X[np.where(Y == 0)[0]]\n",
+    "Y0 = Y[np.where(Y == 0)[0]]\n",
+    "X = np.append(X0, X1, axis=0)\n",
+    "Y = np.append(Y0, Y1, axis=0)\n",
+    "print(\"Y = 1的个数:%s\" % Y[np.where(Y == 1)].shape)\n",
+    "print(\"Y = 0的个数:%s\" % Y[np.where(Y == 0)].shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 定义模型的评估方法\n",
+    "from sklearn import metrics\n",
+    "\n",
+    "\n",
+    "def evaluate_model(Y, pred):\n",
+    "    \"\"\"\n",
+    "    评估模型效果,其中包括ACC,AUC以及预测结果中类别1的个数\n",
+    "    \"\"\"\n",
+    "    print(\"数据中Y = 1的个数:%s\" % Y[np.where(Y == 1)].shape)\n",
+    "    print(\"预测结果中Y = 1的个数:%s\" % pred[np.where(pred == 1)].shape)\n",
+    "    print(\"模型准确度:%s\" % metrics.accuracy_score(Y, pred))\n",
+    "    fpr, tpr, _ = metrics.roc_curve(Y, pred)\n",
+    "    print(\"模型AUC:%s\" % metrics.auc(fpr, tpr))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "数据中Y = 1的个数:1000\n",
+      "预测结果中Y = 1的个数:42\n",
+      "模型准确度:0.9525714285714286\n",
+      "模型AUC:0.511025\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
+      "  FutureWarning)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# 直接训练模型得到结果\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "\n",
+    "\n",
+    "## 为了消除惩罚项的干扰,将惩罚系数设为很大\n",
+    "model = LogisticRegression(C=1e4)\n",
+    "model.fit(X, Y.ravel())\n",
+    "pred = model.predict(X)\n",
+    "evaluate_model(Y, pred)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "数据中Y = 1的个数:1000\n",
+      "预测结果中Y = 1的个数:6347\n",
+      "模型准确度:0.7197619047619047\n",
+      "模型AUC:0.7251\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
+      "  FutureWarning)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# 修正类别权重训练模型得到结果\n",
+    "positive_weight = len(Y[Y > 0]) / float(len(Y))\n",
+    "## 用类别占比的倒数作为类别权重,使数据集“回归均衡”\n",
+    "class_weight = {1: 1. / positive_weight, 0: 1. / (1 - positive_weight)}\n",
+    "## 为了消除惩罚项的干扰,将惩罚系数设为很大\n",
+    "model = LogisticRegression(class_weight=class_weight, C=1e4)\n",
+    "model.fit(X, Y.ravel())\n",
+    "pred = model.predict(X)\n",
+    "evaluate_model(Y, pred)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

BIN
prerequisite/logit/pdf/6_非均衡分类问题.pdf