|
|
@@ -0,0 +1,167 @@
|
|
|
+{
|
|
|
+ "cells": [
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 1,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# 按照逻辑回归的假设产生数据\n",
|
|
|
+ "import numpy as np\n",
|
|
|
+ "\n",
|
|
|
+ "\n",
|
|
|
+ "# 类别1的个数\n",
|
|
|
+ "n = 1000\n",
|
|
|
+ "# 类别0的倍数(相比于类别1)\n",
|
|
|
+ "zero_times = 20\n",
|
|
|
+ "np.random.seed(4060)\n",
|
|
|
+ "cov = [[1, 0], [0, 1]]\n",
|
|
|
+ "X = np.random.multivariate_normal([0, 0], cov, 2 * n * zero_times)\n",
|
|
|
+ "beta = np.array([1, -1]).reshape(2, 1)\n",
|
|
|
+ "## 产生随机扰动项\n",
|
|
|
+ "error = np.random.logistic(size=2 * n * zero_times).reshape(-1, 1)\n",
|
|
|
+ "Y = (np.dot(X, beta) + error > 0) + 0"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 2,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "name": "stdout",
|
|
|
+ "output_type": "stream",
|
|
|
+ "text": [
|
|
|
+ "Y = 1的个数:1000\n",
|
|
|
+ "Y = 0的个数:20000\n"
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "# 产生非均衡数据集\n",
|
|
|
+ "X1 = X[np.where(Y == 1)[0]][:n]\n",
|
|
|
+ "Y1 = Y[np.where(Y == 1)[0]][:n]\n",
|
|
|
+ "X0 = X[np.where(Y == 0)[0]]\n",
|
|
|
+ "Y0 = Y[np.where(Y == 0)[0]]\n",
|
|
|
+ "X = np.append(X0, X1, axis=0)\n",
|
|
|
+ "Y = np.append(Y0, Y1, axis=0)\n",
|
|
|
+ "print(\"Y = 1的个数:%s\" % Y[np.where(Y == 1)].shape)\n",
|
|
|
+ "print(\"Y = 0的个数:%s\" % Y[np.where(Y == 0)].shape)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 3,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# 定义模型的评估方法\n",
|
|
|
+ "from sklearn import metrics\n",
|
|
|
+ "\n",
|
|
|
+ "\n",
|
|
|
+ "def evaluate_model(Y, pred):\n",
|
|
|
+ " \"\"\"\n",
|
|
|
+ " 评估模型效果,其中包括ACC,AUC以及预测结果中类别1的个数\n",
|
|
|
+ " \"\"\"\n",
|
|
|
+ " print(\"数据中Y = 1的个数:%s\" % Y[np.where(Y == 1)].shape)\n",
|
|
|
+ " print(\"预测结果中Y = 1的个数:%s\" % pred[np.where(pred == 1)].shape)\n",
|
|
|
+ " print(\"模型准确度:%s\" % metrics.accuracy_score(Y, pred))\n",
|
|
|
+ " fpr, tpr, _ = metrics.roc_curve(Y, pred)\n",
|
|
|
+ " print(\"模型AUC:%s\" % metrics.auc(fpr, tpr))"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 4,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "name": "stdout",
|
|
|
+ "output_type": "stream",
|
|
|
+ "text": [
|
|
|
+ "数据中Y = 1的个数:1000\n",
|
|
|
+ "预测结果中Y = 1的个数:42\n",
|
|
|
+ "模型准确度:0.9525714285714286\n",
|
|
|
+ "模型AUC:0.511025\n"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "name": "stderr",
|
|
|
+ "output_type": "stream",
|
|
|
+ "text": [
|
|
|
+ "/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
|
|
|
+ " FutureWarning)\n"
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "# 直接训练模型得到结果\n",
|
|
|
+ "from sklearn.linear_model import LogisticRegression\n",
|
|
|
+ "\n",
|
|
|
+ "\n",
|
|
|
+ "## 为了消除惩罚项的干扰,将惩罚系数设为很大\n",
|
|
|
+ "model = LogisticRegression(C=1e4)\n",
|
|
|
+ "model.fit(X, Y.ravel())\n",
|
|
|
+ "pred = model.predict(X)\n",
|
|
|
+ "evaluate_model(Y, pred)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 5,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "name": "stdout",
|
|
|
+ "output_type": "stream",
|
|
|
+ "text": [
|
|
|
+ "数据中Y = 1的个数:1000\n",
|
|
|
+ "预测结果中Y = 1的个数:6347\n",
|
|
|
+ "模型准确度:0.7197619047619047\n",
|
|
|
+ "模型AUC:0.7251\n"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "name": "stderr",
|
|
|
+ "output_type": "stream",
|
|
|
+ "text": [
|
|
|
+ "/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
|
|
|
+ " FutureWarning)\n"
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "# 修正类别权重训练模型得到结果\n",
|
|
|
+ "positive_weight = len(Y[Y > 0]) / float(len(Y))\n",
|
|
|
+ "## 用类别占比的倒数作为类别权重,使数据集“回归均衡”\n",
|
|
|
+ "class_weight = {1: 1. / positive_weight, 0: 1. / (1 - positive_weight)}\n",
|
|
|
+ "## 为了消除惩罚项的干扰,将惩罚系数设为很大\n",
|
|
|
+ "model = LogisticRegression(class_weight=class_weight, C=1e4)\n",
|
|
|
+ "model.fit(X, Y.ravel())\n",
|
|
|
+ "pred = model.predict(X)\n",
|
|
|
+ "evaluate_model(Y, pred)"
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "metadata": {
|
|
|
+ "kernelspec": {
|
|
|
+ "display_name": "Python 3",
|
|
|
+ "language": "python",
|
|
|
+ "name": "python3"
|
|
|
+ },
|
|
|
+ "language_info": {
|
|
|
+ "codemirror_mode": {
|
|
|
+ "name": "ipython",
|
|
|
+ "version": 3
|
|
|
+ },
|
|
|
+ "file_extension": ".py",
|
|
|
+ "mimetype": "text/x-python",
|
|
|
+ "name": "python",
|
|
|
+ "nbconvert_exporter": "python",
|
|
|
+ "pygments_lexer": "ipython3",
|
|
|
+ "version": "3.6.6"
|
|
|
+ }
|
|
|
+ },
|
|
|
+ "nbformat": 4,
|
|
|
+ "nbformat_minor": 2
|
|
|
+}
|