{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install the third-party dependency.\n",
    "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
    "%pip install torcheval"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       ""
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import torch\n",
    "import torch.nn.functional as F\n",
    "\n",
    "from utils import Linear\n",
    "\n",
    "%matplotlib inline\n",
    "\n",
    "# Fix the random seed so that the results can be reproduced\n",
    "torch.manual_seed(1024)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_data(path):\n",
    "    \"\"\"\n",
    "    Read a CSV file with pandas and keep only the columns used in this notebook.\n",
    "\n",
    "    Parameters\n",
    "    ----\n",
    "    path : str, location of the CSV file; its header must contain the columns listed in `cols`\n",
    "\n",
    "    Returns\n",
    "    ----\n",
    "    pandas.DataFrame holding the five feature columns followed by \"label\"\n",
    "    \"\"\"\n",
    "    cols = [\"age\", \"education_num\", \"capital_gain\", \"capital_loss\", \"hours_per_week\", \"label\"]\n",
    "    frame = pd.read_csv(path)\n",
    "    return frame[cols]\n",
    "\n",
    "\n",
    "# os.path.join inserts the separator of the current OS, so no per-platform branch is needed\n",
    "data_path = os.path.join(\".\", \"data\", \"adult.data\")\n",
    "data = read_data(data_path)\n",
    "# Integer-encode the label column; the codes index the categories inferred by pd.Categorical\n",
    "data[\"label_code\"] = pd.Categorical(data.label).codes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ageeducation_numcapital_gaincapital_losshours_per_weeklabellabel_code
039132174040<=50K0
150130013<=50K0
23890040<=50K0
35370040<=50K0
428130040<=50K0
........................
3255627120038<=50K0
325574090040>50K1
325585890040<=50K0
325592290020<=50K0
3256052915024040>50K1
\n", "

32561 rows × 7 columns

\n", "
" ],
      "text/plain": [
       " age education_num capital_gain capital_loss hours_per_week label \\\n",
       "0 39 13 2174 0 40 <=50K \n",
       "1 50 13 0 0 13 <=50K \n",
       "2 38 9 0 0 40 <=50K \n",
       "3 53 7 0 0 40 <=50K \n",
       "4 28 13 0 0 40 <=50K \n",
       "... ... ... ... ... ... ... \n",
       "32556 27 12 0 0 38 <=50K \n",
       "32557 40 9 0 0 40 >50K \n",
       "32558 58 9 0 0 40 <=50K \n",
       "32559 22 9 0 0 20 <=50K \n",
       "32560 52 9 15024 0 40 >50K \n",
       "\n",
       " label_code \n",
       "0 0 \n",
       "1 0 \n",
       "2 0 \n",
       "3 0 \n",
       "4 0 \n",
       "... ... \n",
       "32556 0 \n",
       "32557 1 \n",
       "32558 0 \n",
       "32559 0 \n",
       "32560 1 \n",
       "\n",
       "[32561 rows x 7 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the data\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "class LogitRegression:\n",
    "\n",
    "    def __init__(self, neg, pos):\n",
    "        '''\n",
    "        Structure of the logistic regression model: one linear scorer per class.\n",
    "\n",
    "        Parameters\n",
    "        ----\n",
    "        neg : Linear, preference for the negative class; the model has shape (k, 1)\n",
    "        pos : Linear, preference for the positive class; the model has shape (k, 1)\n",
    "        '''\n",
    "        self.neg = neg\n",
    "        self.pos = pos\n",
    "\n",
    "    def __call__(self, x):\n",
    "        '''\n",
    "        Forward pass of the logistic regression model.\n",
    "\n",
    "        Parameters\n",
    "        ----\n",
    "        x : torch.FloatTensor of shape (n, k), where n is the batch size and k the number of features\n",
    "\n",
    "        Returns\n",
    "        ----\n",
    "        Tensor of shape (n, 2): negative score in column 0, positive score in column 1.\n",
    "        The same tensor is also kept on self.out.\n",
    "        '''\n",
    "        neg_score = self.neg(x)\n",
    "        pos_score = self.pos(x)\n",
    "        self.out = torch.concat((neg_score, pos_score), dim=1)\n",
    "        return self.out\n",
    "\n",
    "    def parameters(self):\n",
    "        '''\n",
    "        Parameters of the negative scorer followed by those of the positive scorer.\n",
    "        '''\n",
    "        neg_params = self.neg.parameters()\n",
    "        pos_params = self.pos.parameters()\n",
    "        return neg_params + pos_params"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define the model: two linear scorers over the 5 features, one output each.\n",
    "# pos is created before neg, as in the original run; presumably Linear draws its\n",
    "# initial weights from the seeded RNG, so the creation order is kept (TODO confirm).\n",
    "pos = Linear(5, 1)\n",
    "neg = Linear(5, 1)\n",
    "model = LogitRegression(neg=neg, pos=pos)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prepare the data\n",
    "feature_cols = ['age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']\n",
    "features = torch.tensor(data[feature_cols].values).float()\n",
    "# F.normalize defaults to p=2, dim=1: every row (sample) is rescaled to unit L2 norm\n",
    "x = F.normalize(features)  # (32561, 5)\n",
    "y = torch.tensor(data['label_code']).long()  # (32561)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
"text/plain": [
       "(tensor([[ 1.2665, -1.7305]]), tensor([[0.9524, 0.0476]]), tensor([0]))"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Use the model\n",
    "## Note: the input fed to the model must have shape (n, 5)\n",
    "sample_x = x[[1]]\n",
    "logits = model(sample_x)  # (1, 2)\n",
    "probs = F.softmax(logits, dim=1)  # (1, 2)\n",
    "pred = torch.where(probs[:, 1] > 0.5, 1, 0)\n",
    "logits, probs, pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(tensor(0.0487), tensor(0.0487))"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Loss of the model on a single sample\n",
    "loss = F.cross_entropy(logits, y[[1]])\n",
    "# What cross_entropy computes: the mean negative log-probability assigned to the true class\n",
    "manual_loss = -probs[torch.arange(1), y[[1]]].log().mean()\n",
    "manual_loss, loss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Gradients must be recorded for every model parameter (preparation for back-propagation)\n",
    "for param in model.parameters():\n",
    "    param.requires_grad = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "step 0/20000, loss: 0.6580\n",
      "step 2000/20000, loss: 0.5092\n",
      "step 4000/20000, loss: 0.5066\n",
      "step 6000/20000, loss: 0.5037\n",
      "step 8000/20000, loss: 0.4958\n",
      "step 10000/20000, loss: 0.5046\n",
      "step 12000/20000, loss: 0.5015\n",
      "step 14000/20000, loss: 0.4952\n",
      "step 16000/20000, loss: 0.5086\n",
      "step 18000/20000, loss: 0.5086\n"
     ]
    }
   ],
   "source": [
    "# Hyper-parameters of standard (mini-batch) stochastic gradient descent\n",
    "max_steps = 20000\n",
    "batch_size = 3000\n",
    "lossi = []\n",
    "\n",
    "for i in range(max_steps):\n",
    "    # Build the training batch: row indices are drawn at random, with replacement\n",
    "    ix = torch.randint(0, x.shape[0], (batch_size,))\n",
    "    xb, yb = x[ix], y[ix]\n",
    "    # Forward pass\n",
    "    logits = model(xb)\n",
    "    loss = F.cross_entropy(logits, yb)\n",
    "    # Backward pass\n",
    "    loss.backward()\n",
    "    # Learning-rate decay: larger step for the first 10000 iterations, smaller afterwards\n",
    "    if i < 10000:\n",
    "        learning_rate = 0.1\n",
    "    else:\n",
    "        learning_rate = 0.01\n",
    "    # Update the model parameters without recording the update in the autograd graph\n",
    "    with torch.no_grad():\n",
    "        for p in model.parameters():\n",
    "            p -= learning_rate * p.grad\n",
    "            # Drop the gradient so the next backward() does not accumulate onto it\n",
    "            p.grad = None\n",
    "\n",
    "    # Statistics: print every 2000 steps, record the loss of every step\n",
    "    if i % 2000 == 0:\n",
    "        print(f'step {i: 6d}/{max_steps}, loss: {loss.item(): .4f}')\n",
    "    lossi.append(loss.item())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "
"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Show how the model loss evolved during optimisation\n",
    "plt.plot(torch.tensor(lossi))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(torch.Size([32561, 2]), torch.Size([32561, 2]), torch.Size([32561]))"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Predict on the whole dataset with gradient tracking switched off\n",
    "with torch.no_grad():\n",
    "    logits = model(x)\n",
    "    probs = F.softmax(logits, dim=1)\n",
    "    pos_prob = probs[:, 1]\n",
    "    pred = torch.where(pos_prob > 0.5, 1, 0)\n",
    "logits.shape, probs.shape, pred.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(tensor(0.2928), tensor(0.5902), tensor(0.3914))"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from torcheval.metrics.functional.classification import binary_recall\n",
    "from torcheval.metrics.functional import binary_precision, binary_f1_score\n",
    "# Recall, precision and F1 score of the thresholded predictions against the true labels\n",
    "recall = binary_recall(pred, y)\n",
    "precision = binary_precision(pred, y)\n",
    "f1 = binary_f1_score(pred, y)\n",
    "recall, precision, f1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shows how ranking data can be used to obtain an estimate of preferences.\n",
    "# Only the model structure is illustrated here; the model is neither trained nor used.\n",
    "class PreferenceModel:\n",
    "\n",
    "    def __init__(self, pref):\n",
    "        '''\n",
    "        pref : model that is applied to each item to produce its preference score\n",
    "        '''\n",
    "        self.pref = pref\n",
    "\n",
    "    def __call__(self, x0, x1):\n",
    "        '''\n",
    "        Score x0 and x1 with the same preference model and concatenate the two\n",
    "        scores along dim=1. The result is returned and also kept on self.out.\n",
    "        '''\n",
    "        score0 = self.pref(x0)\n",
    "        score1 = self.pref(x1)\n",
    "        self.out = torch.concat((score0, score1), dim=1)\n",
    "        return self.out\n",
    "\n",
    "    def parameters(self):\n",
    "        '''\n",
    "        Parameters of the shared preference model.\n",
    "        '''\n",
    "        return self.pref.parameters()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Model that predicts the preference\n",
    "preference = Linear(5, 1)\n",
    "# Combine the preferences of two items so that they can be matched with ranking data\n",
    "p_model = PreferenceModel(pref=preference)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(tensor([[0.0679, 1.1044]]), tensor([[0.2618, 0.7382]]))"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
],
   "source": [
    "# Pick two samples to compare (here simply the first two rows of x)\n",
    "x0, x1 = x[[0]], x[[1]]\n",
    "p_logits = p_model(x0, x1)\n",
    "# Softmax over the two preference scores gives the ranking probability implied by the preferences;\n",
    "# combined with the ranking that was actually observed, it can be used to define the model loss\n",
    "p_probs = F.softmax(p_logits, dim=1)\n",
    "p_logits, p_probs"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}