{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "8D5tOVWvG6Ss", "outputId": "917998d3-b968-48d9-b7f1-a0f159ff1cd2" }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "import torch.optim as optim\n", "from torch.utils.data import DataLoader\n", "from datasets import load_dataset\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline\n", "\n", "\n", "torch.manual_seed(12046)\n", "# 计算设备为V100 16G\n", "# 如果使用CPU,需要非常长的时间,建议减少模型规模来加快速度(比如n_layer)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "GwjXDvhCG6Su", "outputId": "0f0cff6f-c785-4259-d271-91217c170986" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tensor([[[ 1.0185, -1.3091, 1.2908, 0.5276],\n", " [-0.2985, 1.6259, 2.0433, -0.6417],\n", " [ 0.8795, -1.0512, 1.1491, 0.6116],\n", " [ 0.2128, -0.5512, 0.0450, 0.5010]]])\n", "tensor([[[ 1.0185, -inf, -inf, -inf],\n", " [-0.2985, 1.6259, -inf, -inf],\n", " [ 0.8795, -1.0512, 1.1491, -inf],\n", " [ 0.2128, -0.5512, 0.0450, 0.5010]]])\n", "tensor([[[1.0000, 0.0000, 0.0000, 0.0000],\n", " [0.1274, 0.8726, 0.0000, 0.0000],\n", " [0.4074, 0.0591, 0.5335, 0.0000],\n", " [0.2743, 0.1278, 0.2319, 0.3659]]])\n" ] } ], "source": [ "# 展示mask在注意力机制中的作用\n", "T = 4\n", "scores = torch.randn(1, T, T)\n", "print(scores)\n", "# 定义上三角矩阵\n", "tril = torch.tril(torch.ones(T, T))\n", "scores = scores.masked_fill(tril == 0, float('-inf'))\n", "print(scores)\n", "# 将scores转换为自回归学习的权重\n", "print(F.softmax(scores, dim=-1))" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "2o_FK0sUWW-t", "outputId": "af65728c-5920-4548-b8f5-5c9a925029b3" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tensor(1.0026) tensor(1.0010) tensor(4.0152)\n", "tensor(1.0026) tensor(1.0010) tensor(1.0038)\n", "tensor([[0.0921, 0.1476, 0.1698, 0.4256, 0.0489, 0.0599, 0.0172, 0.0388]])\n", "tensor([[0., 0., 0., 1., 0., 0., 0., 0.]])\n" ] } ], "source": [ "# 展示对齐分数的方差放大效应\n", "B, T, head_size = 32, 100, 16\n", "\n", "k = torch.randn(B, T, head_size) # (B, T, H)\n", "q = torch.randn(B, T, head_size) # (B, T, H)\n", "scores = q @ k.transpose(-2, -1) # (B, T, T)\n", "print(k.std(), q.std(), scores.std())\n", "# 将对齐分数归一化\n", "scores = scores / head_size ** 0.5\n", "print(k.std(), q.std(), scores.std())\n", "\n", "# Softmax函数在处理方差较大的数据时,会发生聚集效应(结果过于集中在一个点上)\n", "# 这是为什么需要将对齐分数归一化\n", "x = torch.randn(1, 8)\n", "print(torch.softmax(x, dim=-1))\n", "print(torch.softmax(1000 * x, dim=-1))" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "id": "ujPVSgIgG6Sv" }, "outputs": [], "source": [ "def attention(query, key, value, dropout, mask=None):\n", " '''\n", " 注意力机制\n", " 参数\n", " ----\n", " query :torch.FloatTensor,查询向量,形状为(B, T, C)\n", " key :torch.FloatTensor,键向量,形状为(B, T, C)\n", " value :torch.FloatTensor,数值向量,形状为(B, T, C)\n", " dropout :随机失活\n", " mask :torch.FloatTensor,掩码,形状为(T, T)\n", " 返回\n", " ----\n", " out :torch.FloatTensor,根据注意力机制得到的背景向量,形状为(B, T, C)\n", " w_att :torch.FloatTensor,权重向量,形状为(B, T, T)\n", " '''\n", " # query, key, value都有相同的形状\n", " B, T, C = query.shape\n", " # (B, T, C) @ (B, C, T) --> (B, T, T)\n", " scores = query @ key.transpose(-2, -1) / (C ** 0.5)\n", " if mask is not None:\n", " # 如果没有mask,则表示词元可以使用左右两边的背景,也就是双向注意力\n", " # 如果mask是上三角矩阵,则表示自回归模式的单向注意力\n", " # mask的形状是(T, T)\n", " scores = scores.masked_fill(mask == 0, float('-inf'))\n", " w_att = dropout(F.softmax(scores, dim=-1)) # (B, T, T)\n", " out = w_att @ value # (B, T, C)\n", " return out, w_att" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "id": "SbJ2z5U3G6Sw" }, "outputs": [], "source": [ "class MaskedAttention(nn.Module):\n", "\n", " def __init__(self, emb_size, head_size):\n", " '''\n", " 单头单向注意力\n", " 参数\n", " ----\n", " emb_size :int,特征长度\n", " head_size :int,背景向量长度\n", " '''\n", " super().__init__()\n", " self.key = nn.Linear(emb_size, head_size, bias=False)\n", " self.query = nn.Linear(emb_size, head_size, bias=False)\n", " self.value = nn.Linear(emb_size, head_size, bias=False)\n", " # 这个上三角矩阵不参与模型训练\n", " self.register_buffer(\n", " 'tril', torch.tril(torch.ones(sequence_len, sequence_len)))\n", " self.dropout = nn.Dropout(0.4)\n", "\n", " def forward(self, x):\n", " '''\n", " 向前传播\n", " 参数\n", " ----\n", " x :torch.FloatTensor\n", " 文本的特征向量,形状为(B, T, C),其中B表示批量大小,T表示文本长度,C表示特征长度(emb_size)\n", " 返回\n", " ----\n", " out :torch.FloatTensor\n", " 根据注意力机制得到的背景向量,形状为(B, T, H),其中H表示背景向量长度(head_size)\n", " '''\n", " B, T, C = x.shape\n", " q = self.query(x) # (B, T, H)\n", " k = self.key(x) # (B, T, H)\n", " v = self.value(x) # (B, T, H)\n", " mask = self.tril[:T, :T]\n", " out, _ = attention(q, k, v, self.dropout, mask)\n", " return out # (B, T, H)\n", "\n", "class MaskedMultiHeadAttention(nn.Module):\n", "\n", " def __init__(self, emb_size, head_size):\n", " '''\n", " 多头单向注意力\n", " 参数\n", " ----\n", " emb_size :int,特征长度\n", " head_size :int,背景向量长度\n", " '''\n", " super().__init__()\n", " # 确保特征长度是背景向量长度的倍数\n", " assert(emb_size % head_size == 0)\n", " # 定义单头注意力的个数\n", " n_head = emb_size // head_size\n", " heads = [MaskedAttention(emb_size, head_size) for _ in range(n_head)]\n", " self.heads = nn.ModuleList(heads)\n", " # 线性变换\n", " self.proj = nn.Linear(emb_size, emb_size)\n", " # 随机失活\n", " self.dropout = nn.Dropout(0.4)\n", "\n", " def forward(self, x):\n", " '''\n", " 向前传播\n", " 参数\n", " ----\n", " x :torch.FloatTensor\n", " 文本的特征向量,形状为(B, T, C),其中B表示批量大小,T表示文本长度,C表示特征长度(emb_size)\n", " 返回\n", " ----\n", " out :torch.FloatTensor,根据注意力机制得到的背景向量,形状为(B, T, C)\n", " '''\n", " # 将多个单头注意力的结果做张量拼接\n", " out = torch.cat([h(x) for h in self.heads], dim=-1) # (B, T, C)\n", " out = self.dropout(self.proj(out))\n", " return out\n", "\n", "class FeedForward(nn.Module):\n", "\n", " def __init__(self, emb_size):\n", " '''\n", " 多层感知器\n", " '''\n", " super().__init__()\n", " self.l1 = nn.Linear(emb_size, 4 * emb_size)\n", " self.l2 = nn.Linear(4 * emb_size, emb_size)\n", " self.dropout = nn.Dropout(0.4)\n", "\n", " def forward(self, x):\n", " x = F.gelu(self.l1(x))\n", " out = self.dropout(self.l2(x))\n", " return out\n", "\n", "class Block(nn.Module):\n", "\n", " def __init__(self, emb_size, head_size):\n", " '''\n", " 解码块\n", " 参数\n", " ----\n", " emb_size :int,特征长度\n", " head_size :int,单头注意力中的背景向量长度\n", " '''\n", " super().__init__()\n", " self.mha = MaskedMultiHeadAttention(emb_size, head_size)\n", " self.ff = FeedForward(emb_size)\n", " # 层归一化\n", " self.ln1 = nn.LayerNorm(emb_size)\n", " self.ln2 = nn.LayerNorm(emb_size)\n", "\n", " def forward(self, x):\n", " '''\n", " 向前传播\n", " 参数\n", " ----\n", " x :torch.FloatTensor,文本的特征向量,形状为(B, T, C)\n", " 返回\n", " ----\n", " out :torch.FloatTensor,解码块的输出,形状为(B, T, C)\n", " '''\n", " # 残差连接\n", " x = x + self.mha(self.ln1(x)) # (B, T, C)\n", " out = x + self.ff(self.ln2(x)) # (B, T, C)\n", " return out" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "id": "KlDmjSCMG6Sx" }, "outputs": [], "source": [ "# 一些超参数\n", "emb_size = 128\n", "head_size = 8\n", "n_layer = 12\n", "sequence_len = 64\n", "learning_rate = 1e-3\n", "eval_iters = 20\n", "batch_size=500\n", "# 如果有GPU,该脚本将使用GPU进行计算\n", "device = 'cuda' if torch.cuda.is_available() else 'cpu'" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "id": "fwr_fSVpG6Sx" }, "outputs": [], "source": [ "class CharGPT(nn.Module):\n", "\n", " def __init__(self, vs):\n", " '''\n", " 利用GPT-2进行自然语言的自回归学习\n", " 参数\n", " ----\n", " vs :int,字典大小\n", " '''\n", " super().__init__()\n", " # 文字嵌入层\n", " self.token_embedding = nn.Embedding(vs, emb_size)\n", " # 位置嵌入层\n", " self.position_embedding = nn.Embedding(sequence_len, emb_size)\n", " # 解码块\n", " blocks = [Block(emb_size, head_size) for _ in range(n_layer)]\n", " self.blocks = nn.Sequential(*blocks)\n", " self.ln = nn.LayerNorm(emb_size)\n", " # 语言建模头\n", " self.lm_head = nn.Linear(emb_size, vs)\n", "\n", " def forward(self, x):\n", " '''\n", " 向前传播\n", " 参数\n", " ----\n", " x :torch.LongTensor,当前字母在字典中的位置,形状为(B, T)\n", " 返回\n", " ----\n", " logits :torch.FloatTensor,预测结果的logits,形状为(B, T, vs)\n", " '''\n", " B, T = x.shape\n", " # 定义词元的位置,形状为(T)\n", " pos = torch.arange(0, T, dtype=torch.long, device=x.device)\n", " # 词元语义特征\n", " tok_emb = self.token_embedding(x) # (B, T, C)\n", " # 位置特征\n", " pos_emb = self.position_embedding(pos) # ( T, C)\n", " x = tok_emb + pos_emb # (B, T, C)\n", " x = self.blocks(x) # (B, T, C)\n", " x = self.ln(x) # (B, T, C)\n", " logits = self.lm_head(x) # (B, T, vs)\n", " return logits" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "8LkvZyGjG6Sx", "outputId": "05aa4349-4ba4-4516-ee94-a19a08fe65cf" }, "outputs": [ { "data": { "text/plain": [ "98" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_datasets = load_dataset('code_search_net', 'python')\n", "datasets = raw_datasets['train'].filter(lambda x: 'apache/spark' in x['repository_name'])\n", "\n", "class char_tokenizer:\n", "\n", " def __init__(self, data):\n", " # 数据中出现的所有字符构成字典\n", " chars = sorted(list(set(''.join(data))))\n", " # 预留一个位置给结尾的特殊字符\n", " self.char2ind = {s : i + 1 for i, s in enumerate(chars)}\n", " self.char2ind['<|e|>'] = 0\n", " self.ind2char = {i : s for s, i in self.char2ind.items()}\n", "\n", " def encode(self, text):\n", " return [self.char2ind[c] for c in text]\n", "\n", " def decode(self, enc):\n", " if isinstance(enc, int):\n", " return self.ind2char[enc]\n", " return [self.ind2char[i] for i in enc]\n", "\n", "tok = char_tokenizer(datasets['whole_func_string'])\n", "len(tok.char2ind)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "No_AQspwG6Sx", "outputId": "132ff3b8-1399-4982-c7a5-9400eab93eca" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2408290 parameters\n" ] }, { "data": { "text/plain": [ "CharGPT(\n", " (token_embedding): Embedding(98, 128)\n", " (position_embedding): Embedding(64, 128)\n", " (blocks): Sequential(\n", " (0): Block(\n", " (mha): MaskedMultiHeadAttention(\n", " (heads): ModuleList(\n", " (0-15): 16 x MaskedAttention(\n", " (key): Linear(in_features=128, out_features=8, bias=False)\n", " (query): Linear(in_features=128, out_features=8, bias=False)\n", " (value): Linear(in_features=128, out_features=8, bias=False)\n", " (dropout): Dropout(p=0.4, inplace=False)\n", " )\n", " )\n", " (proj): Linear(in_features=128, out_features=128, bias=True)\n", " (dropout): Dropout(p=0.4, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (l1): Linear(in_features=128, out_features=512, bias=True)\n", " (l2): Linear(in_features=512, out_features=128, bias=True)\n", " (dropout): Dropout(p=0.4, inplace=False)\n", " )\n", " (ln1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)\n", " (ln2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)\n", " )\n", " (1): Block(\n", " (mha): MaskedMultiHeadAttention(\n", " (heads): ModuleList(\n", " (0-15): 16 x MaskedAttention(\n", " (key): Linear(in_features=128, out_features=8, bias=False)\n", " (query): Linear(in_features=128, out_features=8, bias=False)\n", " (value): Linear(in_features=128, out_features=8, bias=False)\n", " (dropout): Dropout(p=0.4, inplace=False)\n", " )\n", " )\n", " (proj): Linear(in_features=128, out_features=128, bias=True)\n", " (dropout): Dropout(p=0.4, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (l1): Linear(in_features=128, out_features=512, bias=True)\n", " (l2): Linear(in_features=512, out_features=128, bias=True)\n", " (dropout): Dropout(p=0.4, inplace=False)\n", " )\n", " (ln1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)\n", " (ln2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)\n", " )\n", " (2): Block(\n", " (mha): MaskedMultiHeadAttention(\n", " (heads): ModuleList(\n", " (0-15): 16 x MaskedAttention(\n", " (key): Linear(in_features=128, out_features=8, bias=False)\n", " (query): Linear(in_features=128, out_features=8, bias=False)\n", " (value): Linear(in_features=128, out_features=8, bias=False)\n", " (dropout): Dropout(p=0.4, inplace=False)\n", " )\n", " )\n", " (proj): Linear(in_features=128, out_features=128, bias=True)\n", " (dropout): Dropout(p=0.4, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (l1): Linear(in_features=128, out_features=512, bias=True)\n", " (l2): Linear(in_features=512, out_features=128, bias=True)\n", " (dropout): Dropout(p=0.4, inplace=False)\n", " )\n", " (ln1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)\n", " (ln2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)\n", " )\n", " (3): Block(\n", " (mha): MaskedMultiHeadAttention(\n", " (heads): ModuleList(\n", " (0-15): 16 x MaskedAttention(\n", " (key): Linear(in_features=128, out_features=8, bias=False)\n", " (query): Linear(in_features=128, out_features=8, bias=False)\n", " (value): Linear(in_features=128, out_features=8, bias=False)\n", " (dropout): Dropout(p=0.4, inplace=False)\n", " )\n", " )\n", " (proj): Linear(in_features=128, out_features=128, bias=True)\n", " (dropout): Dropout(p=0.4, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (l1): Linear(in_features=128, out_features=512, bias=True)\n", " (l2): Linear(in_features=512, out_features=128, bias=True)\n", " (dropout): Dropout(p=0.4, inplace=False)\n", " )\n", " (ln1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)\n", " (ln2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)\n", " )\n", " (4): Block(\n", " (mha): MaskedMultiHeadAttention(\n", " (heads): ModuleList(\n", " (0-15): 16 x MaskedAttention(\n", " (key): Linear(in_features=128, out_features=8, bias=False)\n", " (query): Linear(in_features=128, out_features=8, bias=False)\n", " (value): Linear(in_features=128, out_features=8, bias=False)\n", " (dropout): Dropout(p=0.4, inplace=False)\n", " )\n", " )\n", " (proj): Linear(in_features=128, out_features=128, bias=True)\n", " (dropout): Dropout(p=0.4, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (l1): Linear(in_features=128, out_features=512, bias=True)\n", " (l2): Linear(in_features=512, out_features=128, bias=True)\n", " (dropout): Dropout(p=0.4, inplace=False)\n", " )\n", " (ln1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)\n", " (ln2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)\n", " )\n", " (5): Block(\n", " (mha): MaskedMultiHeadAttention(\n", " (heads): ModuleList(\n", " (0-15): 16 x MaskedAttention(\n", " (key): Linear(in_features=128, out_features=8, bias=False)\n", " (query): Linear(in_features=128, out_features=8, bias=False)\n", " (value): Linear(in_features=128, out_features=8, bias=False)\n", " (dropout): Dropout(p=0.4, inplace=False)\n", " )\n", " )\n", " (proj): Linear(in_features=128, out_features=128, bias=True)\n", " (dropout): Dropout(p=0.4, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (l1): Linear(in_features=128, out_features=512, bias=True)\n", " (l2): Linear(in_features=512, out_features=128, bias=True)\n", " (dropout): Dropout(p=0.4, inplace=False)\n", " )\n", " (ln1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)\n", " (ln2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)\n", " )\n", " (6): Block(\n", " (mha): MaskedMultiHeadAttention(\n", " (heads): ModuleList(\n", " (0-15): 16 x MaskedAttention(\n", " (key): Linear(in_features=128, out_features=8, bias=False)\n", " (query): Linear(in_features=128, out_features=8, bias=False)\n", " (value): Linear(in_features=128, out_features=8, bias=False)\n", " (dropout): Dropout(p=0.4, inplace=False)\n", " )\n", " )\n", " (proj): Linear(in_features=128, out_features=128, bias=True)\n", " (dropout): Dropout(p=0.4, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (l1): Linear(in_features=128, out_features=512, bias=True)\n", " (l2): Linear(in_features=512, out_features=128, bias=True)\n", " (dropout): Dropout(p=0.4, inplace=False)\n", " )\n", " (ln1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)\n", " (ln2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)\n", " )\n", " (7): Block(\n", " (mha): MaskedMultiHeadAttention(\n", " (heads): ModuleList(\n", " (0-15): 16 x MaskedAttention(\n", " (key): Linear(in_features=128, out_features=8, bias=False)\n", " (query): Linear(in_features=128, out_features=8, bias=False)\n", " (value): Linear(in_features=128, out_features=8, bias=False)\n", " (dropout): Dropout(p=0.4, inplace=False)\n", " )\n", " )\n", " (proj): Linear(in_features=128, out_features=128, bias=True)\n", " (dropout): Dropout(p=0.4, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (l1): Linear(in_features=128, out_features=512, bias=True)\n", " (l2): Linear(in_features=512, out_features=128, bias=True)\n", " (dropout): Dropout(p=0.4, inplace=False)\n", " )\n", " (ln1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)\n", " (ln2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)\n", " )\n", " (8): Block(\n", " (mha): MaskedMultiHeadAttention(\n", " (heads): ModuleList(\n", " (0-15): 16 x MaskedAttention(\n", " (key): Linear(in_features=128, out_features=8, bias=False)\n", " (query): Linear(in_features=128, out_features=8, bias=False)\n", " (value): Linear(in_features=128, out_features=8, bias=False)\n", " (dropout): Dropout(p=0.4, inplace=False)\n", " )\n", " )\n", " (proj): Linear(in_features=128, out_features=128, bias=True)\n", " (dropout): Dropout(p=0.4, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (l1): Linear(in_features=128, out_features=512, bias=True)\n", " (l2): Linear(in_features=512, out_features=128, bias=True)\n", " (dropout): Dropout(p=0.4, inplace=False)\n", " )\n", " (ln1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)\n", " (ln2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)\n", " )\n", " (9): Block(\n", " (mha): MaskedMultiHeadAttention(\n", " (heads): ModuleList(\n", " (0-15): 16 x MaskedAttention(\n", " (key): Linear(in_features=128, out_features=8, bias=False)\n", " (query): Linear(in_features=128, out_features=8, bias=False)\n", " (value): Linear(in_features=128, out_features=8, bias=False)\n", " (dropout): Dropout(p=0.4, inplace=False)\n", " )\n", " )\n", " (proj): Linear(in_features=128, out_features=128, bias=True)\n", " (dropout): Dropout(p=0.4, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (l1): Linear(in_features=128, out_features=512, bias=True)\n", " (l2): Linear(in_features=512, out_features=128, bias=True)\n", " (dropout): Dropout(p=0.4, inplace=False)\n", " )\n", " (ln1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)\n", " (ln2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)\n", " )\n", " (10): Block(\n", " (mha): MaskedMultiHeadAttention(\n", " (heads): ModuleList(\n", " (0-15): 16 x MaskedAttention(\n", " (key): Linear(in_features=128, out_features=8, bias=False)\n", " (query): Linear(in_features=128, out_features=8, bias=False)\n", " (value): Linear(in_features=128, out_features=8, bias=False)\n", " (dropout): Dropout(p=0.4, inplace=False)\n", " )\n", " )\n", " (proj): Linear(in_features=128, out_features=128, bias=True)\n", " (dropout): Dropout(p=0.4, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (l1): Linear(in_features=128, out_features=512, bias=True)\n", " (l2): Linear(in_features=512, out_features=128, bias=True)\n", " (dropout): Dropout(p=0.4, inplace=False)\n", " )\n", " (ln1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)\n", " (ln2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)\n", " )\n", " (11): Block(\n", " (mha): MaskedMultiHeadAttention(\n", " (heads): ModuleList(\n", " (0-15): 16 x MaskedAttention(\n", " (key): Linear(in_features=128, out_features=8, bias=False)\n", " (query): Linear(in_features=128, out_features=8, bias=False)\n", " (value): Linear(in_features=128, out_features=8, bias=False)\n", " (dropout): Dropout(p=0.4, inplace=False)\n", " )\n", " )\n", " (proj): Linear(in_features=128, out_features=128, bias=True)\n", " (dropout): Dropout(p=0.4, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (l1): Linear(in_features=128, out_features=512, bias=True)\n", " (l2): Linear(in_features=512, out_features=128, bias=True)\n", " (dropout): Dropout(p=0.4, inplace=False)\n", " )\n", " (ln1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)\n", " (ln2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)\n", " )\n", " )\n", " (ln): LayerNorm((128,), eps=1e-05, elementwise_affine=True)\n", " (lm_head): Linear(in_features=128, out_features=98, bias=True)\n", ")" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 展示模型结构\n", "model = CharGPT(len(tok.char2ind)).to(device)\n", "# 统计模型的参数个数\n", "print(f'{sum(p.numel() for p in model.parameters())} parameters')\n", "model" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "id": "VOl4s229G6Sy" }, "outputs": [], "source": [ "@torch.no_grad()\n", "def generate_batch(model, idx, max_new_tokens=300):\n", " '''\n", " 利用模型生成文本(反复使用模型进行预测)\n", " 参数\n", " ----\n", " model :CharGPT,生成文本的模型\n", " idx :torch.LongTensor,当前字母在字典中的位置,形状为(1, T)\n", " max_new_tokens :int,生成文本的最大长度\n", " 返回\n", " ----\n", " out :list[int],生成的文本\n", " '''\n", " # 将模型切换至评估模式\n", " model.eval()\n", " for _ in range(max_new_tokens):\n", " # 限制背景长度,否则会报错\n", " context = idx[:, -sequence_len:]\n", " # 在文本生成时,模型的计算效率很低,因为有很多重复计算\n", " logits = model(context)\n", " # 只使用最后一个预测结果\n", " logits = logits[:, -1, :]\n", " probs = F.softmax(logits, dim=-1)\n", " # 根据模型预测的概率,得到最终的预测结果(下一个字母)\n", " # 这一步运算有一定随机性\n", " ix = torch.multinomial(probs, num_samples=1)\n", " idx = torch.cat((idx, ix), dim=1)\n", " if ix.item() == 0:\n", " break\n", " # 将模型切换至训练模式\n", " model.train()\n", " return idx.tolist()[0]" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "HQizE-2mG6Sz", "outputId": "21033dbb-9a8c-4ea2-9692-fcc000227b4c" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "def* O(h/of(\"YP`soE f|dwöR:1'_v?Q9)Nsx/Q=CKf\\M:iKcaI%+Q3m\n" ] } ], "source": [ "# 使用模型来生成文本\n", "begin_text = torch.tensor(tok.encode('def'), device=device).unsqueeze(0)\n", "print(''.join(tok.decode(generate_batch(model, begin_text))))" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "q0Fw0TCyG6Sz", "outputId": "140dc25b-792d-45fa-a8c3-b975461237a7" }, "outputs": [ { "data": { "text/plain": [ "(torch.Size([605913, 64]), torch.Size([605913, 64]))" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def process(data, sequence_len=sequence_len):\n", " '''\n", " 根据文本生成训练数据\n", " '''\n", " # text是字符串列表\n", " text = data['whole_func_string']\n", " inputs, labels = [], []\n", " for i in text:\n", " enc = tok.encode(i)\n", " # 0对应着文本结束\n", " enc += [0]\n", " # 将文本转换为多个训练数据\n", " for i in range(len(enc) - sequence_len):\n", " inputs.append(enc[i: i + sequence_len])\n", " # 预测标签是下一个字母,因此只需要挪动一个位置即可\n", " labels.append(enc[i + 1: i + 1 + sequence_len])\n", " return {'inputs': inputs, 'labels': labels}\n", "\n", "# 将数据分为训练集和测试集\n", "tokenized = datasets.train_test_split(test_size=0.1, seed=1024, shuffle=True)\n", "# 将文本转换为训练数据,里面包含inputs和labels\n", "tokenized = tokenized.map(process, batched=True, remove_columns=datasets.column_names)\n", "tokenized.set_format(type='torch', device=device)\n", "\n", "tokenized['train']['inputs'].shape, tokenized['train']['labels'].shape" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "-x4i2q1iG6S0", "outputId": "23ec73d3-566d-4bfa-98c0-d3c05be82e70" }, "outputs": [ { "data": { "text/plain": [ "{'inputs': tensor([[ 2, 2, 2, ..., 2, 2, 4],\n", " [81, 80, 88, ..., 2, 2, 10],\n", " [ 4, 37, 84, ..., 2, 2, 2],\n", " ...,\n", " [75, 85, 2, ..., 70, 71, 84],\n", " [ 2, 2, 2, ..., 67, 78, 53],\n", " [87, 84, 67, ..., 89, 2, 38]], device='cuda:0'),\n", " 'labels': tensor([[ 2, 2, 32, ..., 2, 4, 4],\n", " [80, 88, 71, ..., 2, 10, 70],\n", " [37, 84, 71, ..., 2, 2, 2],\n", " ...,\n", " [85, 2, 72, ..., 71, 84, 75],\n", " [ 2, 2, 2, ..., 78, 53, 81],\n", " [84, 67, 86, ..., 2, 38, 53]], device='cuda:0')}" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 构建数据读取器\n", "train_loader = DataLoader(tokenized['train'], batch_size=batch_size, shuffle=True)\n", "test_loader = DataLoader(tokenized['test'], batch_size=batch_size, shuffle=True)\n", "# 获取一个批量的数据\n", "next(iter(test_loader))" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "QD37LTDbG6S0", "outputId": "b51ce4ce-7f4a-41dc-c6a2-f518afaa9ce1" }, "outputs": [ { "data": { "text/plain": [ "{'train': 4.730088233947754, 'test': 4.726046085357666}" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def estimate_loss(model):\n", " re = {}\n", " # 将模型切换至评估模式\n", " model.eval()\n", " re['train'] = _loss(model, train_loader)\n", " re['test'] = _loss(model, test_loader)\n", " # 将模型切换至训练模式\n", " model.train()\n", " return re\n", "\n", "@torch.no_grad()\n", "def _loss(model, data_loader):\n", " '''\n", " 计算模型在不同数据集下面的评估指标\n", " '''\n", " loss = []\n", " data_iter= iter(data_loader)\n", " # 随机使用多个批量数据来预估模型效果\n", " for k in range(eval_iters):\n", " data = next(data_iter, None)\n", " if data is None:\n", " data_iter = iter(data_loader)\n", " data = next(data_iter, None)\n", " inputs, labels = data['inputs'], data['labels']\n", " logits = model(inputs)\n", " # 根据cross_entropy的定义,需要对logits进行转置运算\n", " # 具体细节请参考cross_entropy的官方文档\n", " logits = logits.transpose(-2, -1)\n", " loss.append(F.cross_entropy(logits, labels).item())\n", " return torch.tensor(loss).mean().item()\n", "\n", "estimate_loss(model)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "id": "TgKhC5TmG6S0" }, "outputs": [], "source": [ "def train_gpt(model, optimizer, data_loader, epochs=10):\n", " lossi = []\n", " for epoch in range(epochs):\n", " for i, data in enumerate(data_loader, 0):\n", " inputs, labels = data['inputs'], data['labels']\n", " optimizer.zero_grad()\n", " logits = model(inputs)\n", " # 根据cross_entropy的定义,需要对logits进行转置运算\n", " # 具体细节请参考cross_entropy的官方文档\n", " logits = logits.transpose(-2, -1)\n", " loss = F.cross_entropy(logits, labels)\n", " lossi.append(loss.item())\n", " loss.backward()\n", " optimizer.step()\n", " # 评估模型,并输出结果\n", " stats = estimate_loss(model)\n", " train_loss = f'train loss {stats['train']:.4f}'\n", " test_loss = f'test loss {stats['test']:.4f}'\n", " print(f'epoch {epoch:>2}: {train_loss}, {test_loss}')\n", " return lossi" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "MCPIFH2dG6S1", "outputId": "68de3a92-b717-4147-ea07-bf456bae2190" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "epoch 0: train loss 0.9037, test loss 1.1066\n", "epoch 1: train loss 0.7246, test loss 1.0086\n", "epoch 2: train loss 0.6448, test loss 0.9719\n", "epoch 3: train loss 0.5838, test loss 0.9607\n", "epoch 4: train loss 0.5468, test loss 0.9672\n", "epoch 5: train loss 0.5156, test loss 0.9663\n", "epoch 6: train loss 0.4891, test loss 0.9596\n", "epoch 7: train loss 0.4687, test loss 0.9652\n", "epoch 8: train loss 0.4517, test loss 0.9709\n", "epoch 9: train loss 0.4347, test loss 0.9761\n" ] } ], "source": [ "l = train_gpt(model, optim.AdamW(model.parameters(), lr=learning_rate), train_loader)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 448 }, "id": "pgRJxHwOG6S1", "outputId": "8787891f-92aa-4eca-a09b-83b8e5fdd3c9" }, "outputs": [ { "data": { "text/plain": [ "[]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "plt.plot(torch.tensor(l).view(-1, 10).mean(1).numpy())" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "cPqZDUJ8I2AA", "outputId": "a6adc02a-a989-4bc5-ab8b-8840be56238d" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "def _initialModel._to_java_impl():\n", " \"\"\"\n", " Deprecated in 2.3.0. Use :func:`pyspark.sql.types.DataType`, int or :class:`Column` expression in the given key (default param).\n", "\n", " >>> df = spark.range(1, 0).alias('age')).collect()\n", " [Row(name=u'Alice', age=1, name=u'Alice')]\n", " \"\n" ] } ], "source": [ "# 使用模型来生成文本\n", "begin_text = torch.tensor(tok.encode('def '), device=device).unsqueeze(0)\n", "print(''.join(tok.decode(generate_batch(model, begin_text))))" ] } ], "metadata": { "accelerator": "GPU", "colab": { "gpuType": "V100", "provenance": [] }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 1 }