|
|
@@ -786,7 +786,7 @@
|
|
|
}
|
|
|
],
|
|
|
"source": [
|
|
|
- "# 使用模型来生成文本\n",
|
|
|
+ "# Test text generation\n",
|
|
|
"begin_text = torch.tensor(tok.encode('def'), device=device).unsqueeze(0)\n",
|
|
|
"print(''.join(tok.decode(generate_batch(model, begin_text))))"
|
|
|
]
|
|
|
@@ -816,25 +816,25 @@
|
|
|
"source": [
|
|
|
"def process(data, sequence_len=sequence_len):\n",
|
|
|
" '''\n",
|
|
|
- " 根据文本生成训练数据\n",
|
|
|
+ " Transform text into training data.\n",
|
|
|
" '''\n",
|
|
|
- " # text是字符串列表\n",
|
|
|
+ " # text is list[str]\n",
|
|
|
" text = data['whole_func_string']\n",
|
|
|
" inputs, labels = [], []\n",
|
|
|
" for i in text:\n",
|
|
|
" enc = tok.encode(i)\n",
|
|
|
- " # 0对应着文本结束\n",
|
|
|
+ " # 0 means the end of text\n",
|
|
|
" enc += [0]\n",
|
|
|
- " # 将文本转换为多个训练数据\n",
|
|
|
+ " # Split one text into multiple training samples\n",
|
|
|
" for i in range(len(enc) - sequence_len):\n",
|
|
|
" inputs.append(enc[i: i + sequence_len])\n",
|
|
|
- " # 预测标签是下一个字母,因此只需要挪动一个位置即可\n",
|
|
|
+ " # The label is the next char, so it is sufficient to shift the window by one position.\n",
|
|
|
" labels.append(enc[i + 1: i + 1 + sequence_len])\n",
|
|
|
" return {'inputs': inputs, 'labels': labels}\n",
|
|
|
"\n",
|
|
|
- "# 将数据分为训练集和测试集\n",
|
|
|
+ "# Split data into train set and test set\n",
|
|
|
"tokenized = datasets.train_test_split(test_size=0.1, seed=1024, shuffle=True)\n",
|
|
|
- "# 将文本转换为训练数据,里面包含inputs和labels\n",
|
|
|
+ "# Convert the text into training data, which contains only inputs and labels.\n",
|
|
|
"tokenized = tokenized.map(process, batched=True, remove_columns=datasets.column_names)\n",
|
|
|
"tokenized.set_format(type='torch', device=device)\n",
|
|
|
"\n",
|
|
|
@@ -877,10 +877,10 @@
|
|
|
}
|
|
|
],
|
|
|
"source": [
|
|
|
- "# 构建数据读取器\n",
|
|
|
+ "# Build the data loaders\n",
|
|
|
"train_loader = DataLoader(tokenized['train'], batch_size=batch_size, shuffle=True)\n",
|
|
|
"test_loader = DataLoader(tokenized['test'], batch_size=batch_size, shuffle=True)\n",
|
|
|
- "# 获取一个批量的数据\n",
|
|
|
+ "# Get one batch of data\n",
|
|
|
"next(iter(test_loader))"
|
|
|
]
|
|
|
},
|
|
|
@@ -909,22 +909,22 @@
|
|
|
"source": [
|
|
|
"def estimate_loss(model):\n",
|
|
|
" re = {}\n",
|
|
|
- " # 将模型切换至评估模式\n",
|
|
|
+ " # Switch the model to evaluation mode\n",
|
|
|
" model.eval()\n",
|
|
|
" re['train'] = _loss(model, train_loader)\n",
|
|
|
" re['test'] = _loss(model, test_loader)\n",
|
|
|
- " # 将模型切换至训练模式\n",
|
|
|
+ " # Switch the model back to training mode\n",
|
|
|
" model.train()\n",
|
|
|
" return re\n",
|
|
|
"\n",
|
|
|
"@torch.no_grad()\n",
|
|
|
"def _loss(model, data_loader):\n",
|
|
|
" '''\n",
|
|
|
- " 计算模型在不同数据集下面的评估指标\n",
|
|
|
+ " Measure the performance of the model on the given data set.\n",
|
|
|
" '''\n",
|
|
|
" loss = []\n",
|
|
|
" data_iter= iter(data_loader)\n",
|
|
|
- " # 随机使用多个批量数据来预估模型效果\n",
|
|
|
+ " # Use eval_iters batches of data to estimate the performance\n",
|
|
|
" for k in range(eval_iters):\n",
|
|
|
" data = next(data_iter, None)\n",
|
|
|
" if data is None:\n",
|
|
|
@@ -932,8 +932,9 @@
|
|
|
" data = next(data_iter, None)\n",
|
|
|
" inputs, labels = data['inputs'], data['labels']\n",
|
|
|
" logits = model(inputs)\n",
|
|
|
- " # 根据cross_entropy的定义,需要对logits进行转置运算\n",
|
|
|
- " # 具体细节请参考cross_entropy的官方文档\n",
|
|
|
+ " # According to the definition of cross_entropy in PyTorch,\n",
|
|
|
+ " # we need to transpose the logits.\n",
|
|
|
+ " # More details can be found in the official documentation.\n",
|
|
|
" logits = logits.transpose(-2, -1)\n",
|
|
|
" loss.append(F.cross_entropy(logits, labels).item())\n",
|
|
|
" return torch.tensor(loss).mean().item()\n",
|
|
|
@@ -956,14 +957,15 @@
|
|
|
" inputs, labels = data['inputs'], data['labels']\n",
|
|
|
" optimizer.zero_grad()\n",
|
|
|
" logits = model(inputs)\n",
|
|
|
- " # 根据cross_entropy的定义,需要对logits进行转置运算\n",
|
|
|
- " # 具体细节请参考cross_entropy的官方文档\n",
|
|
|
+ " # According to the definition of cross_entropy in PyTorch,\n",
|
|
|
+ " # we need to transpose the logits.\n",
|
|
|
+ " # More details can be found in the official documentation.\n",
|
|
|
" logits = logits.transpose(-2, -1)\n",
|
|
|
" loss = F.cross_entropy(logits, labels)\n",
|
|
|
" lossi.append(loss.item())\n",
|
|
|
" loss.backward()\n",
|
|
|
" optimizer.step()\n",
|
|
|
- " # 评估模型,并输出结果\n",
|
|
|
+ " # Evaluate the performance\n",
|
|
|
" stats = estimate_loss(model)\n",
|
|
|
" train_loss = f'train loss {stats['train']:.4f}'\n",
|
|
|
" test_loss = f'test loss {stats['test']:.4f}'\n",
|
|
|
@@ -1066,7 +1068,7 @@
|
|
|
}
|
|
|
],
|
|
|
"source": [
|
|
|
- "# 使用模型来生成文本\n",
|
|
|
+ "# Text generation\n",
|
|
|
"begin_text = torch.tensor(tok.encode('def '), device=device).unsqueeze(0)\n",
|
|
|
"print(''.join(tok.decode(generate_batch(model, begin_text))))"
|
|
|
]
|