|
|
@@ -35,8 +35,9 @@
|
|
|
"\n",
|
|
|
"\n",
|
|
|
"torch.manual_seed(12046)\n",
|
|
|
- "# 计算设备为V100 16G\n",
|
|
|
- "# 如果使用CPU,需要非常长的时间,建议减少sequence_len、batch_size等参数来加快速度"
|
|
|
+ "# Device: V100 16G\n",
|
|
|
+ "# If using a CPU, it will take a considerable amount of time. \n",
|
|
|
+ "# Please consider reducing sequence_len, batch_size to speed up the process."
|
|
|
]
|
|
|
},
|
|
|
{
|
|
|
@@ -47,9 +48,9 @@
|
|
|
},
|
|
|
"outputs": [],
|
|
|
"source": [
|
|
|
- "# 一些超参数\n",
|
|
|
+ "# Some parameters\n",
|
|
|
"learning_rate = 1e-2\n",
|
|
|
- "# 模型支持的最大文本长度\n",
|
|
|
+ "# The max length of text that model supports\n",
|
|
|
"sequence_len = 1024\n",
|
|
|
"batch_size = 4\n",
|
|
|
"device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
|
|
|
@@ -108,7 +109,7 @@
|
|
|
"source": [
|
|
|
"tokenizer = AutoTokenizer.from_pretrained('gpt2')\n",
|
|
|
"model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)\n",
|
|
|
- "# 开源GPT-2模型的结构\n",
|
|
|
+ "# The structure of GPT-2\n",
|
|
|
"model"
|
|
|
]
|
|
|
},
|
|
|
@@ -172,21 +173,20 @@
|
|
|
"source": [
|
|
|
"def process(data):\n",
|
|
|
" '''\n",
|
|
|
- " 对文本进行分词\n",
|
|
|
+ " Tokenize the text\n",
|
|
|
" '''\n",
|
|
|
" ids = tokenizer.encode(data['text'])\n",
|
|
|
- " # 增加表示文本结束的特殊字符\n",
|
|
|
+ " # Add special token for the end of text\n",
|
|
|
" ids.append(tokenizer.eos_token_id)\n",
|
|
|
" out = {'ids': ids}\n",
|
|
|
" return out\n",
|
|
|
"\n",
|
|
|
"def concat_data(datasets):\n",
|
|
|
" '''\n",
|
|
|
- " 将文本分词,并将结果拼接成一个长长的字符串\n",
|
|
|
+ " Tokenize the text and then concatenate the results in a very long string.\n",
|
|
|
" '''\n",
|
|
|
" tokenized = datasets.map(process, remove_columns=datasets.column_names)\n",
|
|
|
" tokenized.set_format(type='torch', device=device)\n",
|
|
|
- " # 将所有文本拼接成一个长长的字符串\n",
|
|
|
" concat_text = torch.concatenate(tokenized['ids'])\n",
|
|
|
" return concat_text\n",
|
|
|
"\n",
|
|
|
@@ -230,12 +230,12 @@
|
|
|
"source": [
|
|
|
"def get_data(data, batch_size, sequence_len):\n",
|
|
|
" '''\n",
|
|
|
- " 生成训练数据\n",
|
|
|
+ " Generate train data\n",
|
|
|
" '''\n",
|
|
|
- " # 生成截取数据的起点,形状为(B),其中B等于batch_size,表示批量数据的大小\n",
|
|
|
+ " # Generate the start of data, shape (B), B means batch_size\n",
|
|
|
" ix = torch.randint(len(data) - sequence_len, (batch_size,))\n",
|
|
|
" x = torch.stack([data[i: i + sequence_len] for i in ix])\n",
|
|
|
- " # 预测标签是下一个字母,因此只需要挪动一个位置即可\n",
|
|
|
+ " # The predicted label is the next char, just postpone one position\n",
|
|
|
" y = torch.stack([data[i + 1: i + 1 + sequence_len] for i in ix])\n",
|
|
|
" return x, y\n",
|
|
|
"\n",
|
|
|
@@ -252,7 +252,7 @@
|
|
|
"source": [
|
|
|
"def print_trainable_parameters(model):\n",
|
|
|
" \"\"\"\n",
|
|
|
- " 输出模型中可供训练的参数个数\n",
|
|
|
+ " Print the number of trainable parameters\n",
|
|
|
" \"\"\"\n",
|
|
|
" trainable_params = 0\n",
|
|
|
" all_param = 0\n",
|
|
|
@@ -290,21 +290,21 @@
|
|
|
"from peft import LoraConfig, PeftModel\n",
|
|
|
"\n",
|
|
|
"def init_peft_model(model):\n",
|
|
|
- " # 设置LoRA参数\n",
|
|
|
+ " # Initialize parameters of LoRA\n",
|
|
|
" config = LoraConfig(\n",
|
|
|
" r=4,\n",
|
|
|
" lora_alpha=32,\n",
|
|
|
" target_modules=['c_attn'],\n",
|
|
|
" lora_dropout=0.1,\n",
|
|
|
- " # c_attn.weight的形状是(fan_in, fan_out),所以该参数设置为True\n",
|
|
|
- " # 但需注意的是,普通的线性模型权重参数的形状是(fan_out, fan_in)\n",
|
|
|
+ " # As the shape of c_attn.weight is (fan_in, fan_out), set this parameter to True\n",
|
|
|
+ " # Note: for linear model, the shape of weight is (fan_out, fan_in)\n",
|
|
|
" fan_in_fan_out=True,\n",
|
|
|
" bias='none')\n",
|
|
|
" return PeftModel(model, config, adapter_name='lora_alpaca')\n",
|
|
|
"\n",
|
|
|
"print_trainable_parameters(model)\n",
|
|
|
"model = init_peft_model(model)\n",
|
|
|
- "# 确保模型在训练状态\n",
|
|
|
+ "# Put the model on train mode\n",
|
|
|
"model.train()\n",
|
|
|
"print_trainable_parameters(model)"
|
|
|
]
|
|
|
@@ -375,7 +375,7 @@
|
|
|
}
|
|
|
],
|
|
|
"source": [
|
|
|
- "# 加入LoRA后,模型的结构\n",
|
|
|
+ "# Model structure after using LoRA\n",
|
|
|
"model"
|
|
|
]
|
|
|
},
|
|
|
@@ -406,33 +406,35 @@
|
|
|
"\n",
|
|
|
"def estimate_loss(model, ctx=nullcontext()):\n",
|
|
|
" '''\n",
|
|
|
- " 估计模型损失\n",
|
|
|
- " ctx参数是为禁用LoRA或者混合精度做准备,当ctx=nullcontext()时,没有任何作用\n",
|
|
|
+ " Estimate the performance of model.\n",
|
|
|
+ " Note: ctx is used for disabling LoRA or mixed precision.\n",
|
|
|
+ " When ctx=nullcontext(), it have no effect.\n",
|
|
|
" '''\n",
|
|
|
" re = {}\n",
|
|
|
- " # 将模型切换至评估模式\n",
|
|
|
+ " # Put the mode on evaluation mode\n",
|
|
|
" model.eval()\n",
|
|
|
" _train = lambda: get_data(train_set, batch_size, sequence_len)\n",
|
|
|
" re['train'] = _loss(model, _train, ctx)\n",
|
|
|
" _test = lambda: get_data(test_set, batch_size, sequence_len)\n",
|
|
|
" re['test'] = _loss(model, _test, ctx)\n",
|
|
|
- " # 将模型切换至训练模式\n",
|
|
|
+ " # Put the mode on train mode\n",
|
|
|
" model.train()\n",
|
|
|
" return re\n",
|
|
|
"\n",
|
|
|
"@torch.no_grad()\n",
|
|
|
"def _loss(model, data_loader, ctx):\n",
|
|
|
" \"\"\"\n",
|
|
|
- " 计算模型在不同数据集下面的评估指标\n",
|
|
|
+ " Measure the performance of model based on different data sets.\n",
|
|
|
" \"\"\"\n",
|
|
|
" loss = []\n",
|
|
|
- " # 随机使用多个批量数据来预估模型效果\n",
|
|
|
+ " # Use eval_iters batch data to measure the performance\n",
|
|
|
" for k in range(eval_iters):\n",
|
|
|
" inputs, labels = data_loader()\n",
|
|
|
" with ctx:\n",
|
|
|
" logits = model(inputs).logits\n",
|
|
|
- " # 根据cross_entropy的定义,需要对logits进行转置运算\n",
|
|
|
- " # 具体细节请参考cross_entropy的官方文档\n",
|
|
|
+ " # According to the definition of cross_entropy in PyTorch,\n",
|
|
|
+ " # we need to transpose the logits.\n",
|
|
|
+ " # More details can be found in official document.\n",
|
|
|
" logits = logits.transpose(-2, -1)\n",
|
|
|
" loss.append(F.cross_entropy(logits, labels).item())\n",
|
|
|
" return torch.tensor(loss).mean().item()\n",
|
|
|
@@ -454,8 +456,9 @@
|
|
|
" optimizer.zero_grad(set_to_none=True)\n",
|
|
|
" inputs, labels = data_loader()\n",
|
|
|
" logits = model(inputs).logits\n",
|
|
|
- " # 根据cross_entropy的定义,需要对logits进行转置运算\n",
|
|
|
- " # 具体细节请参考cross_entropy的官方文档\n",
|
|
|
+ " # According to the definition of cross_entropy in PyTorch,\n",
|
|
|
+ " # we need to transpose the logits.\n",
|
|
|
+ " # More details can be found in official document.\n",
|
|
|
" logits = logits.transpose(-2, -1)\n",
|
|
|
" loss = F.cross_entropy(logits, labels)\n",
|
|
|
" lossi.append(loss.item())\n",
|
|
|
@@ -498,7 +501,7 @@
|
|
|
}
|
|
|
],
|
|
|
"source": [
|
|
|
- "# 粗糙的训练方式可能会使模型效果急剧恶化\n",
|
|
|
+ "# Vanilla model training could get divergent results\n",
|
|
|
"data_loader = lambda: get_data(train_set, batch_size, sequence_len)\n",
|
|
|
"optimizer = optim.AdamW(model.parameters(), lr=learning_rate)\n",
|
|
|
"l = train_gpt(model, optimizer, data_loader, max_iters=500)"
|
|
|
@@ -549,8 +552,8 @@
|
|
|
},
|
|
|
"outputs": [],
|
|
|
"source": [
|
|
|
- "# 使用更精细的训练方式微调大语言模型\n",
|
|
|
- "# get_lr的实现参考自https://github.com/karpathy/nanoGPT/blob/master/train.py\n",
|
|
|
+ "# More elegant model training for LLM\n",
|
|
|
+ "# The code of get_lr is inspired by https://github.com/karpathy/nanoGPT/blob/master/train.py\n",
|
|
|
"import math\n",
|
|
|
"\n",
|
|
|
"learning_rate = 6e-4\n",
|
|
|
@@ -560,16 +563,16 @@
|
|
|
"\n",
|
|
|
"def get_lr(it):\n",
|
|
|
" '''\n",
|
|
|
- " 动态调整学习速率\n",
|
|
|
- " it表示训练次数\n",
|
|
|
+ " Adjust learning rate dynamically \n",
|
|
|
+ " it means the step of training\n",
|
|
|
" '''\n",
|
|
|
- " # 1、线性预热\n",
|
|
|
+ " # 1, Linear warmup\n",
|
|
|
" if it < warmup_iters:\n",
|
|
|
" return learning_rate * it / warmup_iters\n",
|
|
|
- " # 2、超出lr_decay_iters,则返回min_lr\n",
|
|
|
+ " # 2, If exceeding lr_decay_iters, return min_lr\n",
|
|
|
" if it > lr_decay_iters:\n",
|
|
|
" return min_lr\n",
|
|
|
- " # 3、逐步衰减学习速率\n",
|
|
|
+ " # 3, decay learning rate\n",
|
|
|
" decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)\n",
|
|
|
" assert 0 <= decay_ratio <= 1\n",
|
|
|
" coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))\n",
|
|
|
@@ -584,23 +587,22 @@
|
|
|
},
|
|
|
"outputs": [],
|
|
|
"source": [
|
|
|
- "# 梯度裁剪的超参数\n",
|
|
|
+ "# The parameter for gradient clipping\n",
|
|
|
"grad_clip = 1.0\n",
|
|
|
"\n",
|
|
|
"def train_gpt_optimum(model, optimizer, data_loader, max_iters=1000):\n",
|
|
|
" lossi = []\n",
|
|
|
" scaler = torch.cuda.amp.GradScaler(enabled=(device == 'cuda'))\n",
|
|
|
" for iter_num in range(max_iters):\n",
|
|
|
- " # 动态调整学习率\n",
|
|
|
+ " # Get learning rate\n",
|
|
|
" lr = get_lr(iter_num + 1)\n",
|
|
|
" for param_group in optimizer.param_groups:\n",
|
|
|
" param_group['lr'] = lr\n",
|
|
|
- " # 梯度累积\n",
|
|
|
+ " # Gradient accumulation\n",
|
|
|
" for i in range(gra_acc_steps):\n",
|
|
|
" inputs, labels = data_loader()\n",
|
|
|
- " # 混合精度训练\n",
|
|
|
- " ## 如果是用CPU进行计算,可能需要将dtype变成torch.bfloat16\n",
|
|
|
- " ## 当然如果使用CPU,需要非常长的时间\n",
|
|
|
+ " # Mixed precision\n",
|
|
|
+ " # If using a CPU, set dtype to torch.bfloat16\n",
|
|
|
" ctx = torch.autocast(device_type=device, dtype=torch.float16)\n",
|
|
|
" with ctx:\n",
|
|
|
" logits = model(inputs).logits\n",
|
|
|
@@ -609,7 +611,7 @@
|
|
|
" lossi.append(loss.item())\n",
|
|
|
" loss *= 1 / gra_acc_steps\n",
|
|
|
" scaler.scale(loss).backward()\n",
|
|
|
- " # 梯度裁剪\n",
|
|
|
+ " # Gradient clipping\n",
|
|
|
" scaler.unscale_(optimizer)\n",
|
|
|
" clip_grad_norm_(model.parameters(), grad_clip)\n",
|
|
|
" scaler.step(optimizer)\n",
|
|
|
@@ -617,7 +619,7 @@
|
|
|
" optimizer.zero_grad(set_to_none=True)\n",
|
|
|
"\n",
|
|
|
" if iter_num % eval_interval == 0:\n",
|
|
|
- " # 预估模型损失时,也使用混合精度\n",
|
|
|
+ " # Measure the performance (use mixed precision)\n",
|
|
|
" stats = estimate_loss(model, ctx)\n",
|
|
|
" train_loss = f'train loss {stats[\"train\"]:.4f}'\n",
|
|
|
" test_loss = f'test loss {stats[\"test\"]:.4f}'\n",
|
|
|
@@ -634,10 +636,10 @@
|
|
|
"outputs": [],
|
|
|
"source": [
|
|
|
- "# 调整批量数据的大小(如需要)\n",
|
|
|
+ "# Adjust batch size (if need)\n",
|
|
|
"batch_size = 4\n",
|
|
|
- "# 定义梯度累积的步数\n",
|
|
|
+ "# The step of gradient accumulation\n",
|
|
|
"gra_acc_steps = 8 * 2\n",
|
|
|
- "# 重新定义data loader\n",
|
|
|
+ "# Redefine data loader\n",
|
|
|
"data_loader = lambda: get_data(train_set, batch_size, sequence_len)"
|
|
|
]
|
|
|
},
|
|
|
@@ -680,13 +683,13 @@
|
|
|
}
|
|
|
],
|
|
|
"source": [
|
|
|
- "# 卸载LoRA适配器\n",
|
|
|
+ "# Unload LoRA adapter\n",
|
|
|
"model.unload()\n",
|
|
|
- "# 清空GPU缓存\n",
|
|
|
+ "# Empty GPU cache\n",
|
|
|
"torch.cuda.empty_cache()\n",
|
|
|
- "# 重新安装LoRA适配器\n",
|
|
|
+ "# Add new LoRA adapter\n",
|
|
|
"model = init_peft_model(model)\n",
|
|
|
- "# 设置最优化算法的参数\n",
|
|
|
+ "# Parameters for AdamW\n",
|
|
|
"weight_decay = 1e-1\n",
|
|
|
"beta1 = 0.9\n",
|
|
|
"beta2 = 0.95\n",
|
|
|
@@ -741,12 +744,12 @@
|
|
|
"outputs": [],
|
|
|
"source": [
|
|
|
"def make_inference(model, question):\n",
|
|
|
- " # 保存跟微调训练数据一致的格式\n",
|
|
|
+ " # Use the same template as train data\n",
|
|
|
" context = 'Below is an instruction that describes a task.' + \\\n",
|
|
|
" ' Write a response that appropriately completes the request.\\n\\n'\n",
|
|
|
" instruction = f'### Instruction:\\n{question}\\n\\n### Response:\\n'\n",
|
|
|
" token = tokenizer(f'{context}{instruction}', return_tensors='pt').to(device)\n",
|
|
|
- " # 生成文本时,需要将模型调整成评估模式\n",
|
|
|
+ " # Put the mode on evaluation mode when doing text generation\n",
|
|
|
" model.eval()\n",
|
|
|
" output_tokens = model.generate(**token, max_new_tokens=100, early_stopping=True)\n",
|
|
|
" model.train()\n",
|
|
|
@@ -845,7 +848,7 @@
|
|
|
}
|
|
|
],
|
|
|
"source": [
|
|
|
- "# 微调之前的模型效果(禁用LoRA相当于回到微调之前的模型状态)\n",
|
|
|
+ "# Disable LoRA adapter to show the result before SFT\n",
|
|
|
"with model.disable_adapter():\n",
|
|
|
" make_inference(model, 'Where is the capital of China?')"
|
|
|
]
|