@@ -45,7 +45,7 @@
},
"outputs": [],
"source": [
- "# 一些超参数\n",
+ "# Some hyperparameters\n",
"learning_rate = 6e-4\n",
"sequence_len = 1024\n",
"batch_size = 8\n",
@@ -64,7 +64,7 @@
"outputs": [],
"source": [
"tokenizer = AutoTokenizer.from_pretrained('gpt2')\n",
- "# 没有语言建模头的嵌入模型\n",
+ "# The embedding model without the language modeling head\n",
"model = GPT2Model.from_pretrained('gpt2')"
]
},
@@ -101,11 +101,11 @@
" re = {}\n",
" for i in range(2):\n",
" key = 'tokens_%s' % i\n",
- " # prefix和completion两个字段已经经过了分词处理\n",
+ " # The 'prefix' and 'completion' fields are already tokenized\n",
" re['input_ids_%s' % i] = data[key]['prefix'] + data[key]['completion']\n",
- " # 记录文本的实际长度,用于后续的模型计算\n",
+ " # Record the actual length of the text for later use in the model\n",
" re['input_len_%s' % i] = len(re['input_ids_%s' % i])\n",
- " # 根据数据说明,定义标签变量\n",
+ " # Define the label according to the score\n",
" re['label'] = 0 if data['score_0'] > 0 else 1\n",
" return re\n",
"\n",
@@ -156,15 +156,15 @@
"\n",
"def token_collect(batch):\n",
" '''\n",
- " 由于文本的长度不一,对于同一批次的训练数据,需要进行数据填充,使得长度一致\n",
+ " As the texts differ in length, sequences in the same batch need to be padded to the same length.\n",
" '''\n",
" re = {}\n",
" for i in range(2):\n",
" ids = [data['input_ids_%s' % i] for data in batch]\n",
- " # 对于较短的数据,用0在末尾进行填充\n",
+ " # Pad shorter sequences with 0 at the end\n",
" re['input_ids_%s' % i] = pad_sequence(ids, batch_first=True)\n",
" re['input_len_%s' % i] = torch.stack([data['input_len_%s' % i] for data in batch])\n",
- " # 将标签变量也合并成一个张量\n",
+ " # Stack the labels into a single tensor\n",
" re['label'] = torch.stack([data['label'] for data in batch])\n",
" return re"
]
@@ -210,11 +210,11 @@
"source": [
"from torch.utils.data import DataLoader, random_split\n",
"\n",
- "# 划分训练集和测试集\n",
+ "# Split the data into training and test sets\n",
"train_set, test_set = random_split(dataset, [0.8, 0.2])\n",
"train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, collate_fn=token_collect)\n",
"test_loader = DataLoader(test_set, batch_size=3, shuffle=True, collate_fn=token_collect)\n",
- "# 训练数据示例\n",
+ "# An example batch of training data\n",
"next(iter(train_loader))"
]
},
@@ -271,38 +271,40 @@
"\n",
" def __init__(self, model):\n",
" '''\n",
- " 评分模型\n",
- " 参数\n",
+ " Reward model\n",
+ " \n",
+ " Args:\n",
" ----\n",
- " model :嵌入模型\n",
+ " model: Embedding model\n",
" '''\n",
" super().__init__()\n",
" self.embedding = model\n",
- " # 评分建模头\n",
+ " # Score modeling head\n",
" self.score = nn.Linear(model.embed_dim, 1, bias=False)\n",
"\n",
" def forward(self, x, seq_len):\n",
" '''\n",
- " 向前传播\n",
- " 参数\n",
+ " Forward pass\n",
+ " \n",
+ " Args:\n",
" ----\n",
- " x :torch.LongTensor,文本,形状为(B, T)\n",
- " seq_len :torch.LongTensor,文本的实际长度,形状为(B)\n",
+ " x: torch.LongTensor, text, shape (B, T)\n",
+ " seq_len: torch.LongTensor, the length of the text before padding, shape (B)\n",
- " 返回\n",
+ " Returns:\n",
" ----\n",
- " score :torch.FloatTensor,评分,形状为(B, 1)\n",
+ " score: torch.FloatTensor, scores, shape (B, 1)\n",
" '''\n",
" B, _ = x.shape\n",
- " # 文本的嵌入向量\n",
+ " # The embedding of the text\n",
" emb = self.embedding(x).last_hidden_state # (B, T, C)\n",
" ind = torch.arange(B, device=seq_len.device)\n",
- " # 获取最后一个词元的特征\n",
+ " # Get the feature of the last token\n",
" pooled_emb = emb[ind, seq_len - 1] # (B, C)\n",
" score = self.score(pooled_emb) # (B, 1)\n",
" return score\n",
"\n",
"r_model = RewardModel(model)\n",
- "# 展示模型结构\n",
+ "# Show the model structure\n",
"r_model"
]
},
@@ -316,7 +318,7 @@
"source": [
"def print_trainable_parameters(model):\n",
" \"\"\"\n",
- " 输出模型中可供训练的参数个数\n",
+ " Print the number of trainable parameters\n",
" \"\"\"\n",
" trainable_params = 0\n",
" all_param = 0\n",
@@ -357,15 +359,15 @@
" lora_alpha=8,\n",
" target_modules=['c_attn'],\n",
" lora_dropout=0.4,\n",
- " # c_attn.weight的形状是(fan_in, fan_out),所以该参数设置为True\n",
- " # 但需注意的是,普通的线性模型权重参数的形状是(fan_out, fan_in)\n",
+ " # As the shape of c_attn.weight is (fan_in, fan_out), set this parameter to True\n",
+ " # Note: for an ordinary linear layer, the weight's shape is (fan_out, fan_in)\n",
" fan_in_fan_out=True,\n",
" bias='none',\n",
- " # 评分模型中的score层(评分建模头)也参与模型微调\n",
+ " # The score layer (score modeling head) also participates in fine-tuning\n",
" modules_to_save=['score']\n",
" )\n",
"\n",
- "# 为评分模型添加LoRA适配器\n",
+ "# Add a LoRA adapter to the reward model\n",
"r_model = PeftModel(r_model, config, adapter_name='lora')\n",
"print_trainable_parameters(r_model)"
]
@@ -447,26 +449,29 @@
"\n",
" def __init__(self, model):\n",
" '''\n",
- " 借鉴逻辑回归的思路,进行偏好建模\n",
- " 参数\n",
+ " Build the preference model by borrowing the idea of logistic regression\n",
+ " \n",
+ " Args:\n",
" ----\n",
- " model :评分模型\n",
+ " model: Reward model\n",
" '''\n",
" super().__init__()\n",
" self.pref = model\n",
"\n",
" def forward(self, data):\n",
" '''\n",
- " 定义模型损失\n",
- " 参数\n",
+ " Define the model loss\n",
+ " \n",
+ " Args:\n",
" ----\n",
- " data :dict,训练数据\n",
- " 返回\n",
+ " data: dict, training data\n",
+ " \n",
+ " Returns:\n",
" ----\n",
- " out :torch.FloatTensor,logits,形状为(B, 2)\n",
- " loss :torch.FloatTensor,模型损失\n",
+ " out: torch.FloatTensor, the logits, shape (B, 2)\n",
+ " loss: torch.FloatTensor, the model loss\n",
" '''\n",
- " # input0的形状是(B, T),len0的形状是(B)\n",
+ " # The shape of input0 is (B, T), the shape of len0 is (B)\n",
" input0, len0 = data['input_ids_0'], data['input_len_0']\n",
" input1, len1 = data['input_ids_1'], data['input_len_1']\n",
" score0 = self.pref(input0, len0) # (B, 1)\n",
@@ -476,7 +481,7 @@
" return out, loss\n",
"\n",
"p_model = PreferenceModel(r_model).to(device)\n",
- "# 模型结构\n",
+ "# The model structure\n",
"p_model"
]
},
@@ -501,7 +506,7 @@
}
],
"source": [
- "# 利用示例数据验证模型是否搭建正确,并记录微调前的模型效果(方便与后续结果做对比)\n",
+ "# Use an example to verify the model is built correctly, and record its output before fine-tuning for later comparison\n",
"example = test_set[:1]\n",
"with torch.no_grad():\n",
" p_model.eval()\n",
@@ -537,28 +542,29 @@
"\n",
"def estimate_loss(model, ctx=nullcontext()):\n",
" '''\n",
- " 估计模型损失\n",
- " ctx参数是为禁用LoRA或者混合精度做准备,当ctx=nullcontext()时,没有任何作用\n",
+ " Estimate the model loss.\n",
+ " Note: ctx is used for disabling LoRA or for mixed precision training.\n",
+ " When ctx=nullcontext(), it has no effect.\n",
" '''\n",
" re = {}\n",
- " # 将模型切换至评估模式\n",
+ " # Switch the model to evaluation mode\n",
" model.eval()\n",
" re['train'] = _loss(model, train_loader, ctx)\n",
" re['test'] = _loss(model, test_loader, ctx)\n",
- " # 将模型切换至训练模式\n",
+ " # Switch the model back to training mode\n",
" model.train()\n",
" return re\n",
"\n",
"@torch.no_grad()\n",
"def _loss(model, data_loader, ctx):\n",
" \"\"\"\n",
- " 计算模型在不同数据集下面的评估指标\n",
+ " Compute the evaluation metric of the model on different data sets.\n",
" \"\"\"\n",
" lossi = []\n",
" data_iter= iter(data_loader)\n",
- " # 随机使用多个批量数据来预估模型效果\n",
+ " # Use eval_iters random batches to estimate the performance\n",
" for k in range(eval_iters):\n",
- " # 如果数据遍历完了,则重新生成一个data loader\n",
+ " # If the data is exhausted, create a new data loader\n",
" data = next(data_iter, None)\n",
" if data is None:\n",
" data_iter = iter(data_loader)\n",
@@ -579,7 +585,7 @@
},
"outputs": [],
"source": [
- "# get_lr的实现参考自https://github.com/karpathy/nanoGPT/blob/master/train.py\n",
+ "# The code of get_lr is inspired by https://github.com/karpathy/nanoGPT/blob/master/train.py\n",
"import math\n",
"\n",
"warmup_iters = 100\n",
@@ -588,16 +594,16 @@
"\n",
"def get_lr(it):\n",
" '''\n",
- " 动态调整学习速率\n",
- " it表示训练次数\n",
+ " Adjust the learning rate dynamically\n",
+ " it is the current training iteration\n",
" '''\n",
- " # 1、线性预热\n",
+ " # 1. Linear warmup\n",
" if it < warmup_iters:\n",
" return learning_rate * it / warmup_iters\n",
- " # 2、超出lr_decay_iters,则返回min_lr\n",
+ " # 2. If it exceeds lr_decay_iters, return min_lr\n",
" if it > lr_decay_iters:\n",
" return min_lr\n",
- " # 3、逐步衰减学习速率\n",
+ " # 3. Gradually decay the learning rate\n",
" decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)\n",
" assert 0 <= decay_ratio <= 1\n",
" coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))\n",
@@ -612,7 +618,7 @@
},
"outputs": [],
"source": [
- "# 梯度裁剪的超参数\n",
+ "# The hyperparameter for gradient clipping\n",
"grad_clip = 1.0\n",
"\n",
"def train_reward_optimum(model, optimizer, data_loader, max_iters=1000):\n",
@@ -621,25 +627,25 @@
" data_iter = iter(data_loader)\n",
"\n",
" for iter_num in range(max_iters):\n",
- " # 动态调整学习率\n",
+ " # Adjust the learning rate dynamically\n",
" lr = get_lr(iter_num + 1)\n",
" for param_group in optimizer.param_groups:\n",
" param_group['lr'] = lr\n",
+ " # Gradient accumulation\n",
" for i in range(gra_acc_steps):\n",
" data = next(data_iter, None)\n",
" if data is None:\n",
" data_iter = iter(data_loader)\n",
" data = next(data_iter, None)\n",
- " # 混合进度训练\n",
- " ## 如果是用CPU进行计算,可能需要将dtype变成torch.bfloat16\n",
- " ## 当然如果使用CPU,需要非常长的时间\n",
+ " # Mixed precision training\n",
+ " # If using a CPU, you may need to change dtype to torch.bfloat16 (and training will be very slow)\n",
" ctx = torch.autocast(device_type=device, dtype=torch.float16)\n",
" with ctx:\n",
" _, loss = model(data)\n",
" lossi.append(loss.item())\n",
" loss *= 1 / gra_acc_steps\n",
" scaler.scale(loss).backward()\n",
- " # 梯度裁剪\n",
+ " # Gradient clipping\n",
" scaler.unscale_(optimizer)\n",
" clip_grad_norm_(model.parameters(), grad_clip)\n",
" scaler.step(optimizer)\n",
@@ -647,7 +653,7 @@
" optimizer.zero_grad(set_to_none=True)\n",
"\n",
" if iter_num % eval_interval == 0:\n",
- " # 预估模型损失时,也使用混合精度\n",
+ " # Also use mixed precision when estimating the model loss\n",
" stats = estimate_loss(model, ctx)\n",
" train_loss = f'train loss {stats[\"train\"]:.4f}'\n",
" eval_loss = f'test loss {stats[\"test\"]:.4f}'\n",
@@ -694,7 +700,7 @@
}
],
"source": [
- "# 设置最优化算法的参数\n",
+ "# Parameters for the AdamW optimizer\n",
"weight_decay = 1e-1\n",
"beta1 = 0.9\n",
"beta2 = 0.95\n",
@@ -761,7 +767,7 @@
}
],
"source": [
- "# 经过模型微调之后,评分模型的效果有所提升\n",
+ "# After fine-tuning, the performance of the reward model has improved\n",
"with torch.no_grad():\n",
" p_model.eval()\n",
" print(p_model(example), example['label'])\n",
|