|
|
@@ -35,8 +35,9 @@
|
|
|
"\n",
|
|
|
"\n",
|
|
|
"torch.manual_seed(12046)\n",
|
|
|
- "# 计算设备为V100 16G\n",
|
|
|
- "# 如果使用CPU,需要非常长的时间,建议减少sequence_len、batch_size等参数来加快速度"
|
|
|
+ "# Device: V100 16G\n",
|
|
|
+ "# If using a CPU, it will take a considerable amount of time. \n",
|
|
|
+ "# Please consider reducing sequence_len, batch_size to speed up the process."
|
|
|
]
|
|
|
},
|
|
|
{
|
|
|
@@ -47,9 +48,9 @@
|
|
|
},
|
|
|
"outputs": [],
|
|
|
"source": [
|
|
|
- "# 一些超参数\n",
|
|
|
+ "# Some parameters\n",
|
|
|
"learning_rate = 1e-2\n",
|
|
|
- "# 模型支持的最大文本长度\n",
|
|
|
+ "# The max length of text that model supports\n",
|
|
|
"sequence_len = 1024\n",
|
|
|
"batch_size = 4\n",
|
|
|
"device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
|
|
|
@@ -108,7 +109,7 @@
|
|
|
"source": [
|
|
|
"tokenizer = AutoTokenizer.from_pretrained('gpt2')\n",
|
|
|
"model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)\n",
|
|
|
- "# 开源GPT-2模型的结构\n",
|
|
|
+ "# The structure of GPT-2\n",
|
|
|
"model"
|
|
|
]
|
|
|
},
|
|
|
@@ -172,21 +173,20 @@
|
|
|
"source": [
|
|
|
"def process(data):\n",
|
|
|
" '''\n",
|
|
|
- " 对文本进行分词\n",
|
|
|
+ " Tokenize the text\n",
|
|
|
" '''\n",
|
|
|
" ids = tokenizer.encode(data['text'])\n",
|
|
|
- " # 增加表示文本结束的特殊字符\n",
|
|
|
+ " # Add special token for the end of text\n",
|
|
|
" ids.append(tokenizer.eos_token_id)\n",
|
|
|
" out = {'ids': ids}\n",
|
|
|
" return out\n",
|
|
|
"\n",
|
|
|
"def concat_data(datasets):\n",
|
|
|
" '''\n",
|
|
|
- " 将文本分词,并将结果拼接成一个长长的字符串\n",
|
|
|
+ " Tokenize the text and then concatenate the results in a very long string.\n",
|
|
|
" '''\n",
|
|
|
" tokenized = datasets.map(process, remove_columns=datasets.column_names)\n",
|
|
|
" tokenized.set_format(type='torch', device=device)\n",
|
|
|
- " # 将所有文本拼接成一个长长的字符串\n",
|
|
|
" concat_text = torch.concatenate(tokenized['ids'])\n",
|
|
|
" return concat_text\n",
|
|
|
"\n",
|
|
|
@@ -230,12 +230,12 @@
|
|
|
"source": [
|
|
|
"def get_data(data, batch_size, sequence_len):\n",
|
|
|
" '''\n",
|
|
|
- " 生成训练数据\n",
|
|
|
+ " Generate train data\n",
|
|
|
" '''\n",
|
|
|
- " # 生成截取数据的起点,形状为(B),其中B等于batch_size,表示批量数据的大小\n",
|
|
|
+ " # Generate the start of data, shape (B), B means batch_size\n",
|
|
|
" ix = torch.randint(len(data) - sequence_len, (batch_size,))\n",
|
|
|
" x = torch.stack([data[i: i + sequence_len] for i in ix])\n",
|
|
|
- " # 预测标签是下一个字母,因此只需要挪动一个位置即可\n",
|
|
|
+ " # The predicted label is the next char, just postpone one position\n",
|
|
|
" y = torch.stack([data[i + 1: i + 1 + sequence_len] for i in ix])\n",
|
|
|
" return x, y\n",
|
|
|
"\n",
|
|
|
@@ -252,7 +252,7 @@
|
|
|
"source": [
|
|
|
"def print_trainable_parameters(model):\n",
|
|
|
" \"\"\"\n",
|
|
|
- " 输出模型中可供训练的参数个数\n",
|
|
|
+ " Print the number of trainable parameters\n",
|
|
|
" \"\"\"\n",
|
|
|
" trainable_params = 0\n",
|
|
|
" all_param = 0\n",
|
|
|
@@ -290,21 +290,21 @@
|
|
|
"from peft import LoraConfig, PeftModel\n",
|
|
|
"\n",
|
|
|
"def init_peft_model(model):\n",
|
|
|
- " # 设置LoRA参数\n",
|
|
|
+ " # Initialize parameters of LoRA\n",
|
|
|
" config = LoraConfig(\n",
|
|
|
" r=4,\n",
|
|
|
" lora_alpha=32,\n",
|
|
|
" target_modules=['c_attn'],\n",
|
|
|
" lora_dropout=0.1,\n",
|
|
|
- " # c_attn.weight的形状是(fan_in, fan_out),所以该参数设置为True\n",
|
|
|
- " # 但需注意的是,普通的线性模型权重参数的形状是(fan_out, fan_in)\n",
|
|
|
+ " # As the shape of c_attn.weight is (fan_in, fan_out), set this parameter to True\n",
|
|
|
+ " # Note: for linear model, the shape of weight is (fan_out, fan_in)\n",
|
|
|
" fan_in_fan_out=True,\n",
|
|
|
" bias='none')\n",
|
|
|
" return PeftModel(model, config, adapter_name='lora_alpaca')\n",
|
|
|
"\n",
|
|
|
"print_trainable_parameters(model)\n",
|
|
|
"model = init_peft_model(model)\n",
|
|
|
- "# 确保模型在训练状态\n",
|
|
|
+ "# Put the model on train mode\n",
|
|
|
"model.train()\n",
|
|
|
"print_trainable_parameters(model)"
|
|
|
]
|
|
|
@@ -375,7 +375,7 @@
|
|
|
}
|
|
|
],
|
|
|
"source": [
|
|
|
- "# 加入LoRA后,模型的结构\n",
|
|
|
+ "# Model structure after using LoRA\n",
|
|
|
"model"
|
|
|
]
|
|
|
},
|
|
|
@@ -406,33 +406,35 @@
|
|
|
"\n",
|
|
|
"def estimate_loss(model, ctx=nullcontext()):\n",
|
|
|
" '''\n",
|
|
|
- " 估计模型损失\n",
|
|
|
- " ctx参数是为禁用LoRA或者混合精度做准备,当ctx=nullcontext()时,没有任何作用\n",
|
|
|
+ " Estimate the performance of model.\n",
|
|
|
+ " Note: ctx is used for disabling LoRA or mixed precision.\n",
|
|
|
+ " When ctx=nullcontext(), it have no effect.\n",
|
|
|
" '''\n",
|
|
|
" re = {}\n",
|
|
|
- " # 将模型切换至评估模式\n",
|
|
|
+ " # Put the mode on evaluation mode\n",
|
|
|
" model.eval()\n",
|
|
|
" _train = lambda: get_data(train_set, batch_size, sequence_len)\n",
|
|
|
" re['train'] = _loss(model, _train, ctx)\n",
|
|
|
" _test = lambda: get_data(test_set, batch_size, sequence_len)\n",
|
|
|
" re['test'] = _loss(model, _test, ctx)\n",
|
|
|
- " # 将模型切换至训练模式\n",
|
|
|
+ " # Put the mode on train mode\n",
|
|
|
" model.train()\n",
|
|
|
" return re\n",
|
|
|
"\n",
|
|
|
"@torch.no_grad()\n",
|
|
|
"def _loss(model, data_loader, ctx):\n",
|
|
|
" \"\"\"\n",
|
|
|
- " 计算模型在不同数据集下面的评估指标\n",
|
|
|
+ " Measure the performance of model based on different data sets.\n",
|
|
|
" \"\"\"\n",
|
|
|
" loss = []\n",
|
|
|
- " # 随机使用多个批量数据来预估模型效果\n",
|
|
|
+ " # Use eval_iters batch data to measure the performance\n",
|
|
|
" for k in range(eval_iters):\n",
|
|
|
" inputs, labels = data_loader()\n",
|
|
|
" with ctx:\n",
|
|
|
" logits = model(inputs).logits\n",
|
|
|
- " # 根据cross_entropy的定义,需要对logits进行转置运算\n",
|
|
|
- " # 具体细节请参考cross_entropy的官方文档\n",
|
|
|
+ " # According to the definition of cross_entropy in PyTorch,\n",
|
|
|
+ " # we need to transpose the logits.\n",
|
|
|
+ " # More details can be found in official document.\n",
|
|
|
" logits = logits.transpose(-2, -1)\n",
|
|
|
" loss.append(F.cross_entropy(logits, labels).item())\n",
|
|
|
" return torch.tensor(loss).mean().item()\n",
|
|
|
@@ -454,8 +456,9 @@
|
|
|
" optimizer.zero_grad(set_to_none=True)\n",
|
|
|
" inputs, labels = data_loader()\n",
|
|
|
" logits = model(inputs).logits\n",
|
|
|
- " # 根据cross_entropy的定义,需要对logits进行转置运算\n",
|
|
|
- " # 具体细节请参考cross_entropy的官方文档\n",
|
|
|
+ " # According to the definition of cross_entropy in PyTorch,\n",
|
|
|
+ " # we need to transpose the logits.\n",
|
|
|
+ " # More details can be found in official document.\n",
|
|
|
" logits = logits.transpose(-2, -1)\n",
|
|
|
" loss = F.cross_entropy(logits, labels)\n",
|
|
|
" lossi.append(loss.item())\n",
|
|
|
@@ -498,7 +501,7 @@
|
|
|
}
|
|
|
],
|
|
|
"source": [
|
|
|
- "# 粗糙的训练方式可能会使模型效果急剧恶化\n",
|
|
|
+ "# Vanilla model training could get divergent results\n",
|
|
|
"data_loader = lambda: get_data(train_set, batch_size, sequence_len)\n",
|
|
|
"optimizer = optim.AdamW(model.parameters(), lr=learning_rate)\n",
|
|
|
"l = train_gpt(model, optimizer, data_loader, max_iters=500)"
|
|
|
@@ -549,8 +552,8 @@
|
|
|
},
|
|
|
"outputs": [],
|
|
|
"source": [
|
|
|
- "# 使用更精细的训练方式微调大语言模型\n",
|
|
|
- "# get_lr的实现参考自https://github.com/karpathy/nanoGPT/blob/master/train.py\n",
|
|
|
+ "# More elegant model training for LLM\n",
|
|
|
+ "# The code of get_lr is inspired by https://github.com/karpathy/nanoGPT/blob/master/train.py\n",
|
|
|
"import math\n",
|
|
|
"\n",
|
|
|
"learning_rate = 6e-4\n",
|
|
|
@@ -560,16 +563,16 @@
|
|
|
"\n",
|
|
|
"def get_lr(it):\n",
|
|
|
" '''\n",
|
|
|
- " 动态调整学习速率\n",
|
|
|
- " it表示训练次数\n",
|
|
|
+ " Adjust learning rate dynamically \n",
|
|
|
+ " it means the step of training\n",
|
|
|
" '''\n",
|
|
|
- " # 1、线性预热\n",
|
|
|
+ " # 1, Linear warmup\n",
|
|
|
" if it < warmup_iters:\n",
|
|
|
" return learning_rate * it / warmup_iters\n",
|
|
|
- " # 2、超出lr_decay_iters,则返回min_lr\n",
|
|
|
+ " # 2, If exceeding lr_decay_iters, return min_lr\n",
|
|
|
" if it > lr_decay_iters:\n",
|
|
|
" return min_lr\n",
|
|
|
- " # 3、逐步衰减学习速率\n",
|
|
|
+ " # 3, decay learning rate\n",
|
|
|
" decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)\n",
|
|
|
" assert 0 <= decay_ratio <= 1\n",
|
|
|
" coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))\n",
|
|
|
@@ -584,23 +587,22 @@
|
|
|
},
|
|
|
"outputs": [],
|
|
|
"source": [
|
|
|
- "# 梯度裁剪的超参数\n",
|
|
|
+ "# The parameter for gradient clipping\n",
|
|
|
"grad_clip = 1.0\n",
|
|
|
"\n",
|
|
|
"def train_gpt_optimum(model, optimizer, data_loader, max_iters=1000):\n",
|
|
|
" lossi = []\n",
|
|
|
" scaler = torch.cuda.amp.GradScaler(enabled=(device == 'cuda'))\n",
|
|
|
" for iter_num in range(max_iters):\n",
|
|
|
- " # 动态调整学习率\n",
|
|
|
+ " # Get learning rate\n",
|
|
|
" lr = get_lr(iter_num + 1)\n",
|
|
|
" for param_group in optimizer.param_groups:\n",
|
|
|
" param_group['lr'] = lr\n",
|
|
|
- " # 梯度累积\n",
|
|
|
+ " # Gradient accumulation\n",
|
|
|
" for i in range(gra_acc_steps):\n",
|
|
|
" inputs, labels = data_loader()\n",
|
|
|
- " # 混合精度训练\n",
|
|
|
- " ## 如果是用CPU进行计算,可能需要将dtype变成torch.bfloat16\n",
|
|
|
- " ## 当然如果使用CPU,需要非常长的时间\n",
|
|
|
+ " # Mixed precision\n",
|
|
|
+ " # If using a CPU, set dtype to torch.bfloat16\n",
|
|
|
" ctx = torch.autocast(device_type=device, dtype=torch.float16)\n",
|
|
|
" with ctx:\n",
|
|
|
" logits = model(inputs).logits\n",
|
|
|
@@ -609,7 +611,7 @@
|
|
|
" lossi.append(loss.item())\n",
|
|
|
" loss *= 1 / gra_acc_steps\n",
|
|
|
" scaler.scale(loss).backward()\n",
|
|
|
- " # 梯度裁剪\n",
|
|
|
+ " # Gradient clipping\n",
|
|
|
" scaler.unscale_(optimizer)\n",
|
|
|
" clip_grad_norm_(model.parameters(), grad_clip)\n",
|
|
|
" scaler.step(optimizer)\n",
|
|
|
@@ -617,7 +619,7 @@
|
|
|
" optimizer.zero_grad(set_to_none=True)\n",
|
|
|
"\n",
|
|
|
" if iter_num % eval_interval == 0:\n",
|
|
|
- " # 预估模型损失时,也使用混合精度\n",
|
|
|
+ " # Measure the performance (use mixed precision)\n",
|
|
|
" stats = estimate_loss(model, ctx)\n",
|
|
|
" train_loss = f'train loss {stats[\"train\"]:.4f}'\n",
|
|
|
" test_loss = f'test loss {stats[\"test\"]:.4f}'\n",
|
|
|
@@ -634,10 +636,10 @@
|
|
|
"outputs": [],
|
|
|
"source": [
|
|
|
- "# 调整批量数据的大小(如需要)\n",
|
|
|
+ "# Adjust batch size (if need)\n",
|
|
|
"batch_size = 4\n",
|
|
|
- "# 定义梯度累积的步数\n",
|
|
|
+ "# The step of gradient accumulation\n",
|
|
|
"gra_acc_steps = 8 * 2\n",
|
|
|
- "# 重新定义data loader\n",
|
|
|
+ "# Redefine data loader\n",
|
|
|
"data_loader = lambda: get_data(train_set, batch_size, sequence_len)"
|
|
|
]
|
|
|
},
|
|
|
@@ -680,13 +683,13 @@
|
|
|
}
|
|
|
],
|
|
|
"source": [
|
|
|
- "# 卸载LoRA适配器\n",
|
|
|
+ "# Unload LoRA adapter\n",
|
|
|
"model.unload()\n",
|
|
|
- "# 清空GPU缓存\n",
|
|
|
+ "# Empty GPU cache\n",
|
|
|
"torch.cuda.empty_cache()\n",
|
|
|
- "# 重新安装LoRA适配器\n",
|
|
|
+ "# Add new LoRA adapter\n",
|
|
|
"model = init_peft_model(model)\n",
|
|
|
- "# 设置最优化算法的参数\n",
|
|
|
+ "# Parameters for AdamW\n",
|
|
|
"weight_decay = 1e-1\n",
|
|
|
"beta1 = 0.9\n",
|
|
|
"beta2 = 0.95\n",
|
|
|
@@ -741,12 +744,12 @@
|
|
|
"outputs": [],
|
|
|
"source": [
|
|
|
"def make_inference(model, question):\n",
|
|
|
- " # 保存跟微调训练数据一致的格式\n",
|
|
|
+ " # Use the same template as train data\n",
|
|
|
" context = 'Below is an instruction that describes a task.' + \\\n",
|
|
|
" ' Write a response that appropriately completes the request.\\n\\n'\n",
|
|
|
" instruction = f'### Instruction:\\n{question}\\n\\n### Response:\\n'\n",
|
|
|
" token = tokenizer(f'{context}{instruction}', return_tensors='pt').to(device)\n",
|
|
|
- " # 生成文本时,需要将模型调整成评估模式\n",
|
|
|
+ " # Put the mode on evaluation mode when doing text generation\n",
|
|
|
" model.eval()\n",
|
|
|
" output_tokens = model.generate(**token, max_new_tokens=100, early_stopping=True)\n",
|
|
|
" model.train()\n",
|
|
|
@@ -845,7 +848,7 @@
|
|
|
}
|
|
|
],
|
|
|
"source": [
|
|
|
- "# 微调之前的模型效果(禁用LoRA相当于回到微调之前的模型状态)\n",
|
|
|
+ "# Disable LoRA adapter to show the result before SFT\n",
|
|
|
"with model.disable_adapter():\n",
|
|
|
" make_inference(model, 'Where is the capital of China?')"
|
|
|
]
|