Przeglądaj źródła

update comment for ch11

Gen TANG 2 lat temu
rodzic
commit
f5c5896bd6

+ 9 - 0
ch11_llm/README.md

@@ -0,0 +1,9 @@
+
+|代码|说明|
+|---|---|
+|[char_gpt.ipynb](char_gpt.ipynb)| 从零开始实现GPT-2,并使用模型进行自然语言的自回归学习(根据背景文本预测下一个字母是什么) |
+|[gpt2.ipynb](gpt2.ipynb)| 使用开源的GPT-2模型 |
+|[lora_tutorial.ipynb](lora_tutorial.ipynb)| 实现简单版本的LoRA以及开源工具中LoRA的使用示例 |
+|[gpt2_lora.ipynb](gpt2_lora.ipynb)| 使用LoRA对GPT-2进行监督微调(微调方式并不是最优的) |
+|[gpt2\_lora_optimum.ipynb](gpt2_lora_optimum.ipynb)| 使用LoRA对GPT-2进行更优雅的监督微调 |
+|[gpt2\_reward_modeling.ipynb](gpt2_reward_modeling.ipynb)| 使用LoRA对GPT-2进行评分建模 |

Plik diff jest za duży
+ 973 - 851
ch11_llm/char_gpt.ipynb


+ 12 - 8
ch11_llm/gpt2.ipynb

@@ -6,6 +6,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "# 安装第三方库\n",
     "!pip install transformers"
    ]
   },
@@ -21,8 +22,8 @@
     "\n",
     "torch.manual_seed(12046)\n",
     "\n",
-    "tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n",
-    "model = GPT2LMHeadModel.from_pretrained(\"gpt2\")"
+    "tokenizer = AutoTokenizer.from_pretrained('gpt2')\n",
+    "model = GPT2LMHeadModel.from_pretrained('gpt2')"
    ]
   },
   {
@@ -42,8 +43,9 @@
     }
    ],
    "source": [
+    "# 使用分词器对文本进行分词\n",
     "question = 'What is the capital of China?'\n",
-    "ids = tokenizer(question, return_tensors=\"pt\")\n",
+    "ids = tokenizer(question, return_tensors='pt')\n",
     "ids"
    ]
   },
@@ -81,8 +83,7 @@
     }
    ],
    "source": [
-    "# 由于GPT-2的模型效果较差,我们通过增大num_beams和no_repeat_ngram_size\n",
-    "# 来优化生成的文本。\n",
+    "# 由于GPT-2的模型效果较差,通过增大num_beams和no_repeat_ngram_size来优化生成的文本。\n",
     "res = model.generate(**ids, max_length=100, early_stopping=True,\n",
     "                     num_beams=3, no_repeat_ngram_size=2)\n",
     "print(res[0])\n",
@@ -95,6 +96,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "# 问答示例模版\n",
     "template = '''\n",
     "Q: What is the capital of the United Kingdom?\n",
     "A: London.\n",
@@ -139,7 +141,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "ids2 = tokenizer(template % question , return_tensors=\"pt\")"
+    "ids2 = tokenizer(template % question , return_tensors='pt')"
    ]
   },
   {
@@ -172,6 +174,7 @@
     }
    ],
    "source": [
+    "# 通过问答示例来获得想要的结果\n",
     "res2 = model.generate(**ids2, max_length=100, early_stopping=True,\n",
     "                      num_beams=3, no_repeat_ngram_size=2)\n",
     "print(tokenizer.decode(res2[0], skip_special_tokens=True))"
@@ -202,7 +205,7 @@
    "source": [
     "# 中文的效果较差\n",
     "question_zh = '中国的首都在哪里?'\n",
-    "ids_zh = tokenizer(question_zh, return_tensors=\"pt\")\n",
+    "ids_zh = tokenizer(question_zh, return_tensors='pt')\n",
     "res_zh = model.generate(**ids_zh, max_length=100, early_stopping=True,\n",
     "                        num_beams=3, no_repeat_ngram_size=2)\n",
     "print(tokenizer.decode(res_zh[0], skip_special_tokens=True))"
@@ -238,7 +241,8 @@
     }
    ],
    "source": [
-    "ids_zh2 = tokenizer(template % question_zh , return_tensors=\"pt\")\n",
+    "# 即使使用问答示例,也无法获得想要的结果\n",
+    "ids_zh2 = tokenizer(template % question_zh , return_tensors='pt')\n",
     "res_zh2 = model.generate(**ids_zh2, max_length=100, early_stopping=True,\n",
     "                         num_beams=3, no_repeat_ngram_size=2)\n",
     "print(tokenizer.decode(res_zh2[0], skip_special_tokens=True))"

Plik diff jest za duży
+ 488 - 474
ch11_llm/gpt2_lora.ipynb


Plik diff jest za duży
+ 547 - 491
ch11_llm/gpt2_lora_optimum.ipynb


+ 75 - 5
ch11_llm/gpt2_reward_modeling.ipynb

@@ -45,6 +45,7 @@
    },
    "outputs": [],
    "source": [
+    "# 一些超参数\n",
     "learning_rate = 6e-4\n",
     "sequence_len = 1024\n",
     "batch_size = 8\n",
@@ -63,6 +64,7 @@
    "outputs": [],
    "source": [
     "tokenizer = AutoTokenizer.from_pretrained('gpt2')\n",
+    "# 没有语言建模头的嵌入模型\n",
     "model = GPT2Model.from_pretrained('gpt2')"
    ]
   },
@@ -93,11 +95,17 @@
    ],
    "source": [
     "def precoss(data):\n",
+    "    '''\n",
+    "    生成训练文本和标签变量\n",
+    "    '''\n",
     "    re = {}\n",
     "    for i in range(2):\n",
     "        key = 'tokens_%s' % i\n",
+    "        # prefix和completion两个字段已经经过了分词处理\n",
     "        re['input_ids_%s' % i] = data[key]['prefix'] + data[key]['completion']\n",
+    "        # 记录文本的实际长度,用于后续的模型计算\n",
     "        re['input_len_%s' % i] = len(re['input_ids_%s' % i])\n",
+    "        # 根据数据说明,定义标签变量\n",
     "        re['label'] = 0 if data['score_0'] > 0 else 1\n",
     "    return re\n",
     "\n",
@@ -147,11 +155,16 @@
     "from torch.nn.utils.rnn import pad_sequence\n",
     "\n",
     "def token_collect(batch):\n",
+    "    '''\n",
+    "    由于文本的长度不一,对于同一批次的训练数据,需要进行数据填充,使得长度一致\n",
+    "    '''\n",
     "    re = {}\n",
     "    for i in range(2):\n",
     "        ids = [data['input_ids_%s' % i] for data in batch]\n",
+    "        # 对于较短的数据,用0在末尾进行填充\n",
     "        re['input_ids_%s' % i] = pad_sequence(ids, batch_first=True)\n",
     "        re['input_len_%s' % i] = torch.stack([data['input_len_%s' % i] for data in batch])\n",
+    "    # 将标签变量也合并成一个张量\n",
     "    re['label'] = torch.stack([data['label'] for data in batch])\n",
     "    return re"
    ]
@@ -197,9 +210,11 @@
    "source": [
     "from torch.utils.data import DataLoader, random_split\n",
     "\n",
+    "# 划分训练集和测试集\n",
     "train_set, test_set = random_split(dataset, [0.8, 0.2])\n",
     "train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, collate_fn=token_collect)\n",
     "test_loader = DataLoader(test_set, batch_size=3, shuffle=True, collate_fn=token_collect)\n",
+    "# 训练数据示例\n",
     "next(iter(train_loader))"
    ]
   },
@@ -255,13 +270,30 @@
     "class RewardModel(nn.Module):\n",
     "\n",
     "    def __init__(self, model):\n",
+    "        '''\n",
+    "        评分模型\n",
+    "        参数\n",
+    "        ----\n",
+    "        model :嵌入模型\n",
+    "        '''\n",
     "        super().__init__()\n",
     "        self.embedding = model\n",
+    "        # 评分建模头\n",
     "        self.score = nn.Linear(model.embed_dim, 1, bias=False)\n",
     "\n",
     "    def forward(self, x, seq_len):\n",
-    "        # x:表示文本,形状(B, T), seq_len:表示文本长度,形状(B)\n",
+    "        '''\n",
+    "        前向传播\n",
+    "        参数\n",
+    "        ----\n",
+    "        x :torch.LongTensor,文本,形状为(B, T)\n",
+    "        seq_len :torch.LongTensor,文本的实际长度,形状为(B)\n",
+    "        返回\n",
+    "        ----\n",
+    "        score :torch.FloatTensor,评分,形状为(B, 1)\n",
+    "        '''\n",
     "        B, _ = x.shape\n",
+    "        # 文本的嵌入向量\n",
     "        emb = self.embedding(x).last_hidden_state  # (B, T, C)\n",
     "        ind = torch.arange(B, device=seq_len.device)\n",
     "        # 获取最后一个词元的特征\n",
@@ -270,6 +302,7 @@
     "        return score\n",
     "\n",
     "r_model = RewardModel(model)\n",
+    "# 展示模型结构\n",
     "r_model"
    ]
   },
@@ -324,11 +357,15 @@
     "    lora_alpha=8,\n",
     "    target_modules=['c_attn'],\n",
     "    lora_dropout=0.4,\n",
+    "    # c_attn.weight的形状是(fan_in, fan_out),所以该参数设置为True\n",
+    "    # 但需注意的是,普通的线性模型权重参数的形状是(fan_out, fan_in)\n",
     "    fan_in_fan_out=True,\n",
     "    bias='none',\n",
+    "    # 评分模型中的score层(评分建模头)也参与模型微调\n",
     "    modules_to_save=['score']\n",
     "    )\n",
     "\n",
+    "# 为评分模型添加LoRA适配器\n",
     "r_model = PeftModel(r_model, config, adapter_name='lora')\n",
     "print_trainable_parameters(r_model)"
    ]
@@ -409,19 +446,37 @@
     "class PreferenceModel(nn.Module):\n",
     "\n",
     "    def __init__(self, model):\n",
+    "        '''\n",
+    "        借鉴逻辑回归的思路,进行偏好建模\n",
+    "        参数\n",
+    "        ----\n",
+    "        model :评分模型\n",
+    "        '''\n",
     "        super().__init__()\n",
     "        self.pref = model\n",
     "\n",
     "    def forward(self, data):\n",
+    "        '''\n",
+    "        定义模型损失\n",
+    "        参数\n",
+    "        ----\n",
+    "        data :dict,训练数据\n",
+    "        返回\n",
+    "        ----\n",
+    "        out :torch.FloatTensor,logits,形状为(B, 2)\n",
+    "        loss :torch.FloatTensor,模型损失\n",
+    "        '''\n",
+    "        # input0的形状是(B, T),len0的形状是(B)\n",
     "        input0, len0 = data['input_ids_0'], data['input_len_0']\n",
     "        input1, len1 = data['input_ids_1'], data['input_len_1']\n",
-    "        score0 = self.pref(input0, len0)\n",
-    "        score1 = self.pref(input1, len1)\n",
-    "        out = torch.concat((score0, score1), dim=1)\n",
+    "        score0 = self.pref(input0, len0)             # (B, 1)\n",
+    "        score1 = self.pref(input1, len1)             # (B, 1)\n",
+    "        out = torch.concat((score0, score1), dim=1)  # (B, 2)\n",
     "        loss = F.cross_entropy(out, data['label'])\n",
     "        return out, loss\n",
     "\n",
     "p_model = PreferenceModel(r_model).to(device)\n",
+    "# 模型结构\n",
     "p_model"
    ]
   },
@@ -446,6 +501,7 @@
     }
    ],
    "source": [
+    "# 利用示例数据验证模型是否搭建正确,并记录微调前的模型效果(方便与后续结果做对比)\n",
     "example = test_set[:1]\n",
     "with torch.no_grad():\n",
     "    p_model.eval()\n",
@@ -480,6 +536,10 @@
     "from contextlib import nullcontext\n",
     "\n",
     "def estimate_loss(model, ctx=nullcontext()):\n",
+    "    '''\n",
+    "    估计模型损失\n",
+    "    ctx参数是为禁用LoRA或者混合精度做准备,当ctx=nullcontext()时,没有任何作用\n",
+    "    '''\n",
     "    re = {}\n",
     "    # 将模型切换至评估模式\n",
     "    model.eval()\n",
@@ -496,7 +556,9 @@
     "    \"\"\"\n",
     "    lossi = []\n",
     "    data_iter= iter(data_loader)\n",
+    "    # 随机使用多个批量数据来预估模型效果\n",
     "    for k in range(eval_iters):\n",
+    "        # 如果数据遍历完了,则重新生成一个data loader\n",
     "        data = next(data_iter, None)\n",
     "        if data is None:\n",
     "            data_iter = iter(data_loader)\n",
@@ -517,7 +579,7 @@
    },
    "outputs": [],
    "source": [
-    "# 参考自https://github.com/karpathy/nanoGPT/blob/master/train.py\n",
+    "# get_lr的实现参考自https://github.com/karpathy/nanoGPT/blob/master/train.py\n",
     "import math\n",
     "\n",
     "warmup_iters = 100\n",
@@ -525,6 +587,10 @@
     "min_lr = learning_rate / 10\n",
     "\n",
     "def get_lr(it):\n",
+    "    '''\n",
+    "    动态调整学习速率\n",
+    "    it表示训练次数\n",
+    "    '''\n",
     "    # 1、线性预热\n",
     "    if it < warmup_iters:\n",
     "        return learning_rate * it / warmup_iters\n",
@@ -546,6 +612,7 @@
    },
    "outputs": [],
    "source": [
+    "# 梯度裁剪的超参数\n",
     "grad_clip = 1.0\n",
     "\n",
     "def train_reward_optimum(model, optimizer, data_loader, max_iters=1000):\n",
@@ -580,6 +647,7 @@
     "        optimizer.zero_grad(set_to_none=True)\n",
     "\n",
     "        if iter_num % eval_interval == 0:\n",
+    "            # 预估模型损失时,也使用混合精度\n",
     "            stats = estimate_loss(model, ctx)\n",
     "            train_loss = f'train loss {stats[\"train\"]:.4f}'\n",
     "            eval_loss = f'test loss {stats[\"test\"]:.4f}'\n",
@@ -626,6 +694,7 @@
     }
    ],
    "source": [
+    "# 设置最优化算法的参数\n",
     "weight_decay = 1e-1\n",
     "beta1 = 0.9\n",
     "beta2 = 0.95\n",
@@ -692,6 +761,7 @@
     }
    ],
    "source": [
+    "# 经过模型微调之后,评分模型的效果有所提升\n",
     "with torch.no_grad():\n",
     "    p_model.eval()\n",
     "    print(p_model(example), example['label'])\n",

+ 31 - 2
ch11_llm/lora_tutorial.ipynb

@@ -6,6 +6,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "# 安装第三方库\n",
     "!pip install peft"
    ]
   },
@@ -45,12 +46,22 @@
     "class Lora(nn.Module):\n",
     "    \n",
     "    def __init__(self, model, r=4, lora_alpha=16):\n",
+    "        '''\n",
+    "        LoRA的实现示例:在线性模型中加入LoRA层\n",
+    "        参数\n",
+    "        ----\n",
+    "        model :线性模型\n",
+    "        r :int,LoRA的秩\n",
+    "        lora_alpha :int,LoRA算法里的alpha\n",
+    "        '''\n",
     "        super().__init__()\n",
     "        # model是线性模型\n",
     "        self.model = model\n",
+    "        # 冻结模型\n",
     "        self._freezing_model()\n",
     "        self.lora_A = nn.Linear(model.in_features, r, bias=False)\n",
     "        self.lora_B = nn.Linear(r, model.out_features, bias=False)\n",
+    "        # 定义LoRA的缩放比例\n",
     "        self.scaling = lora_alpha / r\n",
     "        \n",
     "    def _freezing_model(self):\n",
@@ -81,19 +92,25 @@
    ],
    "source": [
     "def _test_lora(model, r=4, lora_alpha=16):\n",
+    "    '''\n",
+    "    测试LoRA实现的准确性\n",
+    "    '''\n",
     "    lora_model = Lora(model, r, lora_alpha)\n",
     "    # 生成对比模型\n",
     "    _model = nn.ModuleDict({'lin': model})\n",
     "    config = LoraConfig(\n",
     "        r=r, lora_alpha=lora_alpha,\n",
-    "        target_modules=['lin'], init_lora_weights=False)\n",
+    "        target_modules=['lin'],\n",
+    "        # 为了测试,我们将随机产生LoRA的初始参数\n",
+    "        # 正常情况下,我们并不更改这个参数的默认值(True)\n",
+    "        init_lora_weights=False)\n",
     "    peft_model = PeftModel(_model, config)\n",
     "    lin = peft_model.base_model.model.lin\n",
     "    # 复制LoRA参数\n",
     "    lora_model.lora_A.weight.data = lin.lora_A.default.weight.clone()\n",
     "    lora_model.lora_B.weight.data = lin.lora_B.default.weight.clone()\n",
     "    x = torch.randn(10, model.in_features)\n",
-    "    return torch.all(lora_model(x) - lin(x) < 1e-3)\n",
+    "    return torch.all(torch.abs(lora_model(x) - lin(x)) < 1e-3)\n",
     "\n",
     "\n",
     "linear_model = nn.Linear(10, 20)\n",
@@ -147,10 +164,13 @@
     }
    ],
    "source": [
+    "# 展示LoRA的效果\n",
     "linear_model = nn.Linear(10, 6)\n",
     "x = torch.randn(3, 10)\n",
+    "# 普通的线性模型\n",
     "print_trainable_parameters(linear_model)\n",
     "print(linear_model(x))\n",
+    "# 加入LoRA之后的模型\n",
     "lora_model = Lora(linear_model)\n",
     "print_trainable_parameters(lora_model)\n",
     "print(lora_model(x))"
@@ -162,9 +182,13 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "# 借助多层感知器,展示LoRA的使用细节\n",
     "class MLP(nn.Module):\n",
     "    \n",
     "    def __init__(self, bias=False):\n",
+    "        '''\n",
+    "        多层感知器\n",
+    "        '''\n",
     "        super().__init__()\n",
     "        self.lin0 = nn.Linear(2, 4, bias=bias)\n",
     "        self.lin1 = nn.Linear(4, 2, bias=bias)\n",
@@ -217,6 +241,7 @@
     }
    ],
    "source": [
+    "# 普通多层感知器的结果\n",
     "origin_re = model(x)\n",
     "origin_re"
    ]
@@ -266,6 +291,7 @@
     "    # 正常情况下,我们并不更改这个参数的默认值(True)\n",
     "    init_lora_weights=False)\n",
     "\n",
+    "# 加入LoRA之后的模型\n",
     "peft_model = PeftModel(model, config, adapter_name='lora1')\n",
     "peft_model"
    ]
@@ -417,7 +443,9 @@
     }
    ],
    "source": [
+    "# 原始模型的参数不能被训练\n",
     "print(peft_model.base_model.model.lin1.weight.requires_grad)\n",
+    "# 两个LoRA的参数可以被训练\n",
     "print(peft_model.base_model.model.lin0.lora_B.lora2.weight.requires_grad)\n",
     "print(peft_model.base_model.model.lin0.lora_B.lora1.weight.requires_grad)\n",
     "optimizer = optim.SGD(peft_model.parameters(), lr=0.1)"
@@ -450,6 +478,7 @@
     "print(f'before bp, lora1: {peft_model.base_model.model.lin0.lora_B.lora1.weight.grad}')\n",
     "print(f'before bp, lora2: {peft_model.base_model.model.lin0.lora_B.lora2.weight.grad}')\n",
     "peft_model(x).sum().backward()\n",
+    "# 只有激活的(active)适配器才会计算梯度\n",
     "print(f'after bp, lora1: {peft_model.base_model.model.lin0.lora_B.lora1.weight.grad}')\n",
     "print(f'after bp, lora2: {peft_model.base_model.model.lin0.lora_B.lora2.weight.grad}')"
    ]

Niektóre pliki nie zostały wyświetlone z powodu dużej ilości zmienionych plików