@@ -43,6 +43,7 @@
},
"outputs": [],
"source": [
+ "# A few hyperparameters\n",
"learning_rate = 5e-5\n",
"device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
"gamma = 1.0\n",
@@ -50,6 +51,7 @@
"kl_ctl_value = 0.2\n",
"cliprange = 0.2\n",
"vf_coef = 0.1\n",
+ "# After mini_batch_size steps, update the old model\n",
"mini_batch_size = 20\n",
"grad_clip = 1.0"
]
@@ -99,9 +101,20 @@
"    def __init__(self, model):\n",
"        super().__init__()\n",
"        self.actor = model\n",
+ "        # Value-function estimation head\n",
"        self.critic = nn.Linear(model.base_model.embed_dim, 1, bias=False)\n",
"\n",
"    def forward(self, x):\n",
+ "        '''\n",
+ "        Forward pass. To keep the code easy to follow, this function only supports a single text sequence\n",
+ "        Parameters\n",
+ "        ----\n",
+ "        x : torch.LongTensor, the text, with shape (1, T)\n",
+ "        Returns\n",
+ "        ----\n",
+ "        logits : torch.FloatTensor, the logits, with shape (1, T, vs)\n",
+ "        values : torch.FloatTensor, the value estimates, with shape (1, T)\n",
+ "        '''\n",
"        _res = self.actor(input_ids=x, output_hidden_states=True)\n",
"        logits = _res.logits\n",
"        emb = _res.hidden_states[-1]\n",
@@ -109,6 +122,9 @@
"        return logits, values\n",
"\n",
"    def generate(self, idx, max_new_tokens=20):\n",
+ "        '''\n",
+ "        Generate text\n",
+ "        '''\n",
"        model = self.actor\n",
"        return model.generate(idx, max_new_tokens=max_new_tokens,\n",
"                              pad_token_id=tokenizer.eos_token_id)\n",
@@ -137,6 +153,7 @@
"        modules_to_save=['critic'])\n",
"    return PeftModel(model, config, adapter_name='lora_ppo')\n",
"\n",
+ "# Add the LoRA adapter\n",
"model = init_peft_model(model)"
]
},
@@ -163,13 +180,21 @@
],
"source": [
"def get_forward_result(model, input_ids, response):\n",
+ "    '''\n",
+ "    Record the results of the forward pass: the logits, lnp, and the value estimates\n",
+ "    To keep the code easy to follow, this function only supports a single text sequence\n",
+ "    '''\n",
+ "    # Record the length of the prompt text\n",
"    _, lens = input_ids.shape\n",
"    logits, values = model(response)\n",
+ "    # When computing the cross-entropy, mind the correspondence between the logits and the labels\n",
"    lnp = -F.cross_entropy(logits[:, :-1, :].transpose(-2, -1), response[:, 1:], reduction='none')\n",
+ "    # Only keep the results for the generated text, where L is the length of the generated text\n",
"    res = {\n",
- "        'logits': logits[:, lens-1:-1, :],\n",
- "        'lnp': lnp[:, lens-1:],\n",
- "        'values': values[:, lens:]\n",
+ "        # The logits at the last position are not used\n",
+ "        'logits': logits[:, lens-1:-1, :],  # (1, L, vs)\n",
+ "        'lnp': lnp[:, lens-1:],  # (1, L)\n",
+ "        'values': values[:, lens:]  # (1, L)\n",
"    }\n",
"    return res\n",
"\n",
@@ -177,6 +202,7 @@
"input_ids = example['input_ids']\n",
"response = model.generate(input_ids)\n",
"\n",
+ "# Verify that the shapes of the results computed by get_forward_result are correct\n",
"example_re = get_forward_result(model, input_ids, response)\n",
"for k, v in example_re.items():\n",
"    print(k, v.shape)"
@@ -214,12 +240,18 @@
],
"source": [
"def turn_on_train_mode(model, target):\n",
+ "    '''\n",
+ "    Set only the specified components of the model to training mode\n",
+ "    '''\n",
"    for name, module in model.named_modules():\n",
"        if name.split('.')[-1] in target:\n",
"            module.train()\n",
"    return model\n",
"\n",
"def _test_turn_on_train_mode():\n",
+ "    '''\n",
+ "    Test whether turn_on_train_mode works correctly\n",
+ "    '''\n",
"    test_model = A2CLLM(\n",
"        AutoModelForCausalLM.from_pretrained('lvwerra/gpt2-imdb')).to(device)\n",
"    config = LoraConfig(\n",
@@ -231,25 +263,25 @@
"        bias='none',\n",
"        init_lora_weights=False)\n",
"    test_model = PeftModel(test_model, config, adapter_name='lora_ppo')\n",
+ "    # The model is in training mode, so dropout makes every run produce a different result\n",
"    test_model.train()\n",
"    v1 = test_model(response)[1]\n",
"    v2 = test_model(response)[1]\n",
- "    # Not equal\n",
"    print(v1 - v2)\n",
"\n",
"    test_model.eval()\n",
+ "    # With only LoRA switched to training mode, the dropout inside LoRA also makes every run differ\n",
"    turn_on_train_mode(test_model, ['c_attn'])\n",
"    v1 = test_model(response)[1]\n",
"    v2 = test_model(response)[1]\n",
- "    # Not equal\n",
"    print(v1 - v2)\n",
"\n",
"    test_model.eval()\n",
"    turn_on_train_mode(test_model, ['c_attn'])\n",
+ "    # After disabling LoRA, the results are identical\n",
"    with test_model.disable_adapter():\n",
"        v1 = test_model(response)[1]\n",
"        v2 = test_model(response)[1]\n",
- "    # Equal\n",
"    print(v1 - v2)\n",
"\n",
"_test_turn_on_train_mode()"
@@ -281,15 +313,30 @@
"class RewardModel(nn.Module):\n",
"\n",
"    def __init__(self, tokenizer):\n",
+ "        '''\n",
+ "        The scoring (reward) model\n",
+ "        '''\n",
"        super().__init__()\n",
"        self.model = pipeline(\"sentiment-analysis\", model='lvwerra/distilbert-imdb')\n",
"        self.tokenizer = tokenizer\n",
"\n",
"    def forward(self, x):\n",
+ "        '''\n",
+ "        Forward pass. To keep the code easy to follow, this function only supports a single text sequence\n",
+ "        Parameters\n",
+ "        ----\n",
+ "        x : torch.LongTensor, the text, with shape (1, T)\n",
+ "        Returns\n",
+ "        ----\n",
+ "        re : torch.FloatTensor, the score, with shape (1,)\n",
+ "        '''\n",
"        re = []\n",
"        x = [self.tokenizer.decode(i) for i in x]\n",
+ "        # Here x is the prompt text plus the generated text, so the resulting scores are slightly imprecise\n",
+ "        # A more accurate approach would be to score only the generated text\n",
"        scores = self.model(x)\n",
"        for s in scores:\n",
+ "            # Treat the probability of POSITIVE as the score\n",
"            if s['label'] == 'POSITIVE':\n",
"                re.append(s['score'])\n",
"            else:\n",
@@ -324,20 +371,25 @@
],
"source": [
"def compute_rewards(r_model, response, lnp, ref_lnp):\n",
- "    # scores: (B), lnp: (B, T), ref_lnp: (B, T)\n",
+ "    '''\n",
+ "    Define the game reward\n",
+ "    To keep the code easy to follow, this function only supports a single text sequence\n",
+ "    '''\n",
+ "    # scores has shape (1,), lnp has shape (1, L), ref_lnp has shape (1, L)\n",
"    # r_model: the scoring model, response: the answer generated by the model\n",
"    # lnp: the log probabilities of the new/old model, ref_lnp: the log probabilities of the reference model\n",
"    scores = r_model(response)\n",
"    rewards = []\n",
"    for score, lnprob, ref_lnprob in zip(scores, lnp, ref_lnp):\n",
- "        kl = lnprob - ref_lnprob\n",
+ "        kl = lnprob - ref_lnprob  # (L,)\n",
"        # kl_ctl_value is the coefficient that scales the KL penalty; it is greater than 0\n",
- "        reward = -kl_ctl_value * kl\n",
+ "        reward = -kl_ctl_value * kl  # (L,)\n",
"        # The game reward equals the model score + the KL penalty\n",
- "        reward[-1] += score\n",
+ "        reward[-1] += score  # (L,)\n",
"        rewards.append(reward)\n",
- "    return torch.stack(rewards)\n",
+ "    return torch.stack(rewards)  # (1, L)\n",
"\n",
+ "# Get the results of the reference model\n",
"with torch.no_grad():\n",
"    with model.disable_adapter():\n",
"        ref_example_re = get_forward_result(model, input_ids, response)\n",
@@ -361,7 +413,7 @@
"        self.lambda_ = lambda_\n",
"\n",
"    def __call__(self, rewards, values):\n",
- "        # advantages table\n",
+ "        # Advantage function\n",
"        advantages = []\n",
"        last_advantage = 0\n",
"        vt_next = 0\n",
@@ -401,6 +453,14 @@
],
"source": [
"def compute_loss(old_lnp, lnp, vpred, advantages):\n",
+ "    '''\n",
+ "    Define the model loss\n",
+ "    To keep the code easy to follow, this function only supports a single text sequence\n",
+ "    '''\n",
+ "    # old_lnp: the log probabilities of the old model, with shape (1, L)\n",
+ "    # lnp: the log probabilities of the new/old model, with shape (1, L)\n",
+ "    # vpred: the value estimates, with shape (1, L)\n",
+ "    # advantages: the advantage estimates, with shape (1, L)\n",
"    # Value-function loss\n",
"    vf_loss = -advantages * vpred\n",
"    # Policy loss\n",
@@ -443,6 +503,7 @@
"source": [
"def play_game(model, r_model, gae, data):\n",
"    model.eval()\n",
+ "    # The prompt texts, responses, forward-pass results, and advantage estimates, respectively\n",
"    all_input_ids, all_response, all_res, all_advantages = [], [], [], []\n",
"    for input_ids in data['input_ids']:\n",
"        all_input_ids.append(input_ids)\n",
@@ -458,6 +519,7 @@
"                ref_res = get_forward_result(model, input_ids, response)\n",
"        rewards = compute_rewards(r_model, response, res['lnp'], ref_res['lnp'])\n",
"        all_advantages.append(gae(rewards, res['values']))\n",
+ "    # Switch only the LoRA adapter to training mode\n",
"    turn_on_train_mode(model, ['c_attn'])\n",
"    return all_input_ids, all_response, all_res, all_advantages\n",
"\n",
@@ -488,18 +550,24 @@
],
"source": [
"def estimate_rewards(r_model, model, all_input_ids):\n",
+ "    '''\n",
+ "    Estimate the model score\n",
+ "    '''\n",
"    re = {}\n",
"    # Switch the model to evaluation mode\n",
"    model.eval()\n",
"    for input_ids in all_input_ids:\n",
+ "        # Generate text\n",
"        response = model.generate(input_ids)\n",
+ "        # Record the score\n",
"        re['score'] = re.get('score', 0) + r_model(response).item()\n",
+ "        # Record the score of the reference model\n",
"        with model.disable_adapter():\n",
"            response = model.generate(input_ids)\n",
"            re['ref_score'] = re.get('ref_score', 0) + r_model(response).item()\n",
"    re['score'] /= len(all_input_ids)\n",
"    re['ref_score'] /= len(all_input_ids)\n",
- "    # Switch the model to training mode\n",
+ "    # Switch only the LoRA adapter to training mode\n",
"    turn_on_train_mode(model, ['c_attn'])\n",
"    return re\n",
"\n",
@@ -567,6 +635,7 @@
"    # Gradient clipping\n",
"    clip_grad_norm_(model.parameters(), grad_clip)\n",
"    optimizer.step()\n",
+ "    # Use the last mini-batch of data as the test set\n",
"    res = estimate_rewards(r_model, model, tokenized[-mini_batch_size:]['input_ids'])\n",
"    print(f'step {s:>4}: score {res[\"score\"]:.4f}, ref_score {res[\"ref_score\"]:.4f}')"
]
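Note on the formulas behind the added comments: the patch documents the KL-shaped reward, the GAE recursion, and the clipped policy loss only in words. The following is a minimal, self-contained sketch of those standard PPO formulas, reusing the patch's hyperparameter names (kl_ctl_value, cliprange, gamma, lambda_) as placeholders; it is an illustrative reference under those assumptions, not the notebook's exact compute_rewards, GAE, or compute_loss implementation.

import torch

kl_ctl_value = 0.2      # weight of the per-token KL penalty
cliprange = 0.2         # PPO clipping range
gamma, lambda_ = 1.0, 0.95

def sketch_reward(score, lnp, ref_lnp):
    # Per-token reward: -kl_ctl_value * (lnp - ref_lnp); the scalar model
    # score is added only to the last generated token.
    reward = -kl_ctl_value * (lnp - ref_lnp)   # (L,)
    reward[-1] = reward[-1] + score
    return reward

def sketch_gae(rewards, values):
    # Generalized advantage estimation, iterating backwards over the tokens.
    advantages, last_advantage, next_value = [], 0.0, 0.0
    for t in reversed(range(rewards.shape[-1])):
        delta = rewards[t] + gamma * next_value - values[t]
        last_advantage = delta + gamma * lambda_ * last_advantage
        next_value = values[t]
        advantages.append(last_advantage)
    advantages.reverse()
    return torch.stack(advantages)             # (L,)

def sketch_clipped_policy_loss(old_lnp, lnp, advantages):
    # Standard PPO clipped surrogate objective: ratio = exp(lnp - old_lnp).
    ratio = torch.exp(lnp - old_lnp)
    unclipped = -advantages * ratio
    clipped = -advantages * torch.clamp(ratio, 1.0 - cliprange, 1.0 + cliprange)
    return torch.max(unclipped, clipped).mean()

# Smoke test with random tensors of length L = 5.
lnp, ref_lnp, values = torch.randn(5), torch.randn(5), torch.randn(5)
adv = sketch_gae(sketch_reward(0.8, lnp, ref_lnp), values)
loss = sketch_clipped_policy_loss(lnp.detach(), lnp, adv)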