In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils import clip_grad_norm_
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch.optim as optim
from datasets import load_dataset
from transformers import pipeline

torch.manual_seed(12046)



In [2]:
# 一些超参数
learning_rate = 5e-5
device = 'cuda' if torch.cuda.is_available() else 'cpu'
gamma = 1.0
lambda_ = 0.95
kl_ctl_value = 0.2
cliprange = 0.2
vf_coef = 0.1
# 经过mini_batch_size步后,更新旧模型
mini_batch_size = 20
grad_clip = 1.0

In [3]:
tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

In [4]:
def prepare_input(data):
 data['input_ids'] = [tokenizer.encode(data['text'])[:8]]
 return data

datasets = load_dataset('imdb', split='train[:500]')
datasets = datasets.filter(lambda x: len(x['text']) > 20)
tokenized = datasets.map(prepare_input, remove_columns=datasets.column_names)
tokenized.set_format(type='torch', device=device)
example = tokenized[1]

In [5]:
class A2CLLM(nn.Module):

 def __init__(self, model):
 super().__init__()
 self.actor = model
 # 值函数估计头
 self.critic = nn.Linear(model.base_model.embed_dim, 1, bias=False)

 def forward(self, x):
 '''
 向前传播,为了使代码易懂,该函数只支持单条文本的计算
 参数
 ----
 x :torch.LongTensor,文本,形状为(1, T)
 返回
 ----
 logits :torch.FloatTensor,logits,形状为(1, T, vs)
 values :torch.FloatTensor,值函数,形状为(1, T)
 '''
 _res = self.actor(input_ids=x, output_hidden_states=True)
 logits = _res.logits
 emb = _res.hidden_states[-1]
 values = self.critic(emb).squeeze(-1)
 return logits, values

 def generate(self, idx, max_new_tokens=20):
 '''
 生成文本
 '''
 model = self.actor
 return model.generate(idx, max_new_tokens=max_new_tokens,
 pad_token_id=tokenizer.eos_token_id)

model = A2CLLM(AutoModelForCausalLM.from_pretrained('lvwerra/gpt2-imdb')).to(device)

In [6]:
from peft import LoraConfig, PeftModel

def init_peft_model(model):
 config = LoraConfig(
 r=1,
 lora_alpha=8,
 target_modules=['c_attn'],
 fan_in_fan_out=True,
 lora_dropout=0.1,
 bias='none',
 modules_to_save=['critic'])
 return PeftModel(model, config, adapter_name='lora_ppo')

# 增加LoRA适配器
model = init_peft_model(model)

In [7]:
def get_forward_result(model, input_ids, response):
 '''
 记录向前传播的结果,分别是logits,lnp和值函数
 为了使代码易懂,该函数只支持单条文本的计算
 '''
 # 记录背景文本的长度
 _, lens = input_ids.shape
 logits, values = model(response)
 # 计算交叉熵的时候,需要注意logits和标签的对应关系
 lnp = -F.cross_entropy(logits[:, :-1, :].transpose(-2, -1), response[:, 1:], reduction='none')
 # 只记录针对生成文本的结果,其中L表示生成文本的长度
 res = {
 # 最后一个位置的logits没有作用
 'logits': logits[:, lens-1:-1, :], # (1, L, vs)
 'lnp': lnp[:, lens-1:], # (1, L)
 'values': values[:, lens:] # (1, L)
 }
 return res


input_ids = example['input_ids']
response = model.generate(input_ids)

# 验证get_forward_result计算结果的形状是准确的
example_re = get_forward_result(model, input_ids, response)
for k, v in example_re.items():
 print(k, v.shape)

logits torch.Size([1, 20, 50257])
lnp torch.Size([1, 20])
values torch.Size([1, 20])


In [8]:
def turn_on_train_mode(model, target):
 '''
 只将模型中的特定组件设置为训练模式
 '''
 for name, module in model.named_modules():
 if name.split('.')[-1] in target:
 module.train()
 return model

def _test_turn_on_train_mode():
 '''
 测试turn_on_train_mode是否正确
 '''
 test_model = A2CLLM(
 AutoModelForCausalLM.from_pretrained('lvwerra/gpt2-imdb')).to(device)
 config = LoraConfig(
 r=1,
 lora_alpha=8,
 target_modules=['c_attn'],
 fan_in_fan_out=True,
 lora_dropout=0.1,
 bias='none',
 init_lora_weights=False)
 test_model = PeftModel(test_model, config, adapter_name='lora_ppo')
 # 模型处于训练模式,由于随机失活的原因,每次运算的结果都不相同
 test_model.train()
 v1 = test_model(response)[1]
 v2 = test_model(response)[1]
 print(v1 - v2)

 test_model.eval()
 # 只将LoRA换至训练模式,由于LoRA里的随机失活,每次运算的结果也不相同
 turn_on_train_mode(test_model, ['c_attn'])
 v1 = test_model(response)[1]
 v2 = test_model(response)[1]
 print(v1 - v2)

 test_model.eval()
 turn_on_train_mode(test_model, ['c_attn'])
 # 禁用LoRA之后,运算结果会相同
 with test_model.disable_adapter():
 v1 = test_model(response)[1]
 v2 = test_model(response)[1]
 print(v1 - v2)

_test_turn_on_train_mode()

tensor([[ 0.3356, -0.3501, -0.6011, -0.4132, 1.0261, 0.8811, -0.3165, 0.4929,
 -0.9196, -0.3321, -0.2723, -0.1996, -0.6541, 0.1892, 0.6956, 0.3488,
 0.2956, 0.3583, 0.2754, 0.5844, 0.7313, 0.1374, 0.5127, -0.1030,
 0.5666, -0.0081, 0.3219, -0.0353]], device='cuda:0',
 grad_fn=)
tensor([[ 0.0418, 0.5579, 0.5273, 1.0549, 0.5402, 0.1473, 0.3205, 0.0311,
 0.6900, -0.2323, 0.1526, 0.4450, 0.1746, 0.6160, -0.2214, -0.1989,
 0.1022, 0.2701, -0.0173, -0.0539, -0.1477, 0.0678, -0.0153, -0.6429,
 -0.3822, -0.4266, -0.2184, -0.4352]], device='cuda:0',
 grad_fn=)
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
 0., 0., 0., 0.]], device='cuda:0', grad_fn=)


In [9]:
class RewardModel(nn.Module):

 def __init__(self, tokenizer):
 '''
 评分模型
 '''
 super().__init__()
 self.model = pipeline("sentiment-analysis", model='lvwerra/distilbert-imdb')
 self.tokenizer = tokenizer

 def forward(self, x):
 '''
 向前传播,为了使代码易懂,该函数只支持单条文本的计算
 参数
 ----
 x :torch.LongTensor,文本,形状为(1, T)
 返回
 ----
 re :torch.FloatTensor,评分,形状为(1)
 '''
 re = []
 x = [self.tokenizer.decode(i) for i in x]
 # 此处的x等于背景文本+生成文本,因此得到的scores稍有不妥
 # 更准确的做法是只对生成文本进行评分
 scores = self.model(x)
 for s in scores:
 # 将POSITIVE的概率视为评分
 if s['label'] == 'POSITIVE':
 re.append(s['score'])
 else:
 re.append(1 - s['score'])
 return torch.tensor(re)

r_model = RewardModel(tokenizer).to(device)
r_model(response)

tensor([0.9959])

In [10]:
def compute_rewards(r_model, response, lnp, ref_lnp):
 '''
 定义游戏奖励
 为了使代码易懂,该函数只支持单条文本的计算
 '''
 # scores的形状为(1), lnp的形状为(1, L), ref_lnp的形状为(1, L)
 # r_model:评分模型,response:模型生成的回答
 # lnp:新/旧模型的概率对数,ref_lnp:参考模型的概率对数
 scores = r_model(response)
 rewards = []
 for score, lnprob, ref_lnprob in zip(scores, lnp, ref_lnp):
 kl = lnprob - ref_lnprob # ( L)
 # kl_ctl_value是调节KL penalty的系数,大于0
 reward = -kl_ctl_value * kl # ( L)
 # 游戏奖励等于模型评分 + KL penalty
 reward[-1] += score # ( L)
 rewards.append(reward)
 return torch.stack(rewards) # (1, L)

# 得到参考模型的结果
with torch.no_grad():
 with model.disable_adapter():
 ref_example_re = get_forward_result(model, input_ids, response)

rewards = compute_rewards(r_model, response, example_re['lnp'], ref_example_re['lnp'])
rewards.shape

torch.Size([1, 20])

In [11]:
class GAE:

 def __init__(self, gamma, lambda_):
 self.gamma = gamma
 self.lambda_ = lambda_

 def __call__(self, rewards, values):
 # 优势函数
 advantages = []
 last_advantage = 0
 vt_next = 0
 for r, vt in zip(reversed(rewards), reversed(values)):
 delta = r + self.gamma * vt_next - vt
 last_advantage = delta + self.gamma * self.lambda_ * last_advantage
 advantages.insert(0, last_advantage)
 vt_next = vt

 return torch.stack(advantages)

gae = GAE(gamma, lambda_)
advantages = gae(rewards, example_re['values'])

In [12]:
def compute_loss(old_lnp, lnp, vpred, advantages):
 '''
 定义模型损失
 为了使代码易懂,该函数只支持单条文本的计算
 '''
 # old_lnp:旧模型的概率对数,形状为(1, L)
 # lnp:新/旧模型的概率对数,形状为(1, L)
 # vpred:值函数,形状为(1, L)
 # advantages:优势函数,形状为(1, L)
 # 值函数损失
 vf_loss = -advantages * vpred
 # 策略损失
 ratio = torch.exp(lnp - old_lnp)
 pg_losses = -advantages * ratio
 pg_losses2 = -advantages * torch.clamp(ratio, 1.0 - cliprange, 1.0 + cliprange)
 pg_loss = torch.max(pg_losses, pg_losses2)
 # 整体损失
 loss = pg_loss.mean() + vf_coef * vf_loss.mean()
 return loss

compute_loss(example_re['lnp'], example_re['lnp'], example_re['values'], advantages)

tensor(-0.2746, device='cuda:0', grad_fn=)

In [13]:
def play_game(model, r_model, gae, data):
 model.eval()
 # 分别是背景文本,回复,向前传播结果和优势函数
 all_input_ids, all_response, all_res, all_advantages = [], [], [], []
 for input_ids in data['input_ids']:
 all_input_ids.append(input_ids)
 # 生成评论
 response = model.generate(input_ids)
 all_response.append(response)
 with torch.no_grad():
 # 记录旧模型数据
 res = get_forward_result(model, input_ids, response)
 all_res.append(res)
 # 记录参考模型数据
 with model.disable_adapter():
 ref_res = get_forward_result(model, input_ids, response)
 rewards = compute_rewards(r_model, response, res['lnp'], ref_res['lnp'])
 all_advantages.append(gae(rewards, res['values']))
 # 只将LoRA适配器切换至训练模式
 turn_on_train_mode(model, ['c_attn'])
 return all_input_ids, all_response, all_res, all_advantages

play_game(model, r_model, gae, tokenized[:2])[0]

[tensor([[ 40, 26399, 314, 3001, 327, 47269, 20958, 12]],
 device='cuda:0'),
 tensor([[ 1, 40, 1703, 44269, 25, 12550, 1, 318]],
 device='cuda:0')]

In [14]:
def estimate_rewards(r_model, model, all_input_ids):
 '''
 预估模型评分
 '''
 re = {}
 # 将模型切换至评估模式
 model.eval()
 for input_ids in all_input_ids:
 # 生成文本
 response = model.generate(input_ids)
 # 记录评分
 re['score'] = re.get('score', 0) + r_model(response).item()
 # 记录参考模型的评分
 with model.disable_adapter():
 response = model.generate(input_ids)
 re['ref_score'] = re.get('ref_score', 0) + r_model(response).item()
 re['score'] /= len(all_input_ids)
 re['ref_score'] /= len(all_input_ids)
 # 只将LoRA适配器切换至训练模式
 turn_on_train_mode(model, ['c_attn'])
 return re

estimate_rewards(r_model, model, tokenized[:20]['input_ids'])

{'score': 0.5244841426610947, 'ref_score': 0.5244841426610947}

In [15]:
steps = datasets.num_rows // mini_batch_size
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

for s in range(steps-1):
 data = tokenized[s * mini_batch_size: (s + 1) * mini_batch_size]
 # 进行游戏,收集数据。play_game返回的数据都是无法计算梯度的
 # 在play_game中,会基于model生成参考模型
 input_ids, response, old_res, advantages = play_game(model, r_model, gae, data)
 # 循环完成之后,才用新模型替换旧模型
 for _ids, _resp, _old_res, _ad in zip(input_ids, response, old_res, advantages):
 optimizer.zero_grad(set_to_none=True)
 # 收集新模型的数据,model_res里面的数据可以计算梯度
 model_res = get_forward_result(model, _ids, _resp)
 loss = compute_loss(_old_res['lnp'], model_res['lnp'], model_res['values'], _ad)
 loss.backward()
 # 梯度裁剪
 clip_grad_norm_(model.parameters(), grad_clip)
 optimizer.step()
 # 将最后一个批次数据作为测试集
 res = estimate_rewards(r_model, model, tokenized[-mini_batch_size:]['input_ids'])
 print(f'step {s:>4}: score {res["score"]:.4f}, ref_score {res["ref_score"]:.4f}')

step 0: score 0.5412, ref_score 0.5085
step 1: score 0.5412, ref_score 0.5085
step 2: score 0.5085, ref_score 0.5085
step 3: score 0.5412, ref_score 0.5085
step 4: score 0.5180, ref_score 0.5085
step 5: score 0.5182, ref_score 0.5085
step 6: score 0.4743, ref_score 0.5085
step 7: score 0.4743, ref_score 0.5085
step 8: score 0.4741, ref_score 0.5085
step 9: score 0.4741, ref_score 0.5085
step 10: score 0.4725, ref_score 0.5085
step 11: score 0.5210, ref_score 0.5085
step 12: score 0.5225, ref_score 0.5085
step 13: score 0.5168, ref_score 0.5085
step 14: score 0.5184, ref_score 0.5085
step 15: score 0.5135, ref_score 0.5085
step 16: score 0.5147, ref_score 0.5085
step 17: score 0.5129, ref_score 0.5085
step 18: score 0.6062, ref_score 0.5085
step 19: score 0.6182, ref_score 0.5085
step 20: score 0.6737, ref_score 0.5085
step 21: score 0.6730, ref_score 0.5085
step 22: score 0.6731, ref_score 0.5085
step 23: score 0.6724, ref_score 0.5085
