In [None]:
# 安装第三方库
!pip install peft

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from peft import LoraConfig, PeftModel


torch.manual_seed(12046)



In [3]:
class Lora(nn.Module):
    '''
    Example LoRA implementation: adds a LoRA layer to a linear model.

    The wrapped model is frozen and a trainable low-rank update is added to
    its output:

        forward(x) = model(x) + lora_B(lora_A(x)) * (lora_alpha / r)

    Both LoRA matrices keep nn.Linear's default (random) initialisation, so
    the output differs from the wrapped model's output right after
    construction.
    '''

    def __init__(self, model, r=4, lora_alpha=16):
        '''
        Parameters
        ----
        model : linear model; must expose in_features, out_features and
            weight (e.g. nn.Linear). Its parameters are frozen in place.
        r : int, rank of the LoRA update
        lora_alpha : int, the alpha of the LoRA algorithm
        '''
        super().__init__()
        # model is the linear model being adapted
        self.model = model
        # Freeze the wrapped model so that only the LoRA matrices are trained
        self._freezing_model()
        # Create the LoRA matrices on the same device and with the same dtype
        # as the wrapped layer; otherwise forward() fails with a device/dtype
        # mismatch when the layer is not on the default device/dtype.
        factory_kwargs = {
            'device': model.weight.device,
            'dtype': model.weight.dtype,
        }
        self.lora_A = nn.Linear(
            model.in_features, r, bias=False, **factory_kwargs)
        self.lora_B = nn.Linear(
            r, model.out_features, bias=False, **factory_kwargs)
        # Scaling factor applied to the LoRA update
        self.scaling = lora_alpha / r

    def _freezing_model(self):
        '''
        Disable gradients for every parameter of the wrapped model.
        '''
        for p in self.model.parameters():
            p.requires_grad = False

    def forward(self, x):
        '''
        Return the frozen model's output plus the scaled low-rank update.
        '''
        origin = self.model(x)
        delta = self.lora_B(self.lora_A(x)) * self.scaling
        return origin + delta

In [4]:
def _test_lora(model, r=4, lora_alpha=16):
    '''
    Check the hand-written Lora class against the peft implementation.

    Both wrappers are built around the same linear model, the LoRA weights
    created by peft are copied into the hand-written wrapper, and the two
    outputs are compared on a random batch of 10 inputs.

    Returns a 0-dim bool tensor that is True when all outputs agree
    element-wise within 1e-3.
    '''
    ours = Lora(model, r, lora_alpha)
    # Build the reference model: the layer is placed in a container so that
    # peft can target it by name
    container = nn.ModuleDict({'lin': model})
    peft_config = LoraConfig(
        r=r,
        lora_alpha=lora_alpha,
        target_modules=['lin'],
        # For this test the initial LoRA parameters are generated randomly.
        # Normally the default value (True) of this option is left unchanged.
        init_lora_weights=False,
    )
    reference = PeftModel(container, peft_config)
    peft_lin = reference.base_model.model.lin
    # Copy the LoRA parameters from the peft layer into our implementation
    for matrix in ('lora_A', 'lora_B'):
        source_weight = getattr(peft_lin, matrix).default.weight
        getattr(ours, matrix).weight.data = source_weight.clone()
    x = torch.randn(10, model.in_features)
    gap = (ours(x) - peft_lin(x)).abs()
    return (gap < 1e-3).all()


# Run the comparison against peft on a linear layer with 10 inputs and 20 outputs.
linear_model = nn.Linear(10, 20)
_test_lora(linear_model)

tensor(True)

In [5]:
def print_trainable_parameters(model):
    """
    Print the number of trainable parameters in the model.

    One line is printed with the trainable parameter count, the total
    parameter count and the trainable percentage (three decimals).

    Parameters
    ----
    model : object exposing named_parameters(), e.g. an nn.Module
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # A model without any parameter would otherwise raise ZeroDivisionError;
    # report 0% in that case.
    ratio = 100 * trainable_params / all_param if all_param else 0.0
    trainable = f'trainable params: {trainable_params:,}'
    params = f'all params: {all_param:,}'
    percent = f'trainable%: {ratio:.3f}'
    print(f'{trainable} || {params} || {percent}')

In [6]:
# Demonstrate the effect of LoRA
linear_model = nn.Linear(10, 6)
x = torch.randn(3, 10)
# The plain linear model
print_trainable_parameters(linear_model)
print(linear_model(x))
# The model after adding LoRA (Lora freezes linear_model's parameters in place)
lora_model = Lora(linear_model)
print_trainable_parameters(lora_model)
print(lora_model(x))

trainable params: 66 || all params: 66 || trainable%: 100.000
tensor([[-1.3469e-03, -3.9655e-01, 5.7396e-01, 3.1267e-01, -1.6206e+00,
 -1.1444e-01],
 [ 1.4498e-01, 2.1979e-02, 7.9094e-01, 5.0265e-01, -1.9905e-01,
 -2.0630e-01],
 [ 6.7352e-01, -5.4856e-01, -1.0576e-01, -1.1910e+00, 6.0106e-01,
 -3.6762e-01]], grad_fn=)
trainable params: 64 || all params: 130 || trainable%: 49.231
tensor([[-1.2706, -0.6598, -1.7895, -2.0696, -0.8138, 0.5505],
 [-0.1612, 0.5151, 1.3028, -0.0054, -0.1200, -1.4266],
 [ 0.9780, -1.2645, -0.0722, 3.4606, 0.6124, -0.9581]],
 grad_fn=)


In [7]:
# Use a multilayer perceptron to demonstrate the details of using LoRA
class MLP(nn.Module):
    '''
    Multilayer perceptron: Linear(2, 4) -> ReLU -> Linear(4, 2).
    '''

    def __init__(self, bias=False):
        '''
        Parameters
        ----
        bias : bool, whether the two linear layers have a bias term
        '''
        super().__init__()
        self.lin0 = nn.Linear(2, 4, bias=bias)
        self.lin1 = nn.Linear(4, 2, bias=bias)

    def forward(self, x):
        '''
        Apply the first layer, the ReLU activation and the second layer.
        '''
        hidden = F.relu(self.lin0(x))
        return self.lin1(hidden)

In [8]:
# Build the MLP and a random 2-feature input; both are reused by the following cells.
model = MLP()
x = torch.randn(2)
model

MLP(
 (lin0): Linear(in_features=2, out_features=4, bias=False)
 (lin1): Linear(in_features=4, out_features=2, bias=False)
)

In [9]:
# Output of the plain multilayer perceptron (kept for comparison in later cells)
origin_re = model(x)
origin_re

tensor([-0.2006, 0.0176], grad_fn=)

In [10]:
config = LoraConfig(
 r=2,
 lora_alpha=16,
 target_modules=['lin0'],
 # For demonstration purposes the initial LoRA parameters are generated randomly.
 # Normally the default value (True) of this option is left unchanged.
 init_lora_weights=False)

# The model after adding LoRA
peft_model = PeftModel(model, config, adapter_name='lora1')
peft_model

PeftModel(
 (base_model): LoraModel(
 (model): MLP(
 (lin0): Linear(
 in_features=2, out_features=4, bias=False
 (lora_dropout): ModuleDict(
 (lora1): Identity()
 )
 (lora_A): ModuleDict(
 (lora1): Linear(in_features=2, out_features=2, bias=False)
 )
 (lora_B): ModuleDict(
 (lora1): Linear(in_features=2, out_features=4, bias=False)
 )
 (lora_embedding_A): ParameterDict()
 (lora_embedding_B): ParameterDict()
 )
 (lin1): Linear(in_features=4, out_features=2, bias=False)
 )
 )
)

In [11]:
# After adding LoRA, the original model has been modified as well
origin_re, peft_model(x), model(x)

(tensor([-0.2006, 0.0176], grad_fn=),
 tensor([-0.3068, 0.1873], grad_fn=),
 tensor([-0.3068, 0.1873], grad_fn=))

In [12]:
# With LoRA disabled, the model behaves like the original model again
with peft_model.disable_adapter():
 print(peft_model(x))
print(origin_re)

tensor([-0.2006, 0.0176])
tensor([-0.2006, 0.0176], grad_fn=)


In [13]:
# After unloading LoRA, the model is back in its initial state
peft_model.unload()
origin_re, peft_model(x), model(x)

(tensor([-0.2006, 0.0176], grad_fn=),
 tensor([-0.2006, 0.0176]),
 tensor([-0.2006, 0.0176]))

In [14]:
# Add several LoRA adapters to the model:
# 'lora1' targets lin0 only, 'lora2' targets both lin0 and lin1
config1 = LoraConfig(r=3, lora_alpha=16, target_modules=['lin0'])
config2 = LoraConfig(r=5, lora_alpha=16, target_modules=['lin0', 'lin1'])

model = MLP()
peft_model = PeftModel(model, config1, adapter_name='lora1')
peft_model.add_adapter(peft_config=config2, adapter_name='lora2')
peft_model

PeftModel(
 (base_model): LoraModel(
 (model): MLP(
 (lin0): Linear(
 in_features=2, out_features=4, bias=False
 (lora_dropout): ModuleDict(
 (lora1): Identity()
 (lora2): Identity()
 )
 (lora_A): ModuleDict(
 (lora1): Linear(in_features=2, out_features=3, bias=False)
 (lora2): Linear(in_features=2, out_features=5, bias=False)
 )
 (lora_B): ModuleDict(
 (lora1): Linear(in_features=3, out_features=4, bias=False)
 (lora2): Linear(in_features=5, out_features=4, bias=False)
 )
 (lora_embedding_A): ParameterDict()
 (lora_embedding_B): ParameterDict()
 )
 (lin1): Linear(
 in_features=4, out_features=2, bias=False
 (lora_dropout): ModuleDict(
 (lora2): Identity()
 )
 (lora_A): ModuleDict(
 (lora2): Linear(in_features=4, out_features=5, bias=False)
 )
 (lora_B): ModuleDict(
 (lora2): Linear(in_features=5, out_features=2, bias=False)
 )
 (lora_embedding_A): ParameterDict()
 (lora_embedding_B): ParameterDict()
 )
 )
 )
)

In [15]:
# The parameters of the original model cannot be trained
print(peft_model.base_model.model.lin1.weight.requires_grad)
# The parameters of both LoRA adapters can be trained
print(peft_model.base_model.model.lin0.lora_B.lora2.weight.requires_grad)
print(peft_model.base_model.model.lin0.lora_B.lora1.weight.requires_grad)
# Optimizer used by the following cells to reset gradients
optimizer = optim.SGD(peft_model.parameters(), lr=0.1)

False
True
True


In [16]:
# Use one of the adapters
optimizer.zero_grad()
print(f'active adapter: {peft_model.active_adapter}')
print(f'before bp, lora1: {peft_model.base_model.model.lin0.lora_B.lora1.weight.grad}')
print(f'before bp, lora2: {peft_model.base_model.model.lin0.lora_B.lora2.weight.grad}')
peft_model(x).sum().backward()
# Gradients are computed only for the active adapter
print(f'after bp, lora1: {peft_model.base_model.model.lin0.lora_B.lora1.weight.grad}')
print(f'after bp, lora2: {peft_model.base_model.model.lin0.lora_B.lora2.weight.grad}')

active adapter: lora1
before bp, lora1: None
before bp, lora2: None
after bp, lora1: tensor([[ 0.0000, -0.0000, 0.0000],
 [ 0.0995, -0.6875, 0.6329],
 [-1.0511, 7.2626, -6.6856],
 [ 0.0000, -0.0000, 0.0000]])
after bp, lora2: None


In [17]:
# Switch the active adapter
peft_model.set_adapter('lora2')
optimizer.zero_grad()
print(f'active adapter: {peft_model.active_adapter}')
# Inspect the same lora_B gradients before and after backpropagation so that
# the two printouts are directly comparable.
print(f'before bp, lora1: {peft_model.base_model.model.lin0.lora_B.lora1.weight.grad}')
print(f'before bp, lora2: {peft_model.base_model.model.lin0.lora_B.lora2.weight.grad}')
peft_model(x).sum().backward()
# Gradients are computed only for the adapter that is now active
print(f'after bp, lora1: {peft_model.base_model.model.lin0.lora_B.lora1.weight.grad}')
print(f'after bp, lora2: {peft_model.base_model.model.lin0.lora_B.lora2.weight.grad}')

active adapter: lora2
before bp, lora1: None
before bp, lora2: None
after bp, lora1: None
after bp, lora2: tensor([[ 0.0000, -0.0000, -0.0000, 0.0000, -0.0000],
 [ 0.0056, -0.3897, -0.0461, 0.2852, -0.0939],
 [-0.0597, 4.1171, 0.4875, -3.0128, 0.9917],
 [ 0.0000, -0.0000, -0.0000, 0.0000, -0.0000]])
