@@ -62,7 +62,8 @@ class YoloTrainer(object):
         self.scaler = torch.cuda.amp.GradScaler(enabled=args.fp16)

         # ---------------------------- Build Optimizer ----------------------------
-        cfg.base_lr = cfg.per_image_lr * args.batch_size
+        self.grad_accumulate = max(256 // args.batch_size, 1)
+        cfg.base_lr = cfg.per_image_lr * args.batch_size * self.grad_accumulate
         cfg.min_lr = cfg.base_lr * cfg.min_lr_ratio
         self.optimizer, self.start_epoch = build_yolo_optimizer(cfg, model, args.resume)

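For readers checking the learning-rate math: base_lr is now scaled by the effective batch size (per-GPU batch times accumulation steps) rather than the raw batch size. A quick sketch with purely hypothetical numbers (neither value below comes from this patch):

    per_image_lr = 0.001                                    # assumed config value
    batch_size = 16                                         # assumed --batch_size
    grad_accumulate = max(256 // batch_size, 1)             # -> 16
    base_lr = per_image_lr * batch_size * grad_accumulate   # -> 0.001 * 16 * 16 = 0.256
    # i.e. the LR tracks an effective batch of ~256 images per optimizer step
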
@@ -208,21 +209,23 @@ class YoloTrainer(object):
             loss_dict = self.criterion(outputs=outputs, targets=targets)
             losses = loss_dict['losses']
             loss_dict_reduced = distributed_utils.reduce_dict(loss_dict)
+            losses /= self.grad_accumulate

             # Backward
             self.scaler.scale(losses).backward()

             # Optimize
-            if self.cfg.clip_max_norm > 0:
-                self.scaler.unscale_(self.optimizer)
-                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=self.cfg.clip_max_norm)
-            self.scaler.step(self.optimizer)
-            self.scaler.update()
-            self.optimizer.zero_grad()
-
-            # ModelEMA
-            if self.model_ema is not None:
-                self.model_ema.update(model)
+            if (iter_i + 1) % self.grad_accumulate == 0:
+                if self.cfg.clip_max_norm > 0:
+                    self.scaler.unscale_(self.optimizer)
+                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=self.cfg.clip_max_norm)
+                self.scaler.step(self.optimizer)
+                self.scaler.update()
+                self.optimizer.zero_grad()
+
+                # ModelEMA
+                if self.model_ema is not None:
+                    self.model_ema.update(model)

             # Update log
             metric_logger.update(**loss_dict_reduced)

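The losses /= self.grad_accumulate rescaling is what makes the accumulated updates approximate one large-batch step. A minimal, self-contained check of that equivalence (toy linear model and MSE loss assumed here; nothing below comes from this repository):

    import torch
    import torch.nn.functional as F

    torch.manual_seed(0)
    full = torch.nn.Linear(4, 1)
    accum = torch.nn.Linear(4, 1)
    accum.load_state_dict(full.state_dict())

    x, y = torch.randn(8, 4), torch.randn(8, 1)

    # One backward pass over the full batch of 8 samples.
    F.mse_loss(full(x), y).backward()

    # Two accumulated micro-batches of 4, each loss divided by the number of
    # accumulation steps, mirroring the rescaling in the patch above.
    grad_accumulate = 2
    for xb, yb in zip(x.chunk(grad_accumulate), y.chunk(grad_accumulate)):
        (F.mse_loss(accum(xb), yb) / grad_accumulate).backward()

    # The accumulated gradient matches the full-batch gradient.
    print(torch.allclose(full.weight.grad, accum.weight.grad, atol=1e-6))  # True

The identity is exact for mean-reduced losses; with batch statistics (e.g. BatchNorm) or per-batch normalization the match is only approximate.
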
@@ -343,7 +346,8 @@ class RTDetrTrainer(object):
         self.scaler = torch.cuda.amp.GradScaler(enabled=args.fp16)

         # ---------------------------- Build Optimizer ----------------------------
-        cfg.base_lr = cfg.per_image_lr * args.batch_size
+        self.grad_accumulate = max(16 // args.batch_size, 1)
+        cfg.base_lr = cfg.per_image_lr * args.batch_size * self.grad_accumulate
         cfg.min_lr = cfg.base_lr * cfg.min_lr_ratio
         self.optimizer, self.start_epoch = build_rtdetr_optimizer(cfg, model, args.resume)

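The RT-DETR trainer gets the same change but targets a much smaller effective batch of 16 images. The same arithmetic with a hypothetical batch size:

    # e.g. batch_size = 4  ->  grad_accumulate = max(16 // 4, 1) = 4
    # effective batch  = 4 * 4 = 16, so base_lr = per_image_lr * 16
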
@@ -477,22 +481,24 @@ class RTDetrTrainer(object):
             outputs = model(images, targets)
             loss_dict = self.criterion(outputs, targets)
             losses = sum(loss_dict.values())
+            losses /= self.grad_accumulate
             loss_dict_reduced = distributed_utils.reduce_dict(loss_dict)

             # Backward
             self.scaler.scale(losses).backward()

             # Optimize
-            if self.cfg.clip_max_norm > 0:
-                self.scaler.unscale_(self.optimizer)
-                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=self.cfg.clip_max_norm)
-            self.scaler.step(self.optimizer)
-            self.scaler.update()
-            self.optimizer.zero_grad()
-
-            # ModelEMA
-            if self.model_ema is not None:
-                self.model_ema.update(model)
+            if (iter_i + 1) % self.grad_accumulate == 0:
+                if self.cfg.clip_max_norm > 0:
+                    self.scaler.unscale_(self.optimizer)
+                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=self.cfg.clip_max_norm)
+                self.scaler.step(self.optimizer)
+                self.scaler.update()
+                self.optimizer.zero_grad()
+
+                # ModelEMA
+                if self.model_ema is not None:
+                    self.model_ema.update(model)

             # Update log
             metric_logger.update(**loss_dict_reduced)