yjh0410 1 year ago
Parent
Commit
f705024e0a
4 changed files with 48 additions and 39 deletions
  1. config/data_config/transform_config.py (+5 -5)
  2. models/detectors/__init__.py (+7 -4)
  3. train.py (+4 -3)
  4. utils/solver/optimizer.py (+32 -27)

+ 5 - 5
config/data_config/transform_config.py

@@ -21,7 +21,7 @@ yolo_x_trans_config = {
     # Basic Augment
     'affine_params': {
         'degrees': 0.0,
-        'translate': 0.2,
+        'translate': 0.1,
         'scale': [0.1, 2.0],
         'shear': 0.0,
         'perspective': 0.0,
@@ -43,7 +43,7 @@ yolo_l_trans_config = {
     # Basic Augment
     'affine_params': {
         'degrees': 0.0,
-        'translate': 0.2,
+        'translate': 0.1,
         'scale': [0.1, 2.0],
         'shear': 0.0,
         'perspective': 0.0,
@@ -65,7 +65,7 @@ yolo_m_trans_config = {
     # Basic Augment
     'affine_params': {
         'degrees': 0.0,
-        'translate': 0.2,
+        'translate': 0.1,
         'scale': [0.1, 2.0],
         'shear': 0.0,
         'perspective': 0.0,
@@ -87,8 +87,8 @@ yolo_s_trans_config = {
     # Basic Augment
     'affine_params': {
         'degrees': 0.0,
-        'translate': 0.2,
-        'scale': [0.1, 2.0],
+        'translate': 0.1,
+        'scale': [0.5, 1.5],
         'shear': 0.0,
         'perspective': 0.0,
         'hsv_h': 0.015,
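
The four hunks above make the same change for every model scale: the random translation is halved (0.2 -> 0.1), and yolo_s additionally gets a narrower random-scale range ([0.1, 2.0] -> [0.5, 1.5]), i.e. milder affine augmentation for the smallest model. A minimal sketch of how such 'affine_params' are typically consumed by a YOLOX-style random affine; sample_affine is a hypothetical helper, not the repo's actual transform code:

import random

def sample_affine(affine_params):
    # Draw one set of affine factors from the configured ranges.
    degrees = random.uniform(-affine_params['degrees'], affine_params['degrees'])
    translate = random.uniform(-affine_params['translate'], affine_params['translate'])
    scale = random.uniform(*affine_params['scale'])
    shear = random.uniform(-affine_params['shear'], affine_params['shear'])
    return degrees, translate, scale, shear

With the new yolo_s values, an image shifts by at most +/-10% of its size and is rescaled to between 0.5x and 1.5x per sample.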

+ 7 - 4
models/detectors/__init__.py

@@ -86,12 +86,15 @@ def build_model(args,
 
         # keep training
         if args.resume and args.resume != "None":
-            print('keep training: ', args.resume)
             checkpoint = torch.load(args.resume, map_location='cpu')
             # checkpoint state dict
-            checkpoint_state_dict = checkpoint.pop("model")
-            model.load_state_dict(checkpoint_state_dict)
-            del checkpoint, checkpoint_state_dict
+            try:
+                checkpoint_state_dict = checkpoint.pop("model")
+                print('Load model from the checkpoint: ', args.resume)
+                model.load_state_dict(checkpoint_state_dict)
+                del checkpoint, checkpoint_state_dict
+            except KeyError:
+                print("No model in the given checkpoint.")
 
         return model, criterion
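
The resume branch now tolerates checkpoints without a "model" entry instead of crashing. For it to load anything, the checkpoint must be a dict carrying the keys the load paths pop. A minimal sketch of a compatible checkpoint, assuming model, optimizer, and epoch are in scope (the file name is illustrative; the keys come from this commit's load code):

import torch

torch.save({
    "model": model.state_dict(),          # popped by build_model's resume branch
    "optimizer": optimizer.state_dict(),  # popped by build_optimizer (see below)
    "epoch": epoch,                       # resume restarts from epoch + 1
}, "checkpoint.pth")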
 

+ 4 - 3
train.py

@@ -177,11 +177,11 @@ def train():
     model, criterion = build_model(args, model_cfg, device, data_cfg['num_classes'], True)
     model = model.to(device).train()
     model_without_ddp = model
-    if args.sybn and args.distributed:
-        print('use SyncBatchNorm ...')
-        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
     if args.distributed:
+        if args.sybn:
+            print('use SyncBatchNorm ...')
+            model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
         model = DDP(model, device_ids=[args.gpu], find_unused_parameters=args.find_unused_parameters)
         model_without_ddp = model.module
     ## Calculate Params & GFLOPs
     if distributed_utils.is_main_process:
@@ -204,6 +204,7 @@ def train():
         # to check whether the evaluator can work
         model_eval = model_without_ddp
         trainer.eval(model_eval)
+        return
 
     ## Start Training
     trainer.train(model)
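
One note on the ordering inside the distributed branch: the PyTorch documentation for torch.nn.SyncBatchNorm.convert_sync_batchnorm applies the conversion to the plain module before wrapping it in DistributedDataParallel, so that DDP registers the converted layers. A minimal sketch of that pattern, with model and args as in train() above:

import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP

# Convert BatchNorm -> SyncBatchNorm on the bare module first ...
model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
# ... then wrap it for distributed training.
model = DDP(model, device_ids=[args.gpu], find_unused_parameters=args.find_unused_parameters)
model_without_ddp = model.module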

+ 32 - 27
utils/solver/optimizer.py

@@ -1,5 +1,4 @@
 import torch
-import torch.nn as nn
 
 
 def build_optimizer(cfg, model, resume=None):
@@ -9,36 +8,42 @@ def build_optimizer(cfg, model, resume=None):
     print('--momentum: {}'.format(cfg['momentum']))
     print('--weight_decay: {}'.format(cfg['weight_decay']))
 
-    g = [], [], []  # optimizer parameter groups
-    bn = tuple(v for k, v in nn.__dict__.items() if 'Norm' in k)  # normalization layers, i.e. BatchNorm2d()
-    for v in model.modules():
-        if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter):  # bias (no decay)
-            g[2].append(v.bias)
-        if isinstance(v, bn):  # weight (no decay)
-            g[1].append(v.weight)
-        elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter):  # weight (with decay)
-            g[0].append(v.weight)
+    # ------------- Divide model's parameters -------------
+    param_dicts = [], [], []
+    norm_names = {"norm"} | {"norm{}".format(i) for i in range(10000)}  # attribute names 'norm', 'norm0', ..., 'norm9999'
+    for n, p in model.named_parameters():
+        if p.requires_grad:
+            if n.split(".")[-1] == "bias":
+                param_dicts[0].append(p)      # no weight decay for all layers' bias
+            else:
+                if n.split(".")[-2] in norm_names:
+                    param_dicts[1].append(p)  # no weight decay for all NormLayers' weight
+                else:
+                    param_dicts[2].append(p)  # weight decay for all Non-NormLayers' weight
 
-    if cfg['optimizer'] == 'adam':
-        optimizer = torch.optim.Adam(g[2], lr=cfg['lr0'])  # adjust beta1 to momentum
-    elif cfg['optimizer'] == 'adamw':
-        optimizer = torch.optim.AdamW(g[2], lr=cfg['lr0'], weight_decay=0.0)
-    elif cfg['optimizer'] == 'sgd':
-        optimizer = torch.optim.SGD(g[2], lr=cfg['lr0'], momentum=cfg['momentum'], nesterov=True)
+    # Build optimizer
+    if cfg['optimizer'] == 'sgd':
+        optimizer = torch.optim.SGD(param_dicts[0], lr=cfg['lr0'], momentum=cfg['momentum'], weight_decay=0.0)
+    elif cfg['optimizer'] == 'adamw':
+        optimizer = torch.optim.AdamW(param_dicts[0], lr=cfg['lr0'], weight_decay=0.0)
     else:
-        raise NotImplementedError('Optimizer {} not implemented.'.format(cfg['optimizer']))
-
-    optimizer.add_param_group({'params': g[0], 'weight_decay': cfg['weight_decay']})  # add g0 with weight_decay
-    optimizer.add_param_group({'params': g[1], 'weight_decay': 0.0})                  # add g1 (BatchNorm2d weights)
+        raise NotImplementedError("Unknown optimizer: {}".format(cfg['optimizer']))
+    
+    # Add param groups
+    optimizer.add_param_group({"params": param_dicts[1], "weight_decay": 0.0})
+    optimizer.add_param_group({"params": param_dicts[2], "weight_decay": cfg['weight_decay']})
 
     start_epoch = 0
-    if resume and resume != "None":
-        print('keep training: ', resume)
-        checkpoint = torch.load(resume, map_location='cpu')
+    if resume and resume != 'None':
+        checkpoint = torch.load(resume, map_location='cpu')
         # checkpoint state dict
-        checkpoint_state_dict = checkpoint.pop("optimizer")
-        optimizer.load_state_dict(checkpoint_state_dict)
-        start_epoch = checkpoint.pop("epoch") + 1
-        del checkpoint, checkpoint_state_dict
+        try:
+            checkpoint_state_dict = checkpoint.pop("optimizer")
+            print('Load optimizer from the checkpoint: ', resume)
+            optimizer.load_state_dict(checkpoint_state_dict)
+            start_epoch = checkpoint.pop("epoch") + 1
+            del checkpoint, checkpoint_state_dict
+        except KeyError:
+            print("No optimizer in the given checkpoint.")
                                                         
     return optimizer, start_epoch
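
The new split keys off parameter names rather than module classes: group 0 collects every ".bias", group 1 collects weights whose parent attribute is literally named "norm"/"normN" (which is what the grouping code assumes the repo's normalization layers are called), and group 2 gets everything else and is the only group with weight decay. A quick sanity check on a toy module (the toy class and config values are illustrative; the import path follows this repo's layout):

import torch.nn as nn
from utils.solver.optimizer import build_optimizer

class Block(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(3, 8, 3)
        self.norm = nn.BatchNorm2d(8)  # must be named 'norm'/'normN' for the split to see it

cfg = {'optimizer': 'sgd', 'lr0': 0.01, 'momentum': 0.9, 'weight_decay': 5e-4}
optimizer, start_epoch = build_optimizer(cfg, Block())
for i, group in enumerate(optimizer.param_groups):
    print(i, len(group['params']), group['weight_decay'])
# expected: group 0 -> the two biases (wd 0.0), group 1 -> the norm weight (wd 0.0),
# group 2 -> the conv weight (wd 5e-4)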