yjh0410 1 year ago
parent
commit
373bd025d1

+ 1 - 5
odlab/config/__init__.py

@@ -1,15 +1,11 @@
 # ----------------------- Model Config -----------------------
-from .retinanet_config import retinanet_cfg
 from .fcos_config      import fcos_cfg
 from .yolof_config     import yolof_cfg
 from .detr_config      import detr_cfg
 
 def build_config(args):
-    # RetinaNet
-    if args.model in retinanet_cfg.keys():
-        return retinanet_cfg[args.model]
     # FCOS
-    elif args.model in fcos_cfg.keys():
+    if   args.model in fcos_cfg.keys():
         return fcos_cfg[args.model]
     # YOLOF
     elif args.model in yolof_cfg.keys():

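For reference, a minimal usage sketch of the trimmed dispatcher (the import path and the Namespace field are assumptions based on how main.py uses it further below):

```python
# Hypothetical sketch: build_config() now only dispatches to FCOS, YOLOF and DETR configs.
from argparse import Namespace
from config import build_config   # assumed import path when run from inside odlab/

args = Namespace(model='fcos_r18_1x')   # any key from fcos_cfg / yolof_cfg / detr_cfg
cfg = build_config(args)                # returns the matching config dict
print(cfg['backbone'])                  # e.g. 'resnet18'
```
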
+ 11 - 1
odlab/config/detr_config.py

@@ -1,4 +1,14 @@
-# Plain DETR
+# DETR
+
+
+class DetrBaseConfig(object):
+    def __init__(self):
+        pass
+
+    def print_config(self):
+        config_dict = {key: value for key, value in self.__dict__.items() if not key.startswith('__')}
+        for k, v in config_dict.items():
+            print("{} : {}".format(k, v))
 
 detr_cfg = {
     'detr_r50':{

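The new DetrBaseConfig only carries a print_config helper for now; below is a minimal sketch of the intended attribute-style usage (the subclass and its fields are hypothetical, not part of this commit):

```python
# Hypothetical subclass to illustrate print_config(); only DetrBaseConfig is added here.
class DetrR50Config(DetrBaseConfig):
    def __init__(self):
        super().__init__()
        self.backbone  = 'resnet50'
        self.max_epoch = 150

cfg = DetrR50Config()
cfg.print_config()
# backbone : resnet50
# max_epoch : 150
```
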
+ 169 - 0
odlab/config/fcos_config.py

@@ -1,6 +1,15 @@
 # Fully Convolutional One-Stage object detector
 
 
+class FcosBaseConfig(object):
+    def __init__(self):
+        pass
+
+    def print_config(self):
+        config_dict = {key: value for key, value in self.__dict__.items() if not key.startswith('__')}
+        for k, v in config_dict.items():
+            print("{} : {}".format(k, v))
+
 fcos_cfg = {
     'fcos_r18_1x':{
         # ----------------- Model-----------------
@@ -164,4 +173,164 @@ fcos_cfg = {
         'normalize_coords': False,
     },
 
+    'fcos_rt_r18_1x':{
+        # ----------------- Model-----------------
+        ## Backbone
+        'backbone': 'resnet18',
+        'backbone_norm': 'FrozeBN',
+        'res5_dilation': False,
+        'pretrained': True,
+        'freeze_at': 1,  # freeze stem layer + layer1 of the backbone
+        'pretrained_weight': 'imagenet1k_v1',
+        'max_stride': 32,
+        'out_stride': [8, 16, 32],
+        ## Neck
+        'neck': 'basic_fpn',
+        'fpn_p6_feat': False,
+        'fpn_p7_feat': False,
+        'fpn_p6_from_c5': False,
+        ## Head
+        'head': 'fcos_head',
+        'head_dim': 256,
+        'num_cls_head': 4,
+        'num_reg_head': 4,
+        'head_act': 'relu',
+        'head_norm': 'GN',
+        ## Post-process
+        'train_topk': 1000,
+        'train_conf_thresh': 0.05,
+        'train_nms_thresh': 0.6,
+        'test_topk': 100,
+        'test_conf_thresh': 0.5,
+        'test_nms_thresh': 0.45,
+        'nms_class_agnostic': True,  # We prefer to use class-agnostic NMS in the demo.
+        # ----------------- Label Assignment -----------------
+        'matcher': 'fcos_matcher',
+        'matcher_hpy':{'center_sampling_radius': 1.5,
+                       'object_sizes_of_interest': [[-1, 64], [64, 128], [128, float('inf')]]
+                       },
+        # ----------------- Loss weight -----------------
+        ## Loss hyper-parameters
+        'focal_loss_alpha': 0.25,
+        'focal_loss_gamma': 2.0,
+        'loss_cls_weight': 1.0,
+        'loss_reg_weight': 1.0,
+        'loss_ctn_weight': 1.0,
+        # ----------------- Training -----------------
+        ## Training scheduler
+        'scheduler': '1x',
+        ## Optimizer
+        'optimizer': 'sgd',
+        'base_lr': 0.01 / 16,
+        'backbone_lr_ratio': 1.0 / 1.0,
+        'momentum': 0.9,
+        'weight_decay': 1e-4,
+        'clip_max_norm': -1.0,
+        ## LR Scheduler
+        'lr_scheduler': 'step',
+        'warmup': 'linear',
+        'warmup_iters': 500,
+        'warmup_factor': 0.00066667,
+        ## Epoch
+        'max_epoch': 36,       # 1x
+        'lr_epoch': [24, 33],  # 1x
+        # ----------------- Input -----------------
+        ## Transforms
+        'train_min_size': [256, 288, 320, 352, 384, 416, 448, 480, 512, 544, 576, 608],   # short edge of image
+        'train_max_size': 900,
+        'test_min_size': [512],
+        'test_max_size': 736,
+        ## Pixel mean & std
+        'pixel_mean': [0.485, 0.456, 0.406],
+        'pixel_std':  [0.229, 0.224, 0.225],
+        ## Transforms
+        'detr_style': False,
+        'trans_config': [
+            {'name': 'RandomHFlip'},
+            {'name': 'RandomResize'},
+        ],
+        'box_format': 'xyxy',
+        'normalize_coords': False,
+    },
+
+    'fcos_rt_r50_1x':{
+        # ----------------- Model-----------------
+        ## Backbone
+        'backbone': 'resnet50',
+        'backbone_norm': 'FrozeBN',
+        'res5_dilation': False,
+        'pretrained': True,
+        'freeze_at': 1,  # freeze stem layer + layer1 of the backbone
+        'pretrained_weight': 'imagenet1k_v1',
+        'max_stride': 32,
+        'out_stride': [8, 16, 32],
+        ## Neck
+        'neck': 'basic_fpn',
+        'fpn_p6_feat': False,
+        'fpn_p7_feat': False,
+        'fpn_p6_from_c5': False,
+        ## Head
+        'head': 'fcos_head',
+        'head_dim': 256,
+        'num_cls_head': 4,
+        'num_reg_head': 4,
+        'head_act': 'relu',
+        'head_norm': 'GN',
+        ## Post-process
+        'train_topk': 1000,
+        'train_conf_thresh': 0.05,
+        'train_nms_thresh': 0.6,
+        'test_topk': 100,
+        'test_conf_thresh': 0.5,
+        'test_nms_thresh': 0.45,
+        'nms_class_agnostic': True,  # We prefer to use class-agnostic NMS in the demo.
+        # ----------------- Label Assignment -----------------
+        'matcher': 'fcos_matcher',
+        'matcher_hpy':{'center_sampling_radius': 1.5,
+                       'object_sizes_of_interest': [[-1, 64], [64, 128], [128, float('inf')]]
+                       },
+        # ----------------- Loss weight -----------------
+        ## Loss hyper-parameters
+        'focal_loss_alpha': 0.25,
+        'focal_loss_gamma': 2.0,
+        'loss_cls_weight': 1.0,
+        'loss_reg_weight': 1.0,
+        'loss_ctn_weight': 1.0,
+        # ----------------- Training -----------------
+        ## Training scheduler
+        'scheduler': '1x',
+        ## Optimizer
+        'optimizer': 'sgd',
+        'base_lr': 0.01 / 16,
+        'backbone_lr_ratio': 1.0 / 1.0,
+        'momentum': 0.9,
+        'weight_decay': 1e-4,
+        'clip_max_norm': -1.0,
+        ## LR Scheduler
+        'lr_scheduler': 'step',
+        'warmup': 'linear',
+        'warmup_iters': 500,
+        'warmup_factor': 0.00066667,
+        ## Epoch
+        'max_epoch': 36,       # 1x
+        'lr_epoch': [24, 33],  # 1x
+        # ----------------- Input -----------------
+        ## Transforms
+        'train_min_size': [256, 288, 320, 352, 384, 416, 448, 480, 512, 544, 576, 608],   # short edge of image
+        'train_max_size': 900,
+        'test_min_size': [512],
+        'test_max_size': 736,
+        ## Pixel mean & std
+        'pixel_mean': [0.485, 0.456, 0.406],
+        'pixel_std':  [0.229, 0.224, 0.225],
+        ## Transforms
+        'detr_style': False,
+        'trans_config': [
+            {'name': 'RandomHFlip'},
+            {'name': 'RandomResize'},
+        ],
+        'box_format': 'xyxy',
+        'normalize_coords': False,
+    },
+
 }

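The new fcos_rt_* entries follow a real-time FCOS recipe: a 3-level FPN (strides 8/16/32, no P6/P7), multi-scale short edges of 256-608 capped at 900 pixels, and a 36-epoch schedule. A small lookup sketch over the dictionary above:

```python
# Sketch: the RT variants are ordinary entries in fcos_cfg, selected by model name.
cfg = fcos_cfg['fcos_rt_r18_1x']
assert cfg['out_stride'] == [8, 16, 32] and not cfg['fpn_p6_feat']
print(cfg['train_min_size'])                        # [256, 288, ..., 608] multi-scale short edges
print(cfg['test_min_size'], cfg['test_max_size'])   # [512] 736
```
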
+ 0 - 175
odlab/config/retinanet_config.py

@@ -1,175 +0,0 @@
-# RetinaNet
-
-
-retinanet_cfg = {
-    'retinanet_r18_1x':{
-        # ----------------- Model-----------------
-        ## Backbone
-        'backbone': 'resnet18',
-        'backbone_norm': 'FrozeBN',
-        'res5_dilation': False,
-        'pretrained': True,
-        'pretrained_weight': 'imagenet1k_v1',
-        'freeze_at': 1,  # freeze stem layer + layer1 of the backbone        
-        'max_stride': 128,
-        'out_stride': [8, 16, 32, 64, 128],
-        ## Neck
-        'neck': 'basic_fpn',
-        'fpn_p6_feat': True,
-        'fpn_p7_feat': True,
-        'fpn_p6_from_c5': True,
-        ## Head
-        'head': 'retinanet_head',
-        'head_dim': 256,
-        'num_cls_head': 4,
-        'num_reg_head': 4,
-        'head_act': 'relu',
-        'head_norm': None,
-        'anchor_config': {'basic_size': [[32, 32], [64, 64], [128, 128], [256, 256], [512, 512]],
-                          'aspect_ratio': [0.5, 1.0, 2.0],
-                          'area_scale': [2 ** 0, 2 ** (1. / 3.), 2 ** (2. / 3.)]},
-        ## Post-process
-        'train_topk': 1000,
-        'train_conf_thresh': 0.05,
-        'train_nms_thresh': 0.6,
-        'test_topk': 100,
-        'test_conf_thresh': 0.3,
-        'test_nms_thresh': 0.45,
-        'nms_class_agnostic': True,  # We prefer to use class-agnostic NMS in the demo.
-        # ----------------- Label Assignment -----------------
-        'matcher': 'retinanet_matcher',
-        'matcher_hpy': {'iou_thresh': [0.4, 0.5],
-                        'iou_labels': [0, -1, 1], # [negative sample, ignored sample, positive sample]
-                        'allow_low_quality_matches': True,
-                        },
-        # ----------------- Loss weight -----------------
-        ## Loss hyper-parameters
-        'focal_loss_alpha': 0.25,
-        'focal_loss_gamma': 2.0,
-        'loss_cls_weight': 1.0,
-        'loss_reg_weight': 1.0,
-        'use_giou_loss': False,
-        # ----------------- Training -----------------
-        ## Training scheduler
-        'scheduler': '1x',
-        ## Optimizer
-        'optimizer': 'sgd',
-        'base_lr': 0.01 / 16,
-        'backbone_lr_ratio': 1.0 / 1.0,
-        'momentum': 0.9,
-        'weight_decay': 1e-4,
-        'clip_max_norm': -1.0,
-        'param_dict_type': 'default',
-        ## LR Scheduler
-        'lr_scheduler': 'step',
-        'warmup': 'linear',
-        'warmup_iters': 500,
-        'warmup_factor': 0.00066667,
-        ## Epoch
-        'max_epoch': 12,      # 1x
-        'lr_epoch': [8, 11],  # 1x
-        # ----------------- Input -----------------
-        ## Transforms
-        'train_min_size': [800],   # short edge of image
-        'train_max_size': 1333,
-        'test_min_size': [800],
-        'test_max_size': 1333,
-        ## Pixel mean & std
-        'pixel_mean': [0.485, 0.456, 0.406],
-        'pixel_std':  [0.229, 0.224, 0.225],
-        ## Transforms
-        'detr_style': False,
-        'trans_config': [
-            {'name': 'RandomHFlip'},
-            {'name': 'RandomResize'},
-        ],
-        'box_format': 'xyxy',
-        'normalize_coords': False,
-    },
-
-    'retinanet_r50_1x':{
-        # ----------------- Model-----------------
-        ## Backbone
-        'backbone': 'resnet50',
-        'backbone_norm': 'FrozeBN',
-        'res5_dilation': False,
-        'pretrained': True,
-        'pretrained_weight': 'imagenet1k_v1',
-        'freeze_at': 1,  # freeze stem layer + layer1 of the backbone        
-        'max_stride': 128,
-        'out_stride': [8, 16, 32, 64, 128],
-        ## Neck
-        'neck': 'basic_fpn',
-        'fpn_p6_feat': True,
-        'fpn_p7_feat': True,
-        'fpn_p6_from_c5': True,
-        ## Head
-        'head': 'retinanet_head',
-        'head_dim': 256,
-        'num_cls_head': 4,
-        'num_reg_head': 4,
-        'head_act': 'relu',
-        'head_norm': None,
-        'anchor_config': {'basic_size': [[32, 32], [64, 64], [128, 128], [256, 256], [512, 512]],
-                          'aspect_ratio': [0.5, 1.0, 2.0],
-                          'area_scale': [2 ** 0, 2 ** (1. / 3.), 2 ** (2. / 3.)]},
-        ## Post-process
-        'train_topk': 1000,
-        'train_conf_thresh': 0.05,
-        'train_nms_thresh': 0.6,
-        'test_topk': 100,
-        'test_conf_thresh': 0.3,
-        'test_nms_thresh': 0.45,
-        'nms_class_agnostic': True,  # We prefer to use class-agnostic NMS in the demo.
-        # ----------------- Label Assignment -----------------
-        'matcher': 'retinanet_matcher',
-        'matcher_hpy': {'iou_thresh': [0.4, 0.5],
-                        'iou_labels': [0, -1, 1], # [negative sample, ignored sample, positive sample]
-                        'allow_low_quality_matches': True,
-                        },
-        # ----------------- Loss weight -----------------
-        ## Loss hyper-parameters
-        'focal_loss_alpha': 0.25,
-        'focal_loss_gamma': 2.0,
-        'loss_cls_weight': 1.0,
-        'loss_reg_weight': 1.0,
-        'use_giou_loss': False,
-        # ----------------- Training -----------------
-        ## Training scheduler
-        'scheduler': '1x',
-        ## Optimizer
-        'optimizer': 'sgd',
-        'base_lr': 0.01 / 16,
-        'backbone_lr_ratio': 1.0 / 1.0,
-        'momentum': 0.9,
-        'weight_decay': 1e-4,
-        'clip_max_norm': -1.0,
-        'param_dict_type': 'default',
-        ## LR Scheduler
-        'lr_scheduler': 'step',
-        'warmup': 'linear',
-        'warmup_iters': 500,
-        'warmup_factor': 0.00066667,
-        ## Epoch
-        'max_epoch': 12,      # 1x
-        'lr_epoch': [8, 11],  # 1x
-        # ----------------- Input -----------------
-        ## Transforms
-        'train_min_size': [800],   # short edge of image
-        'train_max_size': 1333,
-        'test_min_size': [800],
-        'test_max_size': 1333,
-        ## Pixel mean & std
-        'pixel_mean': [0.485, 0.456, 0.406],
-        'pixel_std':  [0.229, 0.224, 0.225],
-        ## Transforms
-        'detr_style': False,
-        'trans_config': [
-            {'name': 'RandomHFlip'},
-            {'name': 'RandomResize'},
-        ],
-        'box_format': 'xyxy',
-        'normalize_coords': False,
-    },
-
-}

+ 9 - 0
odlab/config/yolof_config.py

@@ -1,6 +1,15 @@
 # You Only Look One-level Feature (YOLOF)
 
 
+class YolofBaseConfig(object):
+    def __init__(self):
+        pass
+
+    def print_config(self):
+        config_dict = {key: value for key, value in self.__dict__.items() if not key.startswith('__')}
+        for k, v in config_dict.items():
+            print("{} : {}".format(k, v))
+
 yolof_cfg = {
     # --------------- C5 level ---------------
     'yolof_r18_c5_1x':{

+ 6 - 6
odlab/datasets/__init__.py

@@ -6,15 +6,15 @@ from .coco import build_coco, coco_labels, coco_indexs
 from .transforms import build_transform
 
 
-def build_dataset(args, transform=None, is_train=False):
+def build_dataset(args, cfg, transform=None, is_train=False):
     if args.dataset == 'coco':
         dataset = build_coco(args, transform, is_train)
-        dataset_info = {
-            'class_labels': dataset.coco_labels,
-            'num_classes': 80
-        }
+        class_labels = coco_labels
+        num_classes  = 80
+    cfg.class_labels = class_labels
+    cfg.num_classes  = num_classes
 
-    return dataset, dataset_info
+    return dataset
 
 def build_dataloader(args, dataset, batch_size, collate_fn, is_train=False):
     if args.distributed:

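build_dataset() now attaches the dataset metadata to the config instead of returning a separate dataset_info dict. A minimal sketch of the new call pattern (assuming cfg supports attribute assignment, as the new *BaseConfig classes suggest; a plain config dict would need item access instead):

```python
# Sketch of the new contract: class_labels / num_classes are set on cfg as a side effect.
transform = build_transform(cfg, is_train=True)
dataset   = build_dataset(args, cfg, transform, is_train=True)
print(cfg.num_classes)        # 80 for COCO
print(cfg.class_labels[:3])   # first few COCO category names
```
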
+ 16 - 23
odlab/engine.py

@@ -21,27 +21,24 @@ def train_one_epoch(cfg,
                     epoch       : int,
                     vis_target  : bool,
                     warmup_lr_scheduler,
-                    class_labels = None,
-                    model_ema    = None,
                     debug       :bool = False
                     ):
     model.train()
     criterion.train()
     metric_logger = MetricLogger(delimiter="  ")
     metric_logger.add_meter('lr', SmoothedValue(window_size=1, fmt='{value:.6f}'))
-    header = 'Epoch: [{} / {}]'.format(epoch, cfg['max_epoch'])
+    header = 'Epoch: [{} / {}]'.format(epoch, cfg.max_epoch)
     epoch_size = len(data_loader)
     print_freq = 10
 
-    iteration = 0
-    for samples, targets in metric_logger.log_every(data_loader, print_freq, header):
-        ni = iteration + epoch * epoch_size
+    for iter_i, (samples, targets) in metric_logger.log_every(data_loader, print_freq, header):
+        ni = iter_i + epoch * epoch_size
         # WarmUp
-        if ni < cfg['warmup_iters']:
+        if ni < cfg.warmup_iters:
             warmup_lr_scheduler(ni, optimizer)
-        elif ni == cfg['warmup_iters']:
+        elif ni == cfg.warmup_iters:
             print('Warmup stage is over.')
-            warmup_lr_scheduler.set_lr(optimizer, cfg['base_lr'])
+            warmup_lr_scheduler.set_lr(optimizer, cfg.base_lr)
 
         # To device
         images, masks = samples
@@ -51,7 +48,7 @@ def train_one_epoch(cfg,
 
         # Visualize train targets
         if vis_target:
-            vis_data(images, targets, masks, class_labels, cfg['normalize_coords'], cfg['box_format'])
+            vis_data(images, targets, masks, cfg.class_labels, cfg.normalize_coords, cfg.box_format)
 
         # Inference
         outputs = model(images, masks, targets)
@@ -60,13 +57,11 @@ def train_one_epoch(cfg,
         loss_dict = criterion(outputs, targets)
         loss_weight_dict = criterion.weight_dict
         losses = sum(loss_dict[k] * loss_weight_dict[k] for k in loss_dict.keys() if k in loss_weight_dict)
+        loss_value = losses.item()
+        losses /= cfg.grad_accumulate
 
         # Reduce losses over all GPUs for logging purposes
         loss_dict_reduced = distributed_utils.reduce_dict(loss_dict)
-        loss_dict_reduced_scaled = {k: v * loss_weight_dict[k] for k, v in loss_dict_reduced.items() if k in loss_weight_dict}
-        losses_reduced_scaled = sum(loss_dict_reduced_scaled.values())
-
-        loss_value = losses_reduced_scaled.item()
 
         # Check loss
         if not math.isfinite(loss_value):
@@ -75,18 +70,16 @@ def train_one_epoch(cfg,
             sys.exit(1)
 
         # Backward
-        optimizer.zero_grad()
         losses.backward()
-        if cfg['clip_max_norm'] > 0:
-            torch.nn.utils.clip_grad_norm_(model.parameters(), cfg['clip_max_norm'])
-        optimizer.step()
-        iteration += 1
+        if cfg.clip_max_norm > 0:
+            torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.clip_max_norm)
 
-        # ema
-        if model_ema is not None:
-            model_ema.update(model)
+        # Optimize
+        if (iter_i + 1) % cfg.grad_accumulate == 0:
+            optimizer.step()
+            optimizer.zero_grad()
 
-        metric_logger.update(loss=loss_value, **loss_dict_reduced_scaled)
+        metric_logger.update(loss=loss_value, **loss_dict_reduced)
         metric_logger.update(lr=optimizer.param_groups[0]["lr"])
 
         if debug:

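The training loop now scales the loss by 1/grad_accumulate and only steps and zeroes the optimizer every grad_accumulate iterations, replacing the per-iteration step and the EMA update. A self-contained sketch of that pattern (the model, data, and accumulation value are placeholders):

```python
import torch
import torch.nn.functional as F

# Minimal gradient-accumulation sketch mirroring the new engine logic.
model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
grad_accumulate = 2   # stands in for cfg.grad_accumulate

for iter_i in range(8):
    x, y = torch.randn(3, 4), torch.randn(3, 2)
    loss = F.mse_loss(model(x), y)
    loss = loss / grad_accumulate       # scale so accumulated grads match one large batch
    loss.backward()                     # gradients accumulate across iterations
    if (iter_i + 1) % grad_accumulate == 0:
        optimizer.step()
        optimizer.zero_grad()
```
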
+ 2 - 2
odlab/evaluator/__init__.py

@@ -1,10 +1,10 @@
 from evaluator.coco_evaluator import COCOAPIEvaluator
 
 
-def build_evluator(args, cfg, device, testset=False):
+def build_evluator(args, cfg, device):
     evaluator = None
     # COCO Evaluator
     if args.dataset == 'coco':
-        evaluator = COCOAPIEvaluator(args, cfg, device, testset)
+        evaluator = COCOAPIEvaluator(args, cfg, device)
 
     return evaluator

+ 5 - 7
odlab/evaluator/coco_evaluator.py

@@ -4,23 +4,21 @@ import contextlib
 import torch
 from pycocotools.cocoeval import COCOeval
 
-from datasets import build_dataset, build_transform
-
+from datasets import build_transform
+from datasets.coco import build_coco
 
 class COCOAPIEvaluator():
-    def __init__(self, args, cfg, device, testset=False):
+    def __init__(self, args, cfg, device):
         # ----------------- Basic parameters -----------------
-        self.ddp_mode = True if args.distributed else False
-        self.image_set = 'test2017' if testset else 'val2017'
+        self.image_set = 'val2017'
         self.device = device
-        self.testset = testset
         # ----------------- Metrics -----------------
         self.map = 0.
         self.ap50_95 = 0.
         self.ap50 = 0.
         # ----------------- Dataset -----------------
         self.transform = build_transform(cfg, is_train=False)
-        self.dataset, self.dataset_info = build_dataset(args, self.transform, is_train=False)
+        self.dataset = build_coco(args, self.transform, is_train=False)
 
 
     @torch.no_grad()

+ 20 - 52
odlab/main.py

@@ -11,7 +11,6 @@ from torch.nn.parallel import DistributedDataParallel as DDP
 
 from utils import distributed_utils
 from utils.misc import compute_flops, collate_fn
-from utils.misc import get_param_dict, ModelEMA
 from utils.optimizer import build_optimizer
 from utils.lr_scheduler import build_wp_lr_scheduler, build_lr_scheduler
 
@@ -36,12 +35,8 @@ def parse_args():
     # Model
     parser.add_argument('-m', '--model', default='yolof_r18_c5_1x',
                         help='build object detector')
-    parser.add_argument('-p', '--pretrained', default=None, type=str,
-                        help='load pretrained weight')
     parser.add_argument('-r', '--resume', default=None, type=str,
                         help='keep training')
-    parser.add_argument('--ema', default=None, type=str,
-                        help='use Model EMA trick.')
     # Dataset
     parser.add_argument('--root', default='/Users/liuhaoran/Desktop/python_work/object-detection/dataset/COCO/',
                         help='data root')
@@ -53,8 +48,6 @@ def parse_args():
     parser.add_argument('--num_workers', default=2, type=int, 
                         help='Number of workers used in dataloading')
     # Epoch
-    parser.add_argument('--eval_epoch', default=2, type=int,
-                        help='interval between evaluations')
     parser.add_argument('--save_folder', default='weights/', type=str, 
                         help='path to save weight')
     parser.add_argument('--eval_first', action="store_true", default=False,
@@ -68,8 +61,6 @@ def parse_args():
                         help='number of distributed processes')
     parser.add_argument('--sybn', action='store_true', default=False, 
                         help='use sybn.')
-    parser.add_argument('--find_unused_parameters', action='store_true', default=False, 
-                        help='set find_unused_parameters as True.')
     # Debug setting
     parser.add_argument('--debug', action='store_true', default=False, 
                         help='debug codes.')
@@ -93,7 +84,6 @@ def main():
     path_to_save = os.path.join(args.save_folder, args.dataset, args.model)
     os.makedirs(path_to_save, exist_ok=True)
 
-
     # ---------------------------- Build DDP ----------------------------
     distributed_utils.init_distributed_mode(args)
     print("git:\n  {}\n".format(distributed_utils.get_sha()))
@@ -101,7 +91,6 @@ def main():
     print('World size: {}'.format(world_size))
     per_gpu_batch = args.batch_size // world_size
 
-
     # ---------------------------- Build CUDA ----------------------------
     if args.cuda and torch.cuda.is_available():
         print('use cuda')
@@ -109,28 +98,23 @@ def main():
     else:
         device = torch.device("cpu")
 
-
     # ---------------------------- Fix random seed ----------------------------
     fix_random_seed(args)
 
-
     # ---------------------------- Build config ----------------------------
     cfg = build_config(args)
     print('Model config: ', cfg)
 
-
     # ---------------------------- Build Dataset ----------------------------
     transforms = build_transform(cfg, is_train=True)
-    dataset, dataset_info = build_dataset(args, transforms, is_train=True)
-
+    dataset = build_dataset(args, cfg, transforms, is_train=True)
 
     # ---------------------------- Build Dataloader ----------------------------
     train_loader = build_dataloader(args, dataset, per_gpu_batch, collate_fn, is_train=True)
 
-
     # ---------------------------- Build model ----------------------------
     ## Build model
-    model, criterion = build_model(args, cfg, dataset_info['num_classes'], is_val=True)
+    model, criterion = build_model(args, cfg, cfg.num_classes, is_val=True)
     model.to(device)
     model_without_ddp = model
     ## Calculate Params & GFLOPs
@@ -139,51 +123,35 @@ def main():
         model_copy.trainable = False
         model_copy.eval()
         compute_flops(model=model_copy,
-                      min_size=cfg['test_min_size'],
-                      max_size=cfg['test_max_size'],
+                      min_size=cfg.test_min_size,
+                      max_size=cfg.test_max_size,
                       device=device)
         del model_copy
     if args.distributed:
         dist.barrier()
 
-
     # ---------------------------- Build Optimizer ----------------------------
-    cfg['base_lr'] = cfg['base_lr'] * args.batch_size
-    param_dicts = None
-    if 'param_dict_type' in cfg.keys() and cfg['param_dict_type'] != 'default':
-        print("- Param dict type: {}".format(cfg['param_dict_type']))
-        param_dicts = get_param_dict(model_without_ddp, cfg)
-    optimizer, start_epoch = build_optimizer(cfg, model_without_ddp, param_dicts, args.resume)
-
+    cfg.grad_accumulate = max(16 // args.batch_size, 1)
+    cfg.base_lr = cfg.per_image_lr * args.batch_size * cfg.grad_accumulate
+    optimizer, start_epoch = build_optimizer(cfg, model_without_ddp, args.resume)
 
     # ---------------------------- Build LR Scheduler ----------------------------
-    wp_lr_scheduler = build_wp_lr_scheduler(cfg, cfg['base_lr'])
+    wp_lr_scheduler = build_wp_lr_scheduler(cfg, cfg.base_lr)
     lr_scheduler    = build_lr_scheduler(cfg, optimizer, args.resume)
 
-
-    # ---------------------------- Build Model EMA ----------------------------
-    model_ema = None
-    if 'use_ema' in cfg.keys() and cfg['use_ema']:
-        print("Build Model EMA for {}".format(args.model))
-        model_ema = ModelEMA(cfg, model, start_epoch * len(train_loader))
-
-
     # ---------------------------- Build DDP model ----------------------------
     if args.distributed:
-        model = DDP(model, device_ids=[args.gpu], find_unused_parameters=args.find_unused_parameters)
+        model = DDP(model, device_ids=[args.gpu])
         model_without_ddp = model.module
 
-
     # ---------------------------- Build Evaluator ----------------------------
     evaluator = build_evluator(args, cfg, device)
 
-
     # ----------------------- Eval before training -----------------------
     if args.eval_first and distributed_utils.is_main_process():
         evaluator.evaluate(model_without_ddp)
         return
 
-
     # ----------------------- Training -----------------------
     print("Start training")
     best_map = -1.
@@ -201,8 +169,6 @@ def main():
                         epoch,
                         args.vis_tgt,
                         wp_lr_scheduler,
-                        dataset_info['class_labels'],
-                        model_ema=model_ema,
                         debug=args.debug)
         
         # LR Scheduler
@@ -210,23 +176,25 @@ def main():
 
         # Evaluate
         if distributed_utils.is_main_process():
-            model_eval = model_ema.ema if model_ema is not None else model_without_ddp
+            model_eval = model_without_ddp
+            to_save = False
             if (epoch % args.eval_epoch) == 0 or (epoch == cfg['max_epoch'] - 1):
                 if evaluator is None:
-                    cur_map = 0.
+                    to_save = True
                 else:
                     evaluator.evaluate(model_eval)
-                    cur_map = evaluator.map
-                # Save model
-                if cur_map > best_map:
-                    # update best-map
-                    best_map = cur_map
+                    # Save model
+                    if evaluator.map >= best_map:
+                        best_map = evaluator.map
+                        to_save = True
+
+                if to_save:
                     # save model
-                    print('Saving state, epoch:', epoch + 1)
+                    print('Saving state, epoch:', epoch)
                     torch.save({'model':        model_eval.state_dict(),
                                 'optimizer':    optimizer.state_dict(),
                                 'lr_scheduler': lr_scheduler.state_dict(),
-                                'mAP':          round(cur_map*100, 1),
+                                'mAP':          round(best_map*100, 1),
                                 'epoch':        epoch,
                                 'args':         args}, 
                                 os.path.join(path_to_save, '{}_best.pth'.format(args.model)))

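The optimizer setup now derives the effective learning rate from a per-image rate, the per-step batch size, and the accumulation factor (the config files above still store that per-image value under 'base_lr': 0.01 / 16, while main.py reads it as cfg.per_image_lr). A worked example under that assumption:

```python
# Worked example of the new LR scaling; the numbers mirror the FCOS configs.
per_image_lr    = 0.01 / 16                   # per-image rate from the config
batch_size      = 8                           # args.batch_size on this machine
grad_accumulate = max(16 // batch_size, 1)    # -> 2, so the effective batch size is 16
base_lr = per_image_lr * batch_size * grad_accumulate
print(grad_accumulate, base_lr)               # 2 0.01
```
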
+ 3 - 5
odlab/models/backbone/__init__.py

@@ -1,15 +1,13 @@
 from .resnet           import build_resnet
-from .swin_transformer import build_swin_transformer
 
 
 def build_backbone(cfg):
     print('==============================')
     print('Backbone: {}'.format(cfg['backbone']))
     # ResNet
-    if cfg['backbone'] in ['resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152']:
+    if "resnet" in cfg.backbone:
         return build_resnet(cfg)
-    # SwinTransformer
-    elif cfg['backbone'] in ['swin_T_224_1k', 'swin_S_224_22k', 'swin_B_224_22k', 'swin_B_384_22k', 'swin_L_224_22k', 'swin_L_384_22k']:
-        return build_swin_transformer(cfg)
+    else:
+        raise NotImplementedError("unknown backbone: {}".format(cfg.backbone))
     
                            

+ 0 - 222
odlab/models/basic/conv.py

@@ -1,11 +1,4 @@
-import math
-from typing import List
-
-import torch
 import torch.nn as nn
-import torch.nn.functional as F
-
-from .norm import LayerNorm2D
 
 
 def get_conv2d(c1, c2, k, p, s, d, g):
@@ -80,218 +73,3 @@ class ConvModule(nn.Module):
 
     def forward(self, x):
         return self.convs(x)
-
-class BasicConv(nn.Module):
-    def __init__(self, 
-                 in_dim,                   # in channels
-                 out_dim,                  # out channels 
-                 kernel_size=1,            # kernel size 
-                 padding=0,                # padding
-                 stride=1,                 # stride
-                 dilation=1,               # dilation
-                 act_type  :str = 'lrelu', # activation
-                 norm_type :str = 'BN',    # normalization
-                 depthwise :bool = False
-                ):
-        super(BasicConv, self).__init__()
-        self.depthwise = depthwise
-        if not depthwise:
-            self.conv = get_conv2d(in_dim, out_dim, k=kernel_size, p=padding, s=stride, d=dilation, g=1)
-            self.norm = get_norm(norm_type, out_dim)
-        else:
-            self.conv1 = get_conv2d(in_dim, in_dim, k=kernel_size, p=padding, s=stride, d=dilation, g=in_dim)
-            self.norm1 = get_norm(norm_type, in_dim)
-            self.conv2 = get_conv2d(in_dim, out_dim, k=kernel_size, p=padding, s=stride, d=dilation, g=1)
-            self.norm2 = get_norm(norm_type, out_dim)
-        self.act  = get_activation(act_type)
-
-    def forward(self, x):
-        if not self.depthwise:
-            return self.act(self.norm(self.conv(x)))
-        else:
-            # Depthwise conv
-            x = self.norm1(self.conv1(x))
-            # Pointwise conv
-            x = self.norm2(self.conv2(x))
-            return x
-
-class UpSampleWrapper(nn.Module):
-    """Upsample last feat map to specific stride."""
-    def __init__(self, in_dim, upsample_factor):
-        super(UpSampleWrapper, self).__init__()
-        # ---------- Basic parameters ----------
-        self.upsample_factor = upsample_factor
-
-        # ---------- Network parameters ----------
-        if upsample_factor == 1:
-            self.upsample = nn.Identity()
-        else:
-            scale = int(math.log2(upsample_factor))
-            dim = in_dim
-            layers = []
-            for _ in range(scale-1):
-                layers += [
-                    nn.ConvTranspose2d(dim, dim, kernel_size=2, stride=2),
-                    LayerNorm2D(dim),
-                    nn.GELU()
-                ]
-            layers += [nn.ConvTranspose2d(dim, dim, kernel_size=2, stride=2)]
-            self.upsample = nn.Sequential(*layers)
-            self.out_dim = dim
-
-    def forward(self, x):
-        x = self.upsample(x)
-
-        return x
-
-
-# ----------------- RepCNN module -----------------
-class RepVggBlock(nn.Module):
-    def __init__(self, in_dim, out_dim, act_type='relu', norm_type='BN'):
-        super().__init__()
-        # ----------------- Basic parameters -----------------
-        self.in_dim = in_dim
-        self.out_dim = out_dim
-        # ----------------- Network parameters -----------------
-        self.conv1 = BasicConv(in_dim, out_dim, kernel_size=3, padding=1, act_type=None, norm_type=norm_type)
-        self.conv2 = BasicConv(in_dim, out_dim, kernel_size=1, padding=0, act_type=None, norm_type=norm_type)
-        self.act   = get_activation(act_type) 
-
-    def forward(self, x):
-        if hasattr(self, 'conv'):
-            y = self.conv(x)
-        else:
-            y = self.conv1(x) + self.conv2(x)
-
-        return self.act(y)
-
-    def convert_to_deploy(self):
-        if not hasattr(self, 'conv'):
-            self.conv = nn.Conv2d(self.in_dim, self.out_dim, 3, 1, padding=1)
-
-        kernel, bias = self.get_equivalent_kernel_bias()
-        self.conv.weight.data = kernel
-        self.conv.bias.data = bias 
-        # self.__delattr__('conv1')
-        # self.__delattr__('conv2')
-
-    def get_equivalent_kernel_bias(self):
-        kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1)
-        kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2)
-        
-        return kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1), bias3x3 + bias1x1
-
-    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
-        if kernel1x1 is None:
-            return 0
-        else:
-            return F.pad(kernel1x1, [1, 1, 1, 1])
-
-    def _fuse_bn_tensor(self, branch: BasicConv):
-        if branch is None:
-            return 0, 0
-        kernel = branch.conv.weight
-        running_mean = branch.norm.running_mean
-        running_var = branch.norm.running_var
-        gamma = branch.norm.weight
-        beta = branch.norm.bias
-        eps = branch.norm.eps
-        std = (running_var + eps).sqrt()
-        t = (gamma / std).reshape(-1, 1, 1, 1)
-        return kernel * t, beta - running_mean * gamma / std
-
-class RepCSPLayer(nn.Module):
-    def __init__(self,
-                 in_dim     :int   = 256,
-                 out_dim    :int   = 256,
-                 num_blocks :int   = 3,
-                 expansion  :float = 1.0,
-                 act_type   :str   = "relu",
-                 norm_type  :str   = "GN",):
-        super(RepCSPLayer, self).__init__()
-        # ----------------- Basic parameters -----------------
-        inter_dim = int(out_dim * expansion)
-        # ----------------- Network parameters -----------------
-        self.conv1 = BasicConv(in_dim, inter_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
-        self.conv2 = BasicConv(in_dim, inter_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
-        self.bottlenecks = nn.Sequential(*[
-            RepVggBlock(inter_dim, inter_dim, act_type, norm_type) for _ in range(num_blocks)
-        ])
-        if inter_dim != out_dim:
-            self.conv3 = BasicConv(inter_dim, out_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
-        else:
-            self.conv3 = nn.Identity()
-
-    def forward(self, x):
-        x_1 = self.conv1(x)
-        x_1 = self.bottlenecks(x_1)
-        x_2 = self.conv2(x)
-
-        return self.conv3(x_1 + x_2)
-
-
-# ----------------- CNN module -----------------
-class YoloBottleneck(nn.Module):
-    def __init__(self,
-                 in_dim       :int,
-                 out_dim      :int,
-                 kernel_size  :List  = [1, 3],
-                 expand_ratio :float = 0.5,
-                 shortcut     :bool  = False,
-                 act_type     :str   = 'silu',
-                 norm_type    :str   = 'BN',
-                 depthwise    :bool  = False,
-                 ) -> None:
-        super(YoloBottleneck, self).__init__()
-        inter_dim = int(out_dim * expand_ratio)
-        # ----------------- Network setting -----------------
-        self.conv_layer1 = BasicConv(in_dim, inter_dim,
-                                     kernel_size=kernel_size[0], padding=kernel_size[0]//2, stride=1,
-                                     act_type=act_type, norm_type=norm_type)
-        self.conv_layer2 = BasicConv(inter_dim, out_dim,
-                                     kernel_size=kernel_size[1], padding=kernel_size[1]//2, stride=1,
-                                     act_type=act_type, norm_type=norm_type, depthwise=depthwise)
-        self.shortcut = shortcut and in_dim == out_dim
-
-    def forward(self, x):
-        h = self.conv_layer2(self.conv_layer1(x))
-
-        return x + h if self.shortcut else h
-
-class ELANLayer(nn.Module):
-    def __init__(self,
-                 in_dim,
-                 out_dim,
-                 expand_ratio :float = 0.5,
-                 num_blocks   :int   = 1,
-                 shortcut     :bool  = False,
-                 act_type     :str   = 'silu',
-                 norm_type    :str   = 'BN',
-                 depthwise    :bool  = False,
-                 ) -> None:
-        super(ELANLayer, self).__init__()
-        self.inter_dim = round(out_dim * expand_ratio)
-        self.input_proj  = BasicConv(in_dim, self.inter_dim * 2, kernel_size=1, act_type=act_type, norm_type=norm_type)
-        self.output_proj = BasicConv((2 + num_blocks) * self.inter_dim, out_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
-        self.module = nn.ModuleList([YoloBottleneck(self.inter_dim,
-                                                    self.inter_dim,
-                                                    kernel_size  = [3, 3],
-                                                    expand_ratio = 1.0,
-                                                    shortcut     = shortcut,
-                                                    act_type     = act_type,
-                                                    norm_type    = norm_type,
-                                                    depthwise    = depthwise)
-                                                    for _ in range(num_blocks)])
-
-    def forward(self, x):
-        # Input proj
-        x1, x2 = torch.chunk(self.input_proj(x), 2, dim=1)
-        out = list([x1, x2])
-
-        # Bottleneck
-        out.extend(m(out[-1]) for m in self.module)
-
-        # Output proj
-        out = self.output_proj(torch.cat(out, dim=1))
-
-        return out

+ 0 - 15
odlab/models/basic/norm.py

@@ -38,18 +38,3 @@ class FrozenBatchNorm2d(torch.nn.Module):
         scale = w * (rv + eps).rsqrt()
         bias = b - rm * scale
         return x * scale + bias
-
-class LayerNorm2D(nn.Module):
-    def __init__(self, normalized_shape, norm_layer=nn.LayerNorm):
-        super().__init__()
-        self.ln = norm_layer(normalized_shape) if norm_layer is not None else nn.Identity()
-
-    def forward(self, x):
-        """
-        x: N C H W
-        """
-        x = x.permute(0, 2, 3, 1)
-        x = self.ln(x)
-        x = x.permute(0, 3, 1, 2)
-        return x
-    

+ 4 - 8
odlab/models/detectors/__init__.py

@@ -1,19 +1,15 @@
 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
 import torch
 
-from .retinanet.build import build_retinanet
-from .fcos.build      import build_fcos
-from .yolof.build     import build_yolof
-from .detr.build      import build_detr
+from .fcos.build  import build_fcos
+from .yolof.build import build_yolof
+from .detr.build  import build_detr
 
 
 def build_model(args, cfg, num_classes=80, is_val=False):
     # ------------ build object detector ------------
-    ## RetinaNet    
-    if 'retinanet' in args.model:
-        model, criterion = build_retinanet(cfg, num_classes, is_val)
     ## FCOS    
-    elif 'fcos' in args.model:
+    if 'fcos' in args.model:
         model, criterion = build_fcos(cfg, num_classes, is_val)
     ## YOLOF    
     elif 'yolof' in args.model:

+ 1 - 1
odlab/models/detectors/fcos/fcos.py

@@ -96,7 +96,7 @@ class FCOS(nn.Module):
 
         return bboxes, scores, labels
 
-    def forward(self, src, src_mask=None, targets=None):
+    def forward(self, src, src_mask=None):
         # ---------------- Backbone ----------------
         pyramid_feats = self.backbone(src)
 

+ 0 - 55
odlab/models/detectors/retinanet/README.md

@@ -1,55 +0,0 @@
-# RetinaNet
-
-Our `RetinaNet-R50-1x` baseline on COCO-val:
-```Shell
-
-```
-
-- ImageNet-1K_V1 pretrained
-
-| Model             |  scale     |  FPS  | AP<sup>val<br>0.5:0.95 | AP<sup>val<br>0.5 | Weight | Logs  |
-| ------------------| ---------- | ----- | ---------------------- |  ---------------  | ------ | ----- |
-| RetinaNet_R18_1x  |  800,1333  |       |          30.5          |        48.1       | [ckpt](https://github.com/yjh0410/ODLab/releases/download/detection_weights/retinanet_r18_1x_coco.pth) | [log](https://github.com/yjh0410/ODLab/releases/download/detection_weights/RetinaNet-R18-1x.txt) |
-| RetinaNet_R50_1x  |  800,1333  |       |                        |                   |  |  |
-
-
-## Train RetinaNet
-### Single GPU
-Taking training **RetinaNet_R18_1x** on COCO as the example,
-```Shell
-python main.py --cuda -d coco --root path/to/coco -m retinanet_r18_1x --batch_size 16 --eval_epoch 2
-```
-
-### Multi GPU
-Taking training **RetinaNet_R18_1x** on COCO as the example,
-```Shell
-python -m torch.distributed.run --nproc_per_node=8 train.py --cuda -dist -d coco --root path/to/coco -m retinanet_r18_1x --batch_size 16 --eval_epoch 2 
-```
-
-## Test RetinaNet
-Taking testing **RetinaNet_R18_1x** on COCO-val as the example,
-```Shell
-python test.py --cuda -d coco --root path/to/coco -m retinanet_r18_1x --weight path/to/retinanet_r18_1x.pth -vt 0.4 --show 
-```
-
-## Evaluate RetinaNet
-Taking evaluating **RetinaNet_R18_1x** on COCO-val as the example,
-```Shell
-python main.py --cuda -d coco --root path/to/coco -m retinanet_r18_1x --resume path/to/retinanet_r18_1x.pth --eval_first
-```
-
-## Demo
-### Detect with Image
-```Shell
-python demo.py --mode image --path_to_img path/to/image_dirs/ --cuda -m retinanet_r18_1x --weight path/to/weight -vt 0.4 --show
-```
-
-### Detect with Video
-```Shell
-python demo.py --mode video --path_to_vid path/to/video --cuda -m retinanet_r18_1x --weight path/to/weight -vt 0.4 --show --gif
-```
-
-### Detect with Camera
-```Shell
-python demo.py --mode camera --cuda -m retinanet_r18_1x --weight path/to/weight -vt 0.4 --show --gif
-```

+ 0 - 24
odlab/models/detectors/retinanet/build.py

@@ -1,24 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding:utf-8 -*-
-
-from .criterion import build_criterion
-from .retinanet import RetinaNet
-
-
-# build RetinaNet
-def build_retinanet(cfg, num_classes=80, is_val=False):
-    # -------------- Build RetinaNet --------------
-    model = RetinaNet(cfg         = cfg,
-                      num_classes = num_classes,
-                      conf_thresh = cfg['train_conf_thresh'] if is_val else cfg['test_conf_thresh'],
-                      nms_thresh  = cfg['train_nms_thresh']  if is_val else cfg['test_nms_thresh'],
-                      topk        = cfg['train_topk']        if is_val else cfg['test_topk'],
-                      ca_nms      = False if is_val else cfg['nms_class_agnostic'])
-            
-    # -------------- Build Criterion --------------
-    criterion = None
-    if is_val:
-        # build criterion for training
-        criterion = build_criterion(cfg, num_classes)
-
-    return model, criterion

+ 0 - 136
odlab/models/detectors/retinanet/criterion.py

@@ -1,136 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-from utils.box_ops import box_cxcywh_to_xyxy, generalized_box_iou
-from utils.misc import sigmoid_focal_loss
-from utils.distributed_utils import get_world_size, is_dist_avail_and_initialized
-
-from .matcher import RetinaNetMatcher
-
-
-class Criterion(nn.Module):
-    def __init__(self, cfg, num_classes=80):
-        super().__init__()
-        # ------------- Basic parameters -------------
-        self.cfg = cfg
-        self.num_classes = num_classes
-        # ------------- Focal loss -------------
-        self.alpha = cfg['focal_loss_alpha']
-        self.gamma = cfg['focal_loss_gamma']
-        # ------------- Loss weight -------------
-        self.weight_dict = {'loss_cls': cfg['loss_cls_weight'],
-                            'loss_reg': cfg['loss_reg_weight']}
-        # ------------- Matcher -------------
-        self.matcher_cfg = cfg['matcher_hpy']
-        self.matcher = RetinaNetMatcher(num_classes,
-                                        iou_threshold=self.matcher_cfg['iou_thresh'],
-                                        iou_labels=self.matcher_cfg['iou_labels'],
-                                        allow_low_quality_matches=self.matcher_cfg['allow_low_quality_matches']
-                                        )
-
-    def loss_labels(self, pred_cls, tgt_cls, num_boxes):
-        """
-            pred_cls: (Tensor) [N, C]
-            tgt_cls:  (Tensor) [N, C]
-        """
-        # cls loss: [V, C]
-        loss_cls = sigmoid_focal_loss(pred_cls, tgt_cls, self.alpha, self.gamma)
-
-        return loss_cls.sum() / num_boxes
-
-    def loss_bboxes(self, pred_reg=None, pred_box=None, tgt_box=None, anchors=None, num_boxes=1, use_giou=False):
-        """
-            pred_reg: (Tensor) [Nq, 4]
-            tgt_box:  (Tensor) [Nq, 4]
-            anchors:  (Tensor) [Nq, 4]
-        """
-        # GIoU loss
-        if use_giou:
-            pred_giou = generalized_box_iou(pred_box, tgt_box)  # [N, M]
-            loss_reg = 1. - torch.diag(pred_giou)
-        
-        # L1 loss
-        else:
-            # xyxy -> cxcy&bwbh
-            tgt_cxcy = (tgt_box[..., :2] + tgt_box[..., 2:]) * 0.5
-            tgt_bwbh = tgt_box[..., 2:] - tgt_box[..., :2]
-            # encode gt box
-            tgt_offsets = (tgt_cxcy - anchors[..., :2]) / anchors[..., 2:]
-            tgt_sizes = torch.log(tgt_bwbh / anchors[..., 2:])
-            tgt_box_encode = torch.cat([tgt_offsets, tgt_sizes], dim=-1)
-            # compute l1 loss
-            loss_reg = F.l1_loss(pred_reg, tgt_box_encode, reduction='none')
-
-        return loss_reg.sum() / num_boxes
-
-    def forward(self, outputs, targets):
-        """
-            outputs['pred_cls']: (Tensor) [B, M, C]
-            outputs['pred_reg']: (Tensor) [B, M, 4]
-            outputs['strides']: (List) [8, 16, 32, ...] stride of the model output
-            targets: (List) [dict{'boxes': [...], 
-                                 'labels': [...], 
-                                 'orig_size': ...}, ...]
-            anchors: (Tensor) [M, 4]
-        """
-        # -------------------- Pre-process --------------------
-        cls_preds = torch.cat(outputs['pred_cls'], dim=1).view(-1, self.num_classes)
-        reg_preds = torch.cat(outputs['pred_reg'], dim=1).view(-1, 4)
-        box_preds = torch.cat(outputs['pred_box'], dim=1).view(-1, 4)
-        masks = ~torch.cat(outputs['mask'], dim=1).view(-1)
-        B = len(targets)
-       
-        # process anchor boxes
-        anchor_boxes = torch.cat(outputs['anchors'])
-        anchor_boxes = anchor_boxes[None].repeat(B, 1, 1)
-        anchor_boxes_xyxy = box_cxcywh_to_xyxy(anchor_boxes)
-
-        # -------------------- Label Assignment --------------------
-        tgt_classes, tgt_boxes = self.matcher(anchor_boxes_xyxy, targets)
-        tgt_classes = tgt_classes.flatten()
-        tgt_boxes = tgt_boxes.view(-1, 4)
-        del anchor_boxes_xyxy
-
-        foreground_idxs = (tgt_classes >= 0) & (tgt_classes != self.num_classes)
-        valid_idxs = (tgt_classes >= 0) & masks
-        num_foreground = foreground_idxs.sum()
-        if is_dist_avail_and_initialized():
-            torch.distributed.all_reduce(num_foreground)
-        num_foreground = torch.clamp(num_foreground / get_world_size(), min=1).item()
-
-        # -------------------- Classification loss --------------------
-        gt_cls_target = torch.zeros_like(cls_preds)
-        gt_cls_target[foreground_idxs, tgt_classes[foreground_idxs]] = 1
-        loss_labels = self.loss_labels(
-            cls_preds[valid_idxs], gt_cls_target[valid_idxs], num_foreground)
-
-        # -------------------- Regression loss --------------------
-        if self.cfg['use_giou_loss']:
-            box_preds_pos = box_preds[foreground_idxs]
-            tgt_boxes_pos = tgt_boxes[foreground_idxs].to(reg_preds.device)
-            loss_bboxes = self.loss_bboxes(
-                pred_box=box_preds_pos, tgt_box=tgt_boxes_pos, num_boxes=num_foreground, use_giou=self.cfg['use_giou_loss'])
-        else:
-            reg_preds_pos = reg_preds[foreground_idxs]
-            tgt_boxes_pos = tgt_boxes[foreground_idxs].to(reg_preds.device)
-            anchors_pos = anchor_boxes.view(-1, 4)[foreground_idxs]
-            loss_bboxes = self.loss_bboxes(
-                pred_reg=reg_preds_pos, tgt_box=tgt_boxes_pos, anchors=anchors_pos, num_boxes=num_foreground, use_giou=self.cfg['use_giou_loss'])
-
-        loss_dict = dict(
-                loss_cls = loss_labels,
-                loss_reg = loss_bboxes,
-        )
-
-        return loss_dict
-
-    
-# build criterion
-def build_criterion(cfg, num_classes=80):
-    criterion = Criterion(cfg=cfg, num_classes=num_classes)
-    return criterion
-
-
-if __name__ == "__main__":
-    pass

+ 0 - 181
odlab/models/detectors/retinanet/matcher.py

@@ -1,181 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright (c) Facebook, Inc. and its affiliates.
-# Modified by BaseDetection, Inc. and its affiliates.
-import torch
-from utils.box_ops import box_iou
-
-
-class RetinaNetMatcher(object):
-    """
-    This class assigns to each predicted "element" (e.g., a box) a ground-truth
-    element. Each predicted element will have exactly zero or one matches; each
-    ground-truth element may be matched to zero or more predicted elements.
-
-    The matching is determined by the MxN match_quality_matrix, that characterizes
-    how well each (ground-truth, prediction)-pair match each other. For example,
-    if the elements are boxes, this matrix may contain box intersection-over-union
-    overlap values.
-
-    The matcher returns (a) a vector of length N containing the index of the
-    ground-truth element m in [0, M) that matches to prediction n in [0, N).
-    (b) a vector of length N containing the labels for each prediction.
-    """
-
-    def __init__(self,
-                 num_classes, 
-                 iou_threshold, 
-                 iou_labels, 
-                 allow_low_quality_matches=False):
-        """
-        Args:
-            thresholds (list): a list of thresholds used to stratify predictions
-                into levels.
-            labels (list): a list of values to label predictions belonging at
-                each level. A label can be one of {-1, 0, 1} signifying
-                {ignore, negative class, positive class}, respectively.
-            allow_low_quality_matches (bool): if True, produce additional matches
-                for predictions with maximum match quality lower than high_threshold.
-                See set_low_quality_matches_ for more details.
-
-            For example,
-                thresholds = [0.3, 0.5]
-                labels = [0, -1, 1]
-                All predictions with iou < 0.3 will be marked with 0 and
-                thus will be considered as false positives while training.
-                All predictions with 0.3 <= iou < 0.5 will be marked with -1 and
-                thus will be ignored.
-                All predictions with 0.5 <= iou will be marked with 1 and
-                thus will be considered as true positives.
-        """
-        self.num_classes = num_classes
-        # Add -inf and +inf to first and last position in iou_threshold
-        iou_threshold = iou_threshold[:]
-        assert iou_threshold[0] > 0
-        iou_threshold.insert(0, -float("inf"))
-        iou_threshold.append(float("inf"))
-        assert all(low <= high for (low, high) in zip(iou_threshold[:-1], iou_threshold[1:]))
-        assert all(label in [-1, 0, 1] for label in iou_labels)
-        assert len(iou_labels) == len(iou_threshold) - 1
-        self.iou_threshold = iou_threshold
-        self.iou_labels = iou_labels
-        self.allow_low_quality_matches = allow_low_quality_matches
-
-    @torch.no_grad()
-    def __call__(self, anchors, targets):
-        """
-            anchors: (Tensor) [B, M, 4] (x1, y1, x2, y2)
-            targets: (Dict) dict{'boxes': [...], 
-                                 'labels': [...], 
-                                 'orig_size': ...}
-        """
-        # list[Tensor(R, 4)], one for each image
-        gt_classes = []
-        gt_boxes = []
-        device = anchors.device
-
-        for anchors_per_image, targets_per_image in zip(anchors, targets):
-            # [N,]
-            tgt_labels = targets_per_image['labels'].to(device)
-            # [N, 4]
-            tgt_boxes = targets_per_image['boxes'].to(device)
-            # [N, M], N is the number of targets, M is the number of anchors
-            match_quality_matrix, _ = box_iou(tgt_boxes, anchors_per_image)
-            gt_matched_idxs, anchor_labels = self.matching(match_quality_matrix)
-            has_gt = len(tgt_labels) > 0
-            if has_gt:
-                # ground truth box regression
-                matched_gt_boxes = tgt_boxes[gt_matched_idxs]
-
-                gt_classes_i = tgt_labels[gt_matched_idxs]
-                # Anchors with label 0 are treated as background.
-                gt_classes_i[anchor_labels == 0] = self.num_classes
-                # Anchors with label -1 are ignored.
-                gt_classes_i[anchor_labels == -1] = -1
-            else:
-                gt_classes_i = torch.zeros_like(gt_matched_idxs) + self.num_classes
-                matched_gt_boxes = torch.zeros_like(anchors_per_image)
-
-            gt_classes.append(gt_classes_i)
-            gt_boxes.append(matched_gt_boxes)
-
-        return torch.stack(gt_classes), torch.stack(gt_boxes)
-
-    def matching(self, match_quality_matrix):
-        """
-        Args:
-            match_quality_matrix (Tensor[float]): an N x M tensor, containing the
-                pairwise quality between N ground-truth elements and M predicted
-                elements. All elements must be >= 0 (due to the use of `torch.nonzero`
-                for selecting indices in :meth:`set_low_quality_matches_`).
-
-        Returns:
-            matches (Tensor[int64]): a vector of length M, where matches[i] is a matched
-                ground-truth index in [0, N)
-            match_labels (Tensor[int8]): a vector of length M, where pred_labels[i] indicates
-                whether a prediction is a true or false positive or ignored
-        """
-        assert match_quality_matrix.dim() == 2
-        if match_quality_matrix.numel() == 0:
-            default_matches = match_quality_matrix.new_full(
-                (match_quality_matrix.size(1),), 0, dtype=torch.int64
-            )
-            # When no gt boxes exist, we define IOU = 0 and therefore set labels
-            # to `self.labels[0]`, which usually defaults to background class 0
-            # To choose to ignore instead, can make labels=[-1,0,-1,1] + set appropriate thresholds
-            default_match_labels = match_quality_matrix.new_full(
-                (match_quality_matrix.size(1),), self.iou_labels[0], dtype=torch.int8
-            )
-            return default_matches, default_match_labels
-
-        assert torch.all(match_quality_matrix >= 0)
-
-        # match_quality_matrix is N (gt) x M (predicted)
-        # Max over gt elements (dim 0) to find best gt candidate for each prediction
-        matched_vals, matches = match_quality_matrix.max(dim=0)
-
-        match_labels = matches.new_full(matches.size(), 1, dtype=torch.int8)
-
-        for (l, low, high) in zip(self.iou_labels, self.iou_threshold[:-1], self.iou_threshold[1:]):
-            low_high = (matched_vals >= low) & (matched_vals < high)
-            match_labels[low_high] = l
-
-        if self.allow_low_quality_matches:
-            self.set_low_quality_matches_(match_labels, match_quality_matrix)
-
-        return matches, match_labels
-
-    def set_low_quality_matches_(self, match_labels, match_quality_matrix):
-        """
-        Produce additional matches for predictions that have only low-quality matches.
-        Specifically, for each ground-truth G find the set of predictions that have
-        maximum overlap with it (including ties); for each prediction in that set, if
-        it is unmatched, then match it to the ground-truth G.
-
-        This function implements the RPN assignment case (i) in Sec. 3.1.2 of the
-        Faster R-CNN paper: https://arxiv.org/pdf/1506.01497v3.pdf.
-        """
-        # For each gt, find the prediction with which it has highest quality
-        highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1)
-        # Find the highest quality match available, even if it is low, including ties.
-        # Note that the match qualities must be positive due to the use of
-        # `torch.nonzero`.
-        gt_pred_pairs_of_highest_quality = torch.nonzero(
-            match_quality_matrix == highest_quality_foreach_gt[:, None],
-            as_tuple=False
-        )
-        # Example gt_pred_pairs_of_highest_quality:
-        #   tensor([[    0, 39796],
-        #           [    1, 32055],
-        #           [    1, 32070],
-        #           [    2, 39190],
-        #           [    2, 40255],
-        #           [    3, 40390],
-        #           [    3, 41455],
-        #           [    4, 45470],
-        #           [    5, 45325],
-        #           [    5, 46390]])
-        # Each row is a (gt index, prediction index)
-        # Note how gt items 1, 2, 3, and 5 each have two ties
-
-        pred_inds_to_update = gt_pred_pairs_of_highest_quality[:, 1]
-        match_labels[pred_inds_to_update] = 1

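For reference, the removed matcher implements the usual IoU-threshold assignment; a minimal stand-alone sketch of the same rule (tensor names and the 0.4/0.5 thresholds are illustrative, not taken from this commit):

    import torch

    def assign_by_iou(iou, low=0.4, high=0.5):
        # iou: [N_gt, M_anchors]; pick the best gt for every anchor
        matched_vals, matches = iou.max(dim=0)
        labels = torch.full_like(matches, 1, dtype=torch.int8)       # 1  -> foreground
        labels[matched_vals < low] = 0                                # 0  -> background
        labels[(matched_vals >= low) & (matched_vals < high)] = -1    # -1 -> ignored
        return matches, labels

The low-quality-match step above then forces, for every ground truth, its best-overlapping anchor back to label 1 even when that overlap falls below the foreground threshold.
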
+ 0 - 123
odlab/models/detectors/retinanet/retinanet.py

@@ -1,123 +0,0 @@
-import numpy as np
-import math
-import torch
-import torch.nn as nn
-
-# --------------- Model components ---------------
-from ...backbone import build_backbone
-from ...neck import build_neck
-from ...head import build_head
-
-# --------------- External components ---------------
-from utils.misc import multiclass_nms
-
-
-# ------------------------ RetinaNet ------------------------
-class RetinaNet(nn.Module):
-    def __init__(self, 
-                 cfg,
-                 num_classes :int   = 80, 
-                 conf_thresh :float = 0.05,
-                 nms_thresh  :float = 0.6,
-                 topk        :int   = 1000,
-                 trainable   :bool  = False,
-                 ca_nms      :bool  = False):
-        super(RetinaNet, self).__init__()
-        # ---------------------- Basic Parameters ----------------------
-        self.cfg = cfg
-        self.trainable = trainable
-        self.topk = topk
-        self.num_classes = num_classes
-        self.conf_thresh = conf_thresh
-        self.nms_thresh = nms_thresh
-        self.ca_nms = ca_nms
-
-        # ---------------------- Network Parameters ----------------------
-        ## Backbone
-        self.backbone, feat_dims = build_backbone(cfg, trainable&cfg['pretrained'])
-
-        ## Neck
-        self.fpn = build_neck(cfg, feat_dims, cfg['head_dim'])
-        
-        ## Heads
-        self.head = build_head(cfg, cfg['head_dim'], cfg['head_dim'], num_classes)
-
-    def post_process(self, cls_preds, box_preds):
-        """
-        Input:
-            cls_preds: List(Tensor) [[B, H x W, KA x C], ...]
-            box_preds: List(Tensor) [[B, H x W, KA x 4], ...]
-        """
-        all_scores = []
-        all_labels = []
-        all_bboxes = []
-        
-        for cls_pred_i, box_pred_i in zip(cls_preds, box_preds):
-            cls_pred_i = cls_pred_i[0]
-            box_pred_i = box_pred_i[0]
-            
-            # (H x W x KA x C,)
-            scores_i = cls_pred_i.sigmoid().flatten()
-
-            # Keep top k top scoring indices only.
-            num_topk = min(self.topk, box_pred_i.size(0))
-
-            # torch.sort is actually faster than .topk (at least on GPUs)
-            predicted_prob, topk_idxs = scores_i.sort(descending=True)
-            topk_scores = predicted_prob[:num_topk]
-            topk_idxs = topk_idxs[:num_topk]
-
-            # filter out the proposals with low confidence score
-            keep_idxs = topk_scores > self.conf_thresh
-            topk_idxs = topk_idxs[keep_idxs]
-
-            # final scores
-            scores = topk_scores[keep_idxs]
-            # final labels
-            labels = topk_idxs % self.num_classes
-            # final bboxes
-            anchor_idxs = torch.div(topk_idxs, self.num_classes, rounding_mode='floor')
-            bboxes = box_pred_i[anchor_idxs]
-
-            all_scores.append(scores)
-            all_labels.append(labels)
-            all_bboxes.append(bboxes)
-
-        scores = torch.cat(all_scores)
-        labels = torch.cat(all_labels)
-        bboxes = torch.cat(all_bboxes)
-
-        # to cpu & numpy
-        scores = scores.cpu().numpy()
-        labels = labels.cpu().numpy()
-        bboxes = bboxes.cpu().numpy()
-
-        # nms
-        scores, labels, bboxes = multiclass_nms(
-            scores, labels, bboxes, self.nms_thresh, self.num_classes, self.ca_nms)
-
-        return bboxes, scores, labels
-
-    def forward(self, src, src_mask=None, targets=None):
-        # ---------------- Backbone ----------------
-        pyramid_feats = self.backbone(src)
-
-        # ---------------- Neck ----------------
-        pyramid_feats = self.fpn(pyramid_feats)
-
-        # ---------------- Heads ----------------
-        outputs = self.head(pyramid_feats, src_mask)
-
-        if not self.training:
-            # ---------------- PostProcess ----------------
-            cls_pred = outputs["pred_cls"]
-            box_pred = outputs["pred_box"]
-            bboxes, scores, labels = self.post_process(cls_pred, box_pred)
-            # normalize bbox
-            bboxes[..., 0::2] /= src.shape[-1]
-            bboxes[..., 1::2] /= src.shape[-2]
-            bboxes = bboxes.clip(0., 1.)
-
-            return bboxes, scores, labels
-
-        return outputs 

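The post_process removed here follows the common flattened top-k decoding: sigmoid scores, top-k sort, confidence filter, then label = index % C and anchor = index // C. A stripped-down sketch for a single feature level (shapes and default values are illustrative):

    import torch

    def topk_decode(cls_pred, box_pred, num_classes=80, topk=1000, conf_thresh=0.05):
        # cls_pred: [M, C] logits, box_pred: [M, 4] decoded boxes
        scores = cls_pred.sigmoid().flatten()                 # [M * C]
        k = min(topk, scores.numel())
        sorted_scores, sorted_idxs = scores.sort(descending=True)
        keep = sorted_scores[:k] > conf_thresh
        idxs = sorted_idxs[:k][keep]
        labels = idxs % num_classes
        anchors = torch.div(idxs, num_classes, rounding_mode='floor')
        return box_pred[anchors], sorted_scores[:k][keep], labels
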
+ 1 - 1
odlab/models/detectors/yolof/yolof.py

@@ -81,7 +81,7 @@ class YOLOF(nn.Module):
 
         return bboxes, scores, labels
 
-    def forward(self, src, src_mask=None, targets=None):
+    def forward(self, src, src_mask=None):
         # ---------------- Backbone ----------------
         pyramid_feats = self.backbone(src)
 

+ 5 - 16
odlab/models/head/__init__.py

@@ -1,6 +1,5 @@
-from .retinanet_head import RetinaNetHead
-from .yolof_head     import YOLOFHead
-from .fcos_head      import FCOSHead
+from .yolof_head     import YolofHead
+from .fcos_head      import FcosHead
 
 
 # build head
@@ -8,18 +7,8 @@ def build_head(cfg, in_dim, out_dim, num_classes):
     print('==============================')
     print('Head: {}'.format(cfg['head']))
     
-    if cfg['head'] == 'retinanet_head':
-        model = RetinaNetHead(cfg          = cfg,
-                              in_dim       = in_dim,
-                              out_dim      = out_dim,
-                              num_classes  = num_classes,
-                              num_cls_head = cfg['num_cls_head'],
-                              num_reg_head = cfg['num_reg_head'],
-                              act_type     = cfg['head_act'],
-                              norm_type    = cfg['head_norm']
-                              )
-    elif cfg['head'] == 'fcos_head':
-        model = FCOSHead(cfg          = cfg,
+    if cfg['head'] == 'fcos_head':
+        model = FcosHead(cfg          = cfg,
                          in_dim       = in_dim,
                          out_dim      = out_dim,
                          num_classes  = num_classes,
@@ -29,7 +18,7 @@ def build_head(cfg, in_dim, out_dim, num_classes):
                          norm_type    = cfg['head_norm']
                          )
     elif cfg['head'] == 'yolof_head':
-        model = YOLOFHead(cfg          = cfg,
+        model = YolofHead(cfg          = cfg,
                           in_dim       = in_dim,
                           out_dim      = out_dim,
                           num_classes  = num_classes,

+ 1 - 1
odlab/models/head/fcos_head.py

@@ -25,7 +25,7 @@ class Scale(nn.Module):
         return x * self.scale
 
 
-class FCOSHead(nn.Module):
+class FcosHead(nn.Module):
     def __init__(self, cfg, in_dim, out_dim, num_classes, num_cls_head=1, num_reg_head=1, act_type='relu', norm_type='BN'):
         super().__init__()
         self.fmp_size = None

+ 0 - 203
odlab/models/head/retinanet_head.py

@@ -1,203 +0,0 @@
-import math
-import torch
-import torch.nn as nn
-
-from ..basic.conv import ConvModule
-
-
-class RetinaNetHead(nn.Module):
-    def __init__(self, cfg, in_dim, out_dim, num_classes, num_cls_head=1, num_reg_head=1, act_type='relu', norm_type='BN'):
-        super().__init__()
-        self.fmp_size = None
-        self.DEFAULT_SCALE_CLAMP = math.log(1000.0 / 16)
-        # ------------------ Basic parameters -------------------
-        self.cfg = cfg
-        self.in_dim = in_dim
-        self.num_classes = num_classes
-        self.num_cls_head=num_cls_head
-        self.num_reg_head=num_reg_head
-        self.act_type=act_type
-        self.norm_type=norm_type
-        self.stride = cfg['out_stride']
-        # ------------------ Anchor parameters -------------------
-        self.anchor_size = self.get_anchor_sizes(cfg)  # [S, KA, 2]
-        self.num_anchors = self.anchor_size.shape[1]
-
-        # ------------------ Network parameters -------------------
-        ## cls head
-        cls_heads = []
-        self.cls_head_dim = out_dim
-        for i in range(self.num_cls_head):
-            if i == 0:
-                cls_heads.append(
-                    ConvModule(in_dim, self.cls_head_dim, k=3, p=1, s=1, 
-                               act_type=self.act_type,
-                               norm_type=self.norm_type)
-                               )
-            else:
-                cls_heads.append(
-                    ConvModule(self.cls_head_dim, self.cls_head_dim, k=3, p=1, s=1, 
-                               act_type=self.act_type,
-                               norm_type=self.norm_type)
-                               )
-        ## reg head
-        reg_heads = []
-        self.reg_head_dim = out_dim
-        for i in range(self.num_reg_head):
-            if i == 0:
-                reg_heads.append(
-                    ConvModule(in_dim, self.reg_head_dim, k=3, p=1, s=1, 
-                               act_type=self.act_type,
-                               norm_type=self.norm_type)
-                               )
-            else:
-                reg_heads.append(
-                    ConvModule(self.reg_head_dim, self.reg_head_dim, k=3, p=1, s=1, 
-                               act_type=self.act_type,
-                               norm_type=self.norm_type)
-                               )
-        self.cls_heads = nn.Sequential(*cls_heads)
-        self.reg_heads = nn.Sequential(*reg_heads)
-
-        ## pred layers
-        self.cls_pred = nn.Conv2d(self.cls_head_dim, num_classes * self.num_anchors, kernel_size=3, padding=1)
-        self.reg_pred = nn.Conv2d(self.reg_head_dim, 4 * self.num_anchors, kernel_size=3, padding=1)
-
-        # init bias
-        self._init_layers()
-
-    def _init_layers(self):
-        for module in [self.cls_heads, self.reg_heads, self.cls_pred, self.reg_pred]:
-            for layer in module.modules():
-                if isinstance(layer, nn.Conv2d):
-                    torch.nn.init.normal_(layer.weight, mean=0, std=0.01)
-                    torch.nn.init.constant_(layer.bias, 0)
-                if isinstance(layer, nn.GroupNorm):
-                    torch.nn.init.constant_(layer.weight, 1)
-                    torch.nn.init.constant_(layer.bias, 0)
-        # init the bias of cls pred
-        init_prob = 0.01
-        bias_value = -torch.log(torch.tensor((1. - init_prob) / init_prob))
-        torch.nn.init.constant_(self.cls_pred.bias, bias_value)
-        
-    def get_anchor_sizes(self, cfg):
-        basic_anchor_size =   cfg['anchor_config']['basic_size']
-        anchor_aspect_ratio = cfg['anchor_config']['aspect_ratio']
-        anchor_area_scale =   cfg['anchor_config']['area_scale']
-
-        num_scales = len(basic_anchor_size)
-        num_anchors = len(anchor_aspect_ratio) * len(anchor_area_scale)
-        anchor_sizes = []
-        for size in basic_anchor_size:
-            for ar in anchor_aspect_ratio:
-                for s in anchor_area_scale:
-                    ah, aw = size
-                    area = ah * aw * s
-                    anchor_sizes.append(
-                        [torch.sqrt(torch.tensor(ar * area)),
-                         torch.sqrt(torch.tensor(area / ar))]
-                         )
-        # [S * KA, 2] -> [S, KA, 2]
-        anchor_sizes = torch.as_tensor(anchor_sizes).view(num_scales, num_anchors, 2)
-
-        return anchor_sizes
-
-    def get_anchors(self, level, fmp_size):
-        """
-            fmp_size: (List) [H, W]
-        """
-        # generate grid cells
-        fmp_h, fmp_w = fmp_size
-        # [KA, 2]
-        anchor_size = self.anchor_size[level]
-
-        anchor_y, anchor_x = torch.meshgrid([torch.arange(fmp_h), torch.arange(fmp_w)])
-        # [H, W, 2] -> [HW, 2]
-        anchor_xy = torch.stack([anchor_x, anchor_y], dim=-1).float().view(-1, 2) + 0.5
-        # [HW, 2] -> [HW, 1, 2] -> [HW, KA, 2] 
-        anchor_xy = anchor_xy[:, None, :].repeat(1, self.num_anchors, 1)
-        anchor_xy *= self.stride[level]
-
-        # [KA, 2] -> [1, KA, 2] -> [HW, KA, 2]
-        anchor_wh = anchor_size[None, :, :].repeat(fmp_h*fmp_w, 1, 1)
-
-        # [HW, KA, 4] -> [M, 4], M = HW x KA
-        anchor_boxes = torch.cat([anchor_xy, anchor_wh], dim=-1)
-        anchor_boxes = anchor_boxes.view(-1, 4)
-
-        return anchor_boxes
-        
-    def decode_boxes(self, anchor_boxes, pred_reg):
-        """
-            anchor_boxes: (List[Tensor]) [1, M, 4] or [M, 4]
-            pred_reg:     (List[Tensor]) [B, M, 4] or [M, 4]
-        """
-        # x = x_anchor + dx * w_anchor
-        # y = y_anchor + dy * h_anchor
-        pred_ctr_offset = pred_reg[..., :2] * anchor_boxes[..., 2:]
-        pred_ctr_xy = anchor_boxes[..., :2] + pred_ctr_offset
-
-        # w = w_anchor * exp(tw)
-        # h = h_anchor * exp(th)
-        pred_dwdh = pred_reg[..., 2:]
-        pred_dwdh = torch.clamp(pred_dwdh, max=self.DEFAULT_SCALE_CLAMP)
-        pred_wh = anchor_boxes[..., 2:] * pred_dwdh.exp()
-
-        # convert [x, y, w, h] -> [x1, y1, x2, y2]
-        pred_x1y1 = pred_ctr_xy - 0.5 * pred_wh
-        pred_x2y2 = pred_ctr_xy + 0.5 * pred_wh
-        pred_box = torch.cat([pred_x1y1, pred_x2y2], dim=-1)
-
-        return pred_box
-
-    def forward(self, pyramid_feats, mask=None):
-        all_masks = []
-        all_anchors = []
-        all_cls_preds = []
-        all_reg_preds = []
-        all_box_preds = []
-        for level, feat in enumerate(pyramid_feats):
-            # ------------------- Decoupled head -------------------
-            cls_feat = self.cls_heads(feat)
-            reg_feat = self.reg_heads(feat)
-
-            # ------------------- Generate anchor box -------------------
-            B, _, H, W = cls_feat.size()
-            fmp_size = [H, W]
-            anchor_boxes = self.get_anchors(level, fmp_size)   # [M, 4]
-            anchor_boxes = anchor_boxes.to(cls_feat.device)
-
-            # ------------------- Predict -------------------
-            cls_pred = self.cls_pred(cls_feat)
-            reg_pred = self.reg_pred(reg_feat)
-
-            # ------------------- Process preds -------------------
-            ## [B, C, H, W] -> [B, H, W, C] -> [B, M, C]
-            cls_pred = cls_pred.permute(0, 2, 3, 1).contiguous().view(B, -1, self.num_classes)
-            reg_pred = reg_pred.permute(0, 2, 3, 1).contiguous().view(B, -1, 4)
-            ## Decode bbox
-            box_pred = self.decode_boxes(anchor_boxes, reg_pred)
-            ## Adjust mask
-            if mask is not None:
-                # [B, H, W]
-                mask_i = torch.nn.functional.interpolate(mask[None].float(), size=[H, W]).bool()[0]
-                # [B, H, W] -> [B, M]
-                mask_i = mask_i.flatten(1)     
-                # [B, HW] -> [B, HW, KA] -> [B, M], M= HW x KA
-                mask_i = mask_i[..., None].repeat(1, 1, self.num_anchors).flatten(1)
-                
-                all_masks.append(mask_i)
-                
-            all_anchors.append(anchor_boxes)
-            all_cls_preds.append(cls_pred)
-            all_reg_preds.append(reg_pred)
-            all_box_preds.append(box_pred)
-
-        outputs = {"pred_cls": all_cls_preds,  # List [B, M, C]
-                   "pred_reg": all_reg_preds,  # List [B, M, 4]
-                   "pred_box": all_box_preds,  # List [B, M, 4]
-                   "anchors": all_anchors,     # List [B, M, 2]
-                   "strides": self.stride,
-                   "mask": all_masks}          # List [B, M,]
-
-        return outputs 

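decode_boxes in the removed head is the standard anchor-delta transform (cx = cx_a + dx * w_a, w = w_a * exp(tw)); a small numeric check of those formulas with made-up values:

    import torch

    anchor = torch.tensor([[100., 100., 50., 40.]])   # [cx, cy, w, h]
    delta  = torch.tensor([[0.1, -0.2, 0.0, 0.5]])    # [dx, dy, tw, th]

    ctr = anchor[..., :2] + delta[..., :2] * anchor[..., 2:]    # -> [105., 92.]
    wh  = anchor[..., 2:] * delta[..., 2:].exp()                # -> [50., ~65.9]
    box = torch.cat([ctr - 0.5 * wh, ctr + 0.5 * wh], dim=-1)   # xyxy corners
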
+ 1 - 1
odlab/models/head/yolof_head.py

@@ -5,7 +5,7 @@ import torch.nn as nn
 from ..basic.conv import ConvModule
 
 
-class YOLOFHead(nn.Module):
+class YolofHead(nn.Module):
     def __init__(self, cfg, in_dim, out_dim, num_classes, num_cls_head=1, num_reg_head=1, act_type='relu', norm_type='BN'):
         super().__init__()
         self.fmp_size = None

+ 0 - 142
odlab/models/neck/hybrid_encoder.py

@@ -1,142 +0,0 @@
-from typing import List
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-from ..basic.conv import BasicConv, RepCSPLayer
-from ..basic.transformer import TransformerEncoder
-
-
-# -------------- Feature Pyramid Network + Transformer Encoder --------------
-class HybridEncoder(nn.Module):
-    def __init__(self, 
-                 in_dims        :List  = [256, 512, 1024],
-                 out_dim        :int   = 256,
-                 num_blocks     :int   = 3,
-                 expansion      :float = 1.0,
-                 act_type       :str   = 'silu',
-                 norm_type      :str   = 'GN',
-                 depthwise      :bool  = False,
-                 # Transformer's parameters
-                 num_heads      :int   = 8,
-                 num_layers     :int   = 1,
-                 ffn_dim        :int   = 1024,
-                 dropout        :float = 0.1,
-                 pe_temperature :float = 10000.,
-                 en_act_type    :str   = 'gelu',
-                 en_pre_norm    :bool  = False,
-                 ) -> None:
-        super(HybridEncoder, self).__init__()
-        # ---------------- Basic parameters ----------------
-        self.in_dims = in_dims
-        self.out_dim = out_dim
-        self.out_dims = [self.out_dim] * len(in_dims)
-        self.num_heads = num_heads
-        self.num_layers = num_layers
-        self.ffn_dim = ffn_dim
-        c3, c4, c5 = in_dims
-
-        # ---------------- Input projs ----------------
-        self.input_proj_1 = BasicConv(c5, self.out_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
-        self.input_proj_2 = BasicConv(c4, self.out_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
-        self.input_proj_3 = BasicConv(c3, self.out_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
-
-        # ---------------- Transformer Encoder ----------------
-        self.transformer_encoder = TransformerEncoder(d_model        = self.out_dim,
-                                                      num_heads      = num_heads,
-                                                      num_layers     = num_layers,
-                                                      ffn_dim        = ffn_dim,
-                                                      pe_temperature = pe_temperature,
-                                                      dropout        = dropout,
-                                                      act_type       = en_act_type,
-                                                      pre_norm       = en_pre_norm,
-                                                      )
-
-        # ---------------- Top down FPN ----------------
-        ## P5 -> P4
-        self.reduce_layer_1 = BasicConv(self.out_dim, self.out_dim,
-                                        kernel_size=1, padding=0, stride=1,
-                                        act_type=act_type, norm_type=norm_type)
-        self.top_down_layer_1 = RepCSPLayer(in_dim      = self.out_dim * 2,
-                                            out_dim     = self.out_dim,
-                                            num_blocks  = num_blocks,
-                                            expansion   = expansion,
-                                            act_type    = act_type,
-                                            norm_type   = norm_type,
-                                            )
-        ## P4 -> P3
-        self.reduce_layer_2 = BasicConv(self.out_dim, self.out_dim,
-                                        kernel_size=1, padding=0, stride=1,
-                                        act_type=act_type, norm_type=norm_type)
-        self.top_down_layer_2 = RepCSPLayer(in_dim      = self.out_dim * 2,
-                                            out_dim     = self.out_dim,
-                                            num_blocks  = num_blocks,
-                                            expansion   = expansion,
-                                            act_type    = act_type,
-                                            norm_type   = norm_type,
-                                            )
-        
-        # ---------------- Bottom up PAN ----------------
-        ## P3 -> P4
-        self.dowmsample_layer_1 = BasicConv(self.out_dim, self.out_dim,
-                                            kernel_size=3, padding=1, stride=2,
-                                            act_type=act_type, norm_type=norm_type, depthwise=depthwise)
-        self.bottom_up_layer_1 = RepCSPLayer(in_dim      = self.out_dim * 2,
-                                             out_dim     = self.out_dim,
-                                             num_blocks  = num_blocks,
-                                             expansion   = expansion,
-                                             act_type    = act_type,
-                                             norm_type   = norm_type,
-                                             )
-        ## P4 -> P5
-        self.dowmsample_layer_2 = BasicConv(self.out_dim, self.out_dim,
-                                            kernel_size=3, padding=1, stride=2,
-                                            act_type=act_type, norm_type=norm_type, depthwise=depthwise)
-        self.bottom_up_layer_2 = RepCSPLayer(in_dim      = self.out_dim * 2,
-                                             out_dim     = self.out_dim,
-                                             num_blocks  = num_blocks,
-                                             expansion   = expansion,
-                                             act_type    = act_type,
-                                             norm_type   = norm_type,
-                                             )
-
-        self.init_weights()
-  
-    def init_weights(self):
-        """Initialize the parameters."""
-        for m in self.modules():
-            if isinstance(m, torch.nn.Conv2d):
-                # In order to be consistent with the source code,
-                # reset the Conv2d initialization parameters
-                m.reset_parameters()
-
-    def forward(self, features):
-        c3, c4, c5 = features
-
-        # -------- Input projs --------
-        p5 = self.input_proj_1(c5)
-        p4 = self.input_proj_2(c4)
-        p3 = self.input_proj_3(c3)
-
-        # -------- Transformer encoder --------
-        p5 = self.transformer_encoder(p5)
-
-        # -------- Top down FPN --------
-        p5_in = self.reduce_layer_1(p5)
-        p5_up = F.interpolate(p5_in, size=p4.shape[2:])
-        p4 = self.top_down_layer_1(torch.cat([p4, p5_up], dim=1))
-
-        p4_in = self.reduce_layer_2(p4)
-        p4_up = F.interpolate(p4_in, size=p3.shape[2:])
-        p3 = self.top_down_layer_2(torch.cat([p3, p4_up], dim=1))
-
-        # -------- Bottom up PAN --------
-        p3_ds = self.dowmsample_layer_1(p3)
-        p4 = self.bottom_up_layer_1(torch.cat([p4_in, p3_ds], dim=1))
-
-        p4_ds = self.dowmsample_layer_2(p4)
-        p5 = self.bottom_up_layer_2(torch.cat([p5_in, p4_ds], dim=1))
-
-        out_feats = [p3, p4, p5]
-        
-        return out_feats

+ 23 - 58
odlab/test.py

@@ -11,6 +11,7 @@ from datasets import build_dataset, build_transform
 
 # load some utils
 from utils.misc import load_weight, compute_flops
+from utils.vis_tools import visualize
 
 from config import build_config
 from models.detectors import build_model
@@ -31,8 +32,6 @@ def parse_args():
                         help='Final confidence threshold')
     parser.add_argument('-ws', '--window_scale', default=1.0, type=float,
                         help='resize window of cv2 for visualization.')
-    parser.add_argument('--resave', action='store_true', default=False, 
-                        help='resave checkpoints without optimizer state dict.')
     # Model
     parser.add_argument('-m', '--model', default='yolof_r18_c5_1x', type=str,
                         help='build detector')
@@ -48,41 +47,8 @@ def parse_args():
 
     return parser.parse_args()
 
-def plot_bbox_labels(img, bbox, label=None, cls_color=None, text_scale=0.4):
-    x1, y1, x2, y2 = bbox
-    x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
-    t_size = cv2.getTextSize(label, 0, fontScale=1, thickness=2)[0]
-    # plot bbox
-    cv2.rectangle(img, (x1, y1), (x2, y2), cls_color, 2)
-    
-    if label is not None:
-        # plot title bbox
-        cv2.rectangle(img, (x1, y1-t_size[1]), (int(x1 + t_size[0] * text_scale), y1), cls_color, -1)
-        # put the text on the title bbox
-        cv2.putText(img, label, (int(x1), int(y1 - 5)), 0, text_scale, (0, 0, 0), 1, lineType=cv2.LINE_AA)
-
-    return img
-
-def visualize(img, 
-              bboxes, 
-              scores, 
-              labels, 
-              vis_thresh, 
-              class_colors, 
-              class_names):
-    ts = 0.4
-    for i, bbox in enumerate(bboxes):
-        if scores[i] > vis_thresh:
-            cls_id = int(labels[i])
-            cls_color = class_colors[cls_id]
-                
-            mess = '%s: %.2f' % (class_names[cls_id], scores[i])
-            img = plot_bbox_labels(img, bbox, mess, cls_color, text_scale=ts)
-
-    return img
-        
 @torch.no_grad()
-def run(args, model, device, dataset, transform, class_colors, class_names):
+def test_det(args, model, device, dataset, transform, class_colors, class_names):
     num_images = len(dataset)
     save_path = os.path.join('det_results/', args.dataset, args.model)
     os.makedirs(save_path, exist_ok=True)
@@ -97,7 +63,10 @@ def run(args, model, device, dataset, transform, class_colors, class_names):
 
         # Inference
         t0 = time.time()
-        bboxes, scores, labels = model(x)
+        outputs = model(x)
+        scores = outputs['scores']
+        labels = outputs['labels']
+        bboxes = outputs['bboxes']
         print("Infer. time: {}".format(time.time() - t0, "s"))
         
         # Rescale bboxes
@@ -105,10 +74,12 @@ def run(args, model, device, dataset, transform, class_colors, class_names):
         bboxes[..., 1::2] *= orig_h
 
         # vis detection
-        image = np.array(image).astype(np.uint8)
-        image = image[..., (2, 1, 0)].copy()
-        img_processed = visualize(
-            image, bboxes, scores, labels, args.visual_threshold, class_colors, class_names)
+        img_processed = visualize(image=image,
+                                  bboxes=bboxes,
+                                  scores=scores,
+                                  labels=labels,
+                                  class_colors=class_colors,
+                                  class_names=class_names)
         if args.show:
             h, w = img_processed.shape[:2]
             sw, sh = int(w*args.window_scale), int(h*args.window_scale)
@@ -138,16 +109,15 @@ if __name__ == '__main__':
     transform = build_transform(cfg, is_train=False)
 
     # Dataset
-    dataset, dataset_info = build_dataset(args, is_train=False)
+    dataset = build_dataset(args, cfg, is_train=False)
 
     np.random.seed(0)
     class_colors = [(np.random.randint(255),
                      np.random.randint(255),
-                     np.random.randint(255))
-                     for _ in range(dataset_info['num_classes'])]
+                     np.random.randint(255)) for _ in range(cfg.num_classes)]
 
     # Model
-    model = build_model(args, cfg, dataset_info['num_classes'], is_val=False)
+    model = build_model(args, cfg, is_val=False)
     model = load_weight(model, args.weight, args.fuse_conv_bn)
     model.to(device).eval()
 
@@ -161,19 +131,14 @@ if __name__ == '__main__':
         max_size=cfg['test_max_size'],
         device=device)
     del model_copy
-
-    # Resave model weight
-    if args.resave:
-        print('Resave: {}'.format(args.model.upper()))
-        checkpoint = torch.load(args.weight, map_location='cpu')
-        output_dir = 'weights/{}/{}/'.format(args.dataset, args.model)
-        os.makedirs(output_dir, exist_ok=True)
-        checkpoint_path = os.path.join(output_dir, "{}_pure.pth".format(args.model))
-        torch.save({'model': model.state_dict(),
-                    'mAP': checkpoint.pop("mAP"),
-                    'epoch': checkpoint.pop("epoch")}, 
-                    checkpoint_path)
         
     print("================= DETECT =================")
     # run
-    run(args, model, device, dataset, transform, class_colors, dataset_info['class_labels'])
+    test_det(args         = args,
+             model        = model, 
+             device       = device, 
+             dataset      = dataset,
+             transform    = transform,
+             class_colors = class_colors,
+             class_names  = cfg.class_labels,
+             )

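With this change the detector returns a dict at inference time instead of a tuple, and test.py rescales the normalized boxes itself; a condensed sketch of the new calling pattern (variable names follow the code above, orig_w/orig_h are assumed to be the original image size from the dataset sample):

    outputs = model(x)                    # {'scores': ..., 'labels': ..., 'bboxes': ...}
    bboxes  = outputs['bboxes']           # normalized xyxy
    bboxes[..., 0::2] *= orig_w
    bboxes[..., 1::2] *= orig_h
    img = visualize(image=image, bboxes=bboxes, scores=outputs['scores'],
                    labels=outputs['labels'], class_colors=class_colors,
                    class_names=cfg.class_labels)
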
+ 0 - 91
odlab/utils/box_ops.py

@@ -59,81 +59,18 @@ def get_ious(bboxes1,
     else:
         raise NotImplementedError
 
-
-def delta2bbox(proposals,
-               deltas,
-               max_shape=None,
-               wh_ratio_clip=16 / 1000,
-               clip_border=True,
-               add_ctr_clamp=False,
-               ctr_clamp=32):
-
-    dxy = deltas[..., :2]
-    dwh = deltas[..., 2:]
-
-    # Compute width/height of each roi
-    pxy = proposals[..., :2]
-    pwh = proposals[..., 2:]
-
-    dxy_wh = pwh * dxy
-    wh_ratio_clip = torch.tensor(wh_ratio_clip).to(deltas.device)
-    max_ratio = torch.abs(torch.log(wh_ratio_clip))
-    if add_ctr_clamp:
-        dxy_wh = torch.clamp(dxy_wh, max=ctr_clamp, min=-ctr_clamp)
-        dwh = torch.clamp(dwh, max=max_ratio)
-    else:
-        dwh = dwh.clamp(min=-max_ratio, max=max_ratio)
-
-    gxy = pxy + dxy_wh
-    gwh = pwh * dwh.exp()
-    x1y1 = gxy - (gwh * 0.5)
-    x2y2 = gxy + (gwh * 0.5)
-    bboxes = torch.cat([x1y1, x2y2], dim=-1)
-    if clip_border and max_shape is not None:
-        bboxes[..., 0::2].clamp_(min=0).clamp_(max=max_shape[1])
-        bboxes[..., 1::2].clamp_(min=0).clamp_(max=max_shape[0])
-        
-    return bboxes
-
-
-def bbox2delta(proposals, gt, means=(0., 0., 0., 0.), stds=(1., 1., 1., 1.)):
-    # hack for matcher
-    if proposals.size() != gt.size():
-        proposals = proposals[:, None]
-        gt = gt[None]
-
-    proposals = proposals.float()
-    gt = gt.float()
-    px, py, pw, ph = proposals.unbind(-1)
-    gx, gy, gw, gh = gt.unbind(-1)
-
-    dx = (gx - px) / (pw + 0.1)
-    dy = (gy - py) / (ph + 0.1)
-    dw = torch.log(gw / (pw + 0.1))
-    dh = torch.log(gh / (ph + 0.1))
-    deltas = torch.stack([dx, dy, dw, dh], dim=-1)
-
-    means = deltas.new_tensor(means).unsqueeze(0)
-    stds = deltas.new_tensor(stds).unsqueeze(0)
-    deltas = deltas.sub_(means).div_(stds)
-
-    return deltas
-
-
 def box_cxcywh_to_xyxy(x):
     x_c, y_c, w, h = x.unbind(-1)
     b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
          (x_c + 0.5 * w), (y_c + 0.5 * h)]
     return torch.stack(b, dim=-1)
 
-
 def box_xyxy_to_cxcywh(x):
     x0, y0, x1, y1 = x.unbind(-1)
     b = [(x0 + x1) / 2, (y0 + y1) / 2,
          (x1 - x0), (y1 - y0)]
     return torch.stack(b, dim=-1)
 
-
 # modified from torchvision to also return the union
 def box_iou(boxes1, boxes2):
     area1 = box_area(boxes1)
@@ -152,7 +89,6 @@ def box_iou(boxes1, boxes2):
     
     return iou, union
 
-
 def generalized_box_iou(boxes1, boxes2):
     """
     Generalized IoU from https://giou.stanford.edu/
@@ -175,30 +111,3 @@ def generalized_box_iou(boxes1, boxes2):
     area = wh[:, :, 0] * wh[:, :, 1]
 
     return iou - (area - union) / area
-
-
-def masks_to_boxes(masks):
-    """Compute the bounding boxes around the provided masks
-
-    The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions.
-
-    Returns a [N, 4] tensors, with the boxes in xyxy format
-    """
-    if masks.numel() == 0:
-        return torch.zeros((0, 4), device=masks.device)
-
-    h, w = masks.shape[-2:]
-
-    y = torch.arange(0, h, dtype=torch.float)
-    x = torch.arange(0, w, dtype=torch.float)
-    y, x = torch.meshgrid(y, x)
-
-    x_mask = (masks * x.unsqueeze(0))
-    x_max = x_mask.flatten(1).max(-1)[0]
-    x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]
-
-    y_mask = (masks * y.unsqueeze(0))
-    y_max = y_mask.flatten(1).max(-1)[0]
-    y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]
-
-    return torch.stack([x_min, y_min, x_max, y_max], 1)

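generalized_box_iou, kept above, expects boxes in [x0, y0, x1, y1] format with x1 >= x0 and y1 >= y0; a quick sanity check (the import path is an assumption based on the repo layout, the numbers are illustrative):

    import torch
    from utils.box_ops import generalized_box_iou

    a = torch.tensor([[0., 0., 10., 10.]])
    b = torch.tensor([[0., 0., 10., 10.],
                      [20., 20., 30., 30.]])
    giou = generalized_box_iou(a, b)   # ~[[1.00, -0.78]]; GIoU lies in [-1, 1]
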
+ 0 - 98
odlab/utils/dn_compoments.py

@@ -1,98 +0,0 @@
-import torch
-from .box_ops import box_cxcywh_to_xyxy, box_xyxy_to_cxcywh
-
-
-def inverse_sigmoid(x, eps=1e-5):
-    x = x.clamp(min=0., max=1.)
-    return torch.log(x.clamp(min=eps) / (1 - x).clamp(min=eps))
-
-def get_contrastive_denoising_training_group(targets,
-                                             num_classes,
-                                             num_queries,
-                                             class_embed,
-                                             num_denoising=100,
-                                             label_noise_ratio=0.5,
-                                             box_noise_scale=1.0,):
-    if num_denoising <= 0:
-        return None, None, None, None
-
-    num_gts = [len(t['labels']) for t in targets]
-    device = targets[0]['labels'].device
-    
-    max_gt_num = max(num_gts)
-    if max_gt_num == 0:
-        return None, None, None, None
-
-    num_group = num_denoising // max_gt_num
-    num_group = 1 if num_group == 0 else num_group
-    # pad gt to max_num of a batch
-    bs = len(num_gts)
-
-    input_query_class = torch.full([bs, max_gt_num], num_classes, dtype=torch.int32, device=device)
-    input_query_bbox = torch.zeros([bs, max_gt_num, 4], device=device)
-    pad_gt_mask = torch.zeros([bs, max_gt_num], dtype=torch.bool, device=device)
-
-    for i in range(bs):
-        num_gt = num_gts[i]
-        if num_gt > 0:
-            input_query_class[i, :num_gt] = targets[i]['labels']
-            input_query_bbox[i, :num_gt] = targets[i]['boxes']
-            pad_gt_mask[i, :num_gt] = 1
-    # each group has positive and negative queries.
-    input_query_class = input_query_class.tile([1, 2 * num_group])
-    input_query_bbox = input_query_bbox.tile([1, 2 * num_group, 1])
-    pad_gt_mask = pad_gt_mask.tile([1, 2 * num_group])
-    # positive and negative mask
-    negative_gt_mask = torch.zeros([bs, max_gt_num * 2, 1], device=device)
-    negative_gt_mask[:, max_gt_num:] = 1
-    negative_gt_mask = negative_gt_mask.tile([1, num_group, 1])
-    positive_gt_mask = 1 - negative_gt_mask
-    # contrastive denoising training positive index
-    positive_gt_mask = positive_gt_mask.squeeze(-1) * pad_gt_mask
-    dn_positive_idx = torch.nonzero(positive_gt_mask)[:, 1]
-    dn_positive_idx = torch.split(dn_positive_idx, [n * num_group for n in num_gts])
-    # total denoising queries
-    num_denoising = int(max_gt_num * 2 * num_group)
-
-    if label_noise_ratio > 0:
-        mask = torch.rand_like(input_query_class, dtype=torch.float) < (label_noise_ratio * 0.5)
-        # randomly put a new one here
-        new_label = torch.randint_like(mask, 0, num_classes, dtype=input_query_class.dtype)
-        input_query_class = torch.where(mask & pad_gt_mask, new_label, input_query_class)
-
-    if box_noise_scale > 0:
-        known_bbox = box_cxcywh_to_xyxy(input_query_bbox)
-        diff = torch.tile(input_query_bbox[..., 2:] * 0.5, [1, 1, 2]) * box_noise_scale
-        rand_sign = torch.randint_like(input_query_bbox, 0, 2) * 2.0 - 1.0
-        rand_part = torch.rand_like(input_query_bbox)
-        rand_part = (rand_part + 1.0) * negative_gt_mask + rand_part * (1 - negative_gt_mask)
-        rand_part *= rand_sign
-        known_bbox += rand_part * diff
-        known_bbox.clip_(min=0.0, max=1.0)
-        input_query_bbox = box_xyxy_to_cxcywh(known_bbox)
-        input_query_bbox = inverse_sigmoid(input_query_bbox)
-    input_query_class = class_embed(input_query_class)
-
-    tgt_size = num_denoising + num_queries
-    # attn_mask = torch.ones([tgt_size, tgt_size], device=device) < 0
-    attn_mask = torch.full([tgt_size, tgt_size], False, dtype=torch.bool, device=device)
-    # match query cannot see the reconstruction
-    attn_mask[num_denoising:, :num_denoising] = True
-    
-    # reconstruct cannot see each other
-    for i in range(num_group):
-        if i == 0:
-            attn_mask[max_gt_num * 2 * i: max_gt_num * 2 * (i + 1), max_gt_num * 2 * (i + 1): num_denoising] = True
-        if i == num_group - 1:
-            attn_mask[max_gt_num * 2 * i: max_gt_num * 2 * (i + 1), :max_gt_num * i * 2] = True
-        else:
-            attn_mask[max_gt_num * 2 * i: max_gt_num * 2 * (i + 1), max_gt_num * 2 * (i + 1): num_denoising] = True
-            attn_mask[max_gt_num * 2 * i: max_gt_num * 2 * (i + 1), :max_gt_num * 2 * i] = True
-        
-    dn_meta = {
-        "dn_positive_idx": dn_positive_idx,
-        "dn_num_group": num_group,
-        "dn_num_split": [num_denoising, num_queries]
-    }
-
-    return input_query_class, input_query_bbox, attn_mask, dn_meta

+ 1 - 1
odlab/utils/lr_scheduler.py

@@ -51,7 +51,7 @@ def build_lr_scheduler(cfg, optimizer, resume=None):
         pass
         
     if resume is not None:
-        print('keep training: ', resume)
+        print('Load lr scheduler from the checkpoint: ', resume)
         checkpoint = torch.load(resume)
         # checkpoint state dict
         checkpoint_state_dict = checkpoint.pop("lr_scheduler")

+ 3 - 140
odlab/utils/misc.py

@@ -2,13 +2,11 @@
 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
 # ---------------------------------------------------------------------------
 import time
-import math
 import datetime
 import numpy as np
-from typing import List
-from thop import profile
-from copy import deepcopy
-from collections import defaultdict, deque
+from   typing import List
+from   thop import profile
+from   collections import defaultdict, deque
 
 import torch
 import torch.nn as nn
@@ -243,14 +241,6 @@ def collate_fn(batch):
 
 
 # ---------------------------- For Model ----------------------------
-def match_name_keywords(n, name_keywords):
-    out = False
-    for b in name_keywords:
-        if b in n:
-            out = True
-            break
-    return out
-
 ## fuse Conv & BN layer
 def fuse_conv_bn(module):
     """Recursively fuse conv and bn in a module.
@@ -346,133 +336,6 @@ def get_total_grad_norm(parameters, norm_type=2):
                             norm_type)
     return total_norm
 
-## param Dict
-def get_param_dict(model, cfg, return_name=False):
-    # sanity check: a variable could not match backbone_names and linear_proj_names at the same time
-    cfg['lr_backbone'] = cfg['base_lr'] * cfg['backbone_lr_ratio']
-    for n, p in model.named_parameters():
-        if match_name_keywords(n, cfg['lr_backbone_names']) and match_name_keywords(n, cfg['lr_linear_proj_names']):
-            raise ValueError
-
-    param_dicts = [
-        {
-            "params": [
-                p if not return_name else n
-                for n, p in model.named_parameters()
-                if not match_name_keywords(n, cfg['lr_backbone_names'])
-                and not match_name_keywords(n, cfg['lr_linear_proj_names'])
-                and not match_name_keywords(n, cfg['wd_norm_names'])
-                and p.requires_grad
-            ],
-            "lr": cfg['base_lr'],
-            "weight_decay": cfg['weight_decay'],
-        },
-        {
-            "params": [
-                p if not return_name else n
-                for n, p in model.named_parameters()
-                if match_name_keywords(n, cfg['lr_backbone_names'])
-                and not match_name_keywords(n, cfg['lr_linear_proj_names'])
-                and not match_name_keywords(n, cfg['wd_norm_names'])
-                and p.requires_grad
-            ],
-            "lr": cfg['lr_backbone'],
-            "weight_decay": cfg['weight_decay'],
-        },
-        {
-            "params": [
-                p if not return_name else n
-                for n, p in model.named_parameters()
-                if not match_name_keywords(n, cfg['lr_backbone_names'])
-                and match_name_keywords(n, cfg['lr_linear_proj_names'])
-                and not match_name_keywords(n, cfg['wd_norm_names'])
-                and p.requires_grad
-            ],
-            "lr": cfg['base_lr'] * cfg['lr_linear_proj_mult'],
-            "weight_decay": cfg['weight_decay'],
-        },
-        {
-            "params": [
-                p if not return_name else n
-                for n, p in model.named_parameters()
-                if not match_name_keywords(n, cfg['lr_backbone_names'])
-                and not match_name_keywords(n, cfg['lr_linear_proj_names'])
-                and match_name_keywords(n, cfg['wd_norm_names'])
-                and p.requires_grad
-            ],
-            "lr": cfg['base_lr'],
-            "weight_decay": cfg['weight_decay'] * cfg['wd_norm_mult'],
-        },
-        {
-            "params": [
-                p if not return_name else n
-                for n, p in model.named_parameters()
-                if match_name_keywords(n, cfg['lr_backbone_names'])
-                and not match_name_keywords(n, cfg['lr_linear_proj_names'])
-                and match_name_keywords(n, cfg['wd_norm_names'])
-                and p.requires_grad
-            ],
-            "lr": cfg['lr_backbone'],
-            "weight_decay": cfg['weight_decay'] * cfg['wd_norm_mult'],
-        },
-        {
-            "params": [
-                p if not return_name else n
-                for n, p in model.named_parameters()
-                if not match_name_keywords(n, cfg['lr_backbone_names'])
-                and match_name_keywords(n, cfg['lr_linear_proj_names'])
-                and match_name_keywords(n, cfg['wd_norm_names'])
-                and p.requires_grad
-            ],
-            "lr": cfg['base_lr'] * cfg['lr_linear_proj_mult'],
-            "weight_decay": cfg['weight_decay'] * cfg['wd_norm_mult'],
-        },
-    ]
-
-    return param_dicts
-
-## Model EMA
-class ModelEMA(object):
-    def __init__(self, cfg, model, updates=0):
-        # Create EMA
-        self.ema = deepcopy(self.de_parallel(model)).eval()  # FP32 EMA
-        self.updates = updates  # number of EMA updates
-        self.decay = lambda x: cfg['ema_decay'] * (1 - math.exp(-x / cfg['ema_tau']))  # decay exponential ramp (to help early epochs)
-        for p in self.ema.parameters():
-            p.requires_grad_(False)
-
-    def is_parallel(self, model):
-        # Returns True if model is of type DP or DDP
-        return type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel)
-
-    def de_parallel(self, model):
-        # De-parallelize a model: returns single-GPU model if model is of type DP or DDP
-        return model.module if self.is_parallel(model) else model
-
-    def copy_attr(self, a, b, include=(), exclude=()):
-        # Copy attributes from b to a, options to only include [...] and to exclude [...]
-        for k, v in b.__dict__.items():
-            if (len(include) and k not in include) or k.startswith('_') or k in exclude:
-                continue
-            else:
-                setattr(a, k, v)
-
-    def update(self, model):
-        # Update EMA parameters
-        self.updates += 1
-        d = self.decay(self.updates)
-
-        msd = self.de_parallel(model).state_dict()  # model state_dict
-        for k, v in self.ema.state_dict().items():
-            if v.dtype.is_floating_point:  # true for FP16 and FP32
-                v *= d
-                v += (1 - d) * msd[k].detach()
-        # assert v.dtype == msd[k].dtype == torch.float32, f'{k}: EMA {v.dtype} and model {msd[k].dtype} must be FP32'
-
-    def update_attr(self, model, include=(), exclude=('process_group', 'reducer')):
-        # Update EMA attributes
-        self.copy_attr(self.ema, model, include, exclude)
-
 
 # ---------------------------- For Loss ----------------------------
 ## focal loss

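The ModelEMA helper removed above applied the standard exponential-moving-average update with a ramped decay; a minimal sketch of that rule (the 0.9998/2000 constants are placeholders, not values from this repo):

    import math

    def ema_update(ema_params, model_params, updates, decay=0.9998, tau=2000):
        d = decay * (1 - math.exp(-updates / tau))   # small early on, approaches `decay`
        for v, p in zip(ema_params, model_params):
            v.mul_(d).add_(p.detach(), alpha=1.0 - d)
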
+ 23 - 80
odlab/utils/optimizer.py

@@ -2,97 +2,40 @@ import torch
 from torch import optim
 
 
-def build_optimizer(optimizer_cfg, model, param_dicts=None, resume=None):
+def build_optimizer(cfg, model, resume=None):
     print('==============================')
-    print('Optimizer: {}'.format(optimizer_cfg['optimizer']))
-    print('--base_lr: {}'.format(optimizer_cfg['base_lr']))
-    print('--backbone_lr_ratio: {}'.format(optimizer_cfg['backbone_lr_ratio']))
-    print('--momentum: {}'.format(optimizer_cfg['momentum']))
-    print('--weight_decay: {}'.format(optimizer_cfg['weight_decay']))
-
-    if param_dicts is None:
-        param_dicts = [
-            {"params": [p for n, p in model.named_parameters() if "backbone" not in n and p.requires_grad]},
-            {
-                "params": [p for n, p in model.named_parameters() if "backbone" in n and p.requires_grad],
-                "lr": optimizer_cfg['base_lr'] * optimizer_cfg['backbone_lr_ratio'],
-            },
-        ]
-
-    if optimizer_cfg['optimizer'] == 'sgd':
+    print('Optimizer: {}'.format(cfg.optimizer))
+    print('--base_lr: {}'.format(cfg.base_lr))
+    print('--backbone_lr_ratio: {}'.format(cfg.backbone_lr_ratio))
+    print('--momentum: {}'.format(cfg.momentum))
+    print('--weight_decay: {}'.format(cfg.weight_decay))
+
+    param_dicts = [
+        {"params": [p for n, p in model.named_parameters() if "backbone" not in n and p.requires_grad]},
+        {
+            "params": [p for n, p in model.named_parameters() if "backbone" in n and p.requires_grad],
+            "lr": cfg.base_lr * cfg.backbone_lr_ratio,
+        },
+    ]
+
+    if cfg.optimizer == 'sgd':
         optimizer = optim.SGD(
             params=param_dicts, 
-            lr=optimizer_cfg['base_lr'],
-            momentum=optimizer_cfg['momentum'],
-            weight_decay=optimizer_cfg['weight_decay']
+            lr=cfg.base_lr,
+            momentum=cfg.momentum,
+            weight_decay=cfg.weight_decay
             )
                                 
-    elif optimizer_cfg['optimizer'] == 'adamw':
+    elif cfg.optimizer == 'adamw':
         optimizer = optim.AdamW(
             params=param_dicts, 
-            lr=optimizer_cfg['base_lr'],
-            weight_decay=optimizer_cfg['weight_decay']
+            lr=cfg.base_lr,
+            weight_decay=cfg.weight_decay
             )
                                 
     start_epoch = 0
     if resume is not None:
-        print('keep training: ', resume)
-        checkpoint = torch.load(resume)
-        # checkpoint state dict
-        checkpoint_state_dict = checkpoint.pop("optimizer")
-        optimizer.load_state_dict(checkpoint_state_dict)
-        start_epoch = checkpoint.pop("epoch") + 1
-                                                        
-    return optimizer, start_epoch
-
-
-def build_detr_optimizer(optimizer_cfg, model, resume=None):
-    print('==============================')
-    print('Optimizer: {}'.format(optimizer_cfg['optimizer']))
-    print('--base_lr: {}'.format(optimizer_cfg['base_lr']))
-    print('--backbone_lr_ratio: {}'.format(optimizer_cfg['backbone_lr_ratio']))
-    print('--weight_decay: {}'.format(optimizer_cfg['weight_decay']))
-
-    # ------------- Divide model's parameters -------------
-    param_dicts = [], [], [], [], [], [], []
-    norm_names = ["norm"] + ["norm{}".format(i) for i in range(10000)]
-    for n, p in model.named_parameters():
-        # Non-Backbone's learnable parameters
-        if "backbone" not in n and p.requires_grad:
-            if "bias" == n.split(".")[-1]:
-                param_dicts[0].append(p)      # no weight decay for all layers' bias
-            else:
-                if n.split(".")[-2] in norm_names:
-                    param_dicts[1].append(p)  # no weight decay for all NormLayers' weight
-                elif "cpb_mlp1" in n.split(".") or "cpb_mlp2" in n.split("."):
-                    param_dicts[2].append(p)  # no weight decay for plain-detr cpb_mlp weight
-                else:
-                    param_dicts[3].append(p)  # weight decay for all Non-NormLayers' weight
-        # Backbone's learnable parameters
-        elif "backbone" in n and p.requires_grad:
-            if "bias" == n.split(".")[-1]:
-                param_dicts[4].append(p)      # no weight decay for all layers' bias
-            else:
-                if n.split(".")[-2] in norm_names:
-                    param_dicts[5].append(p)  # no weight decay for all NormLayers' weight
-                else:
-                    param_dicts[6].append(p)  # weight decay for all Non-NormLayers' weight
-
-    # Non-Backbone's learnable parameters
-    optimizer = torch.optim.AdamW(param_dicts[0], lr=optimizer_cfg['base_lr'], weight_decay=0.0)
-    optimizer.add_param_group({"params": param_dicts[1], "weight_decay": 0.0})
-    optimizer.add_param_group({"params": param_dicts[2], "weight_decay": 0.0})
-    optimizer.add_param_group({"params": param_dicts[3], "weight_decay": optimizer_cfg['weight_decay']})
-
-    # Backbone's learnable parameters
-    backbone_lr = optimizer_cfg['base_lr'] * optimizer_cfg['backbone_lr_ratio']
-    optimizer.add_param_group({"params": param_dicts[4], "lr": backbone_lr, "weight_decay": 0.0})
-    optimizer.add_param_group({"params": param_dicts[5], "lr": backbone_lr, "weight_decay": 0.0})
-    optimizer.add_param_group({"params": param_dicts[6], "lr": backbone_lr, "weight_decay": optimizer_cfg['weight_decay']})
-
-    start_epoch = 0
-    if resume is not None:
-        print('keep training: ', resume)
+        print('Load optimizer from the checkpoint: ', resume)
         checkpoint = torch.load(resume)
         # checkpoint state dict
         checkpoint_state_dict = checkpoint.pop("optimizer")

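build_optimizer now reads plain attributes from the config object instead of dict keys; a minimal stand-in config for a quick smoke test could look like this (SimpleNamespace and the values are illustrative, and `model` is assumed to be an already-built detector):

    from types import SimpleNamespace

    cfg = SimpleNamespace(optimizer='adamw', base_lr=1e-4, backbone_lr_ratio=0.1,
                          momentum=0.9, weight_decay=1e-4)
    optimizer, start_epoch = build_optimizer(cfg, model, resume=None)
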
+ 0 - 107
odlab/utils/plot_utils.py

@@ -1,107 +0,0 @@
-"""
-Plotting utilities to visualize training logs.
-"""
-import torch
-import pandas as pd
-import numpy as np
-import seaborn as sns
-import matplotlib.pyplot as plt
-
-from pathlib import Path, PurePath
-
-
-def plot_logs(logs, fields=('class_error', 'loss_bbox_unscaled', 'mAP'), ewm_col=0, log_name='log.txt'):
-    '''
-    Function to plot specific fields from training log(s). Plots both training and test results.
-
-    :: Inputs - logs = list containing Path objects, each pointing to individual dir with a log file
-              - fields = which results to plot from each log file - plots both training and test for each field.
-              - ewm_col = optional, which column to use as the exponential weighted smoothing of the plots
-              - log_name = optional, name of log file if different than default 'log.txt'.
-
-    :: Outputs - matplotlib plots of results in fields, color coded for each log file.
-               - solid lines are training results, dashed lines are test results.
-
-    '''
-    func_name = "plot_utils.py::plot_logs"
-
-    # verify logs is a list of Paths (list[Paths]) or single Pathlib object Path,
-    # convert single Path to list to avoid 'not iterable' error
-
-    if not isinstance(logs, list):
-        if isinstance(logs, PurePath):
-            logs = [logs]
-            print(f"{func_name} info: logs param expects a list argument, converted to list[Path].")
-        else:
-            raise ValueError(f"{func_name} - invalid argument for logs parameter.\n \
-            Expect list[Path] or single Path obj, received {type(logs)}")
-
-    # Quality checks - verify valid dir(s), that every item in list is Path object, and that log_name exists in each dir
-    for i, dir in enumerate(logs):
-        if not isinstance(dir, PurePath):
-            raise ValueError(f"{func_name} - non-Path object in logs argument of {type(dir)}: \n{dir}")
-        if not dir.exists():
-            raise ValueError(f"{func_name} - invalid directory in logs argument:\n{dir}")
-        # verify log_name exists
-        fn = Path(dir / log_name)
-        if not fn.exists():
-            print(f"-> missing {log_name}.  Have you gotten to Epoch 1 in training?")
-            print(f"--> full path of missing log file: {fn}")
-            return
-
-    # load log file(s) and plot
-    dfs = [pd.read_json(Path(p) / log_name, lines=True) for p in logs]
-
-    fig, axs = plt.subplots(ncols=len(fields), figsize=(16, 5))
-
-    for df, color in zip(dfs, sns.color_palette(n_colors=len(logs))):
-        for j, field in enumerate(fields):
-            if field == 'mAP':
-                coco_eval = pd.DataFrame(
-                    np.stack(df.test_coco_eval_bbox.dropna().values)[:, 1]
-                ).ewm(com=ewm_col).mean()
-                axs[j].plot(coco_eval, c=color)
-            else:
-                df.interpolate().ewm(com=ewm_col).mean().plot(
-                    y=[f'train_{field}', f'test_{field}'],
-                    ax=axs[j],
-                    color=[color] * 2,
-                    style=['-', '--']
-                )
-    for ax, field in zip(axs, fields):
-        ax.legend([Path(p).name for p in logs])
-        ax.set_title(field)
-
-
-def plot_precision_recall(files, naming_scheme='iter'):
-    if naming_scheme == 'exp_id':
-        # name becomes exp_id
-        names = [f.parts[-3] for f in files]
-    elif naming_scheme == 'iter':
-        names = [f.stem for f in files]
-    else:
-        raise ValueError(f'not supported {naming_scheme}')
-    fig, axs = plt.subplots(ncols=2, figsize=(16, 5))
-    for f, color, name in zip(files, sns.color_palette("Blues", n_colors=len(files)), names):
-        data = torch.load(f)
-        # precision is n_iou, n_points, n_cat, n_area, max_det
-        precision = data['precision']
-        recall = data['params'].recThrs
-        scores = data['scores']
-        # take precision for all classes, all areas and 100 detections
-        precision = precision[0, :, :, 0, -1].mean(1)
-        scores = scores[0, :, :, 0, -1].mean(1)
-        prec = precision.mean()
-        rec = data['recall'][0, :, 0, -1].mean()
-        print(f'{naming_scheme} {name}: mAP@50={prec * 100: 05.1f}, ' +
-              f'score={scores.mean():0.3f}, ' +
-              f'f1={2 * prec * rec / (prec + rec + 1e-8):0.3f}'
-              )
-        axs[0].plot(recall, precision, c=color)
-        axs[1].plot(recall, scores, c=color)
-
-    axs[0].set_title('Precision / Recall')
-    axs[0].legend(names)
-    axs[1].set_title('Scores / Recall')
-    axs[1].legend(names)
-    return fig, axs

+ 9 - 96
odlab/utils/vis_tools.py

@@ -61,7 +61,7 @@ def vis_data(images, targets, masks=None, class_labels=None, normalized_coord=Fa
         cv2.imshow('train target', image)
         cv2.waitKey(0)
 
-## plot bbox & label on image
+## Draw bbox & label on the image
 def plot_bbox_labels(img, bbox, label=None, cls_color=None, text_scale=0.4):
     x1, y1, x2, y2 = bbox
     x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
@@ -77,101 +77,14 @@ def plot_bbox_labels(img, bbox, label=None, cls_color=None, text_scale=0.4):
 
     return img
 
-## visualize detection
-def visualize(img, 
-              bboxes, 
-              scores, 
-              labels, 
-              vis_thresh, 
-              class_colors, 
-              class_names):
+## Visualize the detection results
+def visualize(image, bboxes, scores, labels, class_colors, class_names):
     ts = 0.4
     for i, bbox in enumerate(bboxes):
-        if scores[i] > vis_thresh:
-            cls_id = int(labels[i])
-            cls_color = class_colors[cls_id]
-                
-            mess = '%s: %.2f' % (class_names[cls_id], scores[i])
-            img = plot_bbox_labels(img, bbox, mess, cls_color, text_scale=ts)
-
-    return img
-        
-
-## convert a feature map to a heatmap
-def convert_feature_heatmap(feature):
-    """
-        feature: (ndarray) [H, W, C]
-    """
-    heatmap = None
-
-    return heatmap
-
-## draw feature on the image
-def draw_feature(img, features, save=None):
-    """
-        img: (ndarray & cv2.Mat) [H, W, C], where the C is 3 for RGB or 1 for Gray.
-        features: (List[ndarray]). It is a list of the multiple feature map whose shape is [H, W, C].
-        save: (bool) save the result or not.
-    """
-    img_h, img_w = img.shape[:2]
-
-    for i, fmp in enumerate(features):
-        hmp = convert_feature_heatmap(fmp)
-        hmp = cv2.resize(hmp, (img_w, img_h))
-        hmp = hmp.astype(np.uint8)*255
-        hmp_rgb = cv2.applyColorMap(hmp, cv2.COLORMAP_JET)
-        
-        superimposed_img = hmp_rgb * 0.4 + img 
-
-        # show the heatmap
-        plt.imshow(hmp)
-        plt.close()
-
-        # show the image with heatmap
-        cv2.imshow("image with heatmap", superimposed_img)
-        cv2.waitKey(0)
-        cv2.destroyAllWindows()
-
-        if save:
-            save_dir = 'feature_heatmap'
-            os.makedirs(save_dir, exist_ok=True)
-            cv2.imwrite(os.path.join(save_dir, 'feature_{}.png'.format(i) ), superimposed_img)    
-
-
-# -------------------------- For Tracking Task --------------------------
-def get_color(idx):
-    idx = idx * 3
-    color = ((37 * idx) % 255, (17 * idx) % 255, (29 * idx) % 255)
-
-    return color
-
-
-def plot_tracking(image, tlwhs, obj_ids, scores=None, frame_id=0, fps=0., ids2=None):
-    im = np.ascontiguousarray(np.copy(image))
-    im_h, im_w = im.shape[:2]
-
-    top_view = np.zeros([im_w, im_w, 3], dtype=np.uint8) + 255
-
-    #text_scale = max(1, image.shape[1] / 1600.)
-    #text_thickness = 2
-    #line_thickness = max(1, int(image.shape[1] / 500.))
-    text_scale = 2
-    text_thickness = 2
-    line_thickness = 3
-
-    radius = max(5, int(im_w/140.))
-    cv2.putText(im, 'frame: %d fps: %.2f num: %d' % (frame_id, fps, len(tlwhs)),
-                (0, int(15 * text_scale)), cv2.FONT_HERSHEY_PLAIN, 2, (0, 0, 255), thickness=2)
+        cls_id = int(labels[i])
+        cls_color = class_colors[cls_id]
+            
+        mess = '%s: %.2f' % (class_names[cls_id], scores[i])
+        image = plot_bbox_labels(image, bbox, mess, cls_color, text_scale=ts)
 
-    for i, tlwh in enumerate(tlwhs):
-        x1, y1, w, h = tlwh
-        intbox = tuple(map(int, (x1, y1, x1 + w, y1 + h)))
-        obj_id = int(obj_ids[i])
-        id_text = '{}'.format(int(obj_id))
-        if ids2 is not None:
-            id_text = id_text + ', {}'.format(int(ids2[i]))
-        color = get_color(abs(obj_id))
-        cv2.rectangle(im, intbox[0:2], intbox[2:4], color=color, thickness=line_thickness)
-        cv2.putText(im, id_text, (intbox[0], intbox[1]), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 0, 255),
-                    thickness=text_thickness)
-    return im
+    return image
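
The rewritten visualize no longer takes a score threshold, so any score filtering is expected to happen upstream; if extra filtering is wanted at call time, a simple pattern is the following (the 0.35 value is arbitrary):

    keep = scores > 0.35
    img = visualize(image=image,
                    bboxes=bboxes[keep],
                    scores=scores[keep],
                    labels=labels[keep],
                    class_colors=class_colors,
                    class_names=class_names)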