
build a new YOLO-Tutorial project for my book

yjh0410, 1 year ago
parent
commit 264112178f
100 changed files with 8,769 additions and 16 deletions
  1. .gitignore  (+0, -16)
  2. odlab/.gitignore  (+10, -0)
  3. odlab/LICENSE  (+201, -0)
  4. odlab/README.md  (+21, -0)
  5. odlab/benchmark.py  (+111, -0)
  6. odlab/config/__init__.py  (+21, -0)
  7. odlab/config/detr_config.py  (+98, -0)
  8. odlab/config/fcos_config.py  (+167, -0)
  9. odlab/config/retinanet_config.py  (+175, -0)
  10. odlab/config/yolof_config.py  (+262, -0)
  11. odlab/datasets/__init__.py  (+34, -0)
  12. odlab/datasets/coco.py  (+175, -0)
  13. odlab/datasets/demo/images/000000000632.jpg  (BIN)
  14. odlab/datasets/demo/images/000000000785.jpg  (BIN)
  15. odlab/datasets/demo/images/000000000872.jpg  (BIN)
  16. odlab/datasets/demo/images/000000000885.jpg  (BIN)
  17. odlab/datasets/demo/images/000000001000.jpg  (BIN)
  18. odlab/datasets/demo/images/000000001268.jpg  (BIN)
  19. odlab/datasets/demo/images/000000001296.jpg  (BIN)
  20. odlab/datasets/demo/images/000000001532.jpg  (BIN)
  21. odlab/datasets/demo/videos/000006.mp4  (BIN)
  22. odlab/datasets/transforms.py  (+359, -0)
  23. odlab/demo.py  (+256, -0)
  24. odlab/engine.py  (+100, -0)
  25. odlab/evaluator/__init__.py  (+10, -0)
  26. odlab/evaluator/coco_evaluator.py  (+98, -0)
  27. odlab/main.py  (+242, -0)
  28. odlab/models/backbone/__init__.py  (+15, -0)
  29. odlab/models/backbone/resnet.py  (+254, -0)
  30. odlab/models/basic/attn.py  (+95, -0)
  31. odlab/models/basic/conv.py  (+297, -0)
  32. odlab/models/basic/mlp.py  (+54, -0)
  33. odlab/models/basic/norm.py  (+55, -0)
  34. odlab/models/basic/transformer.py  (+246, -0)
  35. odlab/models/detectors/__init__.py  (+40, -0)
  36. odlab/models/detectors/detr/README.md  (+57, -0)
  37. odlab/models/detectors/detr/build.py  (+25, -0)
  38. odlab/models/detectors/detr/criterion.py  (+212, -0)
  39. odlab/models/detectors/detr/detr.py  (+347, -0)
  40. odlab/models/detectors/detr/matcher.py  (+99, -0)
  41. odlab/models/detectors/fcos/README.md  (+72, -0)
  42. odlab/models/detectors/fcos/build.py  (+24, -0)
  43. odlab/models/detectors/fcos/criterion.py  (+267, -0)
  44. odlab/models/detectors/fcos/fcos.py  (+122, -0)
  45. odlab/models/detectors/fcos/matcher.py  (+382, -0)
  46. odlab/models/detectors/retinanet/README.md  (+55, -0)
  47. odlab/models/detectors/retinanet/build.py  (+24, -0)
  48. odlab/models/detectors/retinanet/criterion.py  (+136, -0)
  49. odlab/models/detectors/retinanet/matcher.py  (+181, -0)
  50. odlab/models/detectors/retinanet/retinanet.py  (+123, -0)
  51. odlab/models/detectors/yolof/README.md  (+67, -0)
  52. odlab/models/detectors/yolof/build.py  (+24, -0)
  53. odlab/models/detectors/yolof/criterion.py  (+151, -0)
  54. odlab/models/detectors/yolof/matcher.py  (+114, -0)
  55. odlab/models/detectors/yolof/yolof.py  (+106, -0)
  56. odlab/models/head/__init__.py  (+42, -0)
  57. odlab/models/head/fcos_head.py  (+185, -0)
  58. odlab/models/head/retinanet_head.py  (+203, -0)
  59. odlab/models/head/yolof_head.py  (+185, -0)
  60. odlab/models/neck/__init__.py  (+57, -0)
  61. odlab/models/neck/dilated_encoder.py  (+72, -0)
  62. odlab/models/neck/fpn.py  (+80, -0)
  63. odlab/models/neck/hybrid_encoder.py  (+142, -0)
  64. odlab/models/neck/spp.py  (+25, -0)
  65. odlab/test.py  (+179, -0)
  66. odlab/train.sh  (+52, -0)
  67. odlab/utils/__init__.py  (+1, -0)
  68. odlab/utils/box_ops.py  (+204, -0)
  69. odlab/utils/distributed_utils.py  (+122, -0)
  70. odlab/utils/dn_compoments.py  (+98, -0)
  71. odlab/utils/lr_scheduler.py  (+60, -0)
  72. odlab/utils/misc.py  (+572, -0)
  73. odlab/utils/optimizer.py  (+102, -0)
  74. odlab/utils/plot_utils.py  (+107, -0)
  75. odlab/utils/vis_tools.py  (+177, -0)
  76. odlab/utils/weight_init.py  (+110, -0)
  77. yolo/.gitignore  (+10, -0)
  78. yolo/LICENSE  (+0, -0)
  79. yolo/README.md  (+0, -0)
  80. yolo/config/__init__.py  (+0, -0)
  81. yolo/config/gelan_config.py  (+0, -0)
  82. yolo/config/rtdetr_config.py  (+0, -0)
  83. yolo/config/yolov1_config.py  (+0, -0)
  84. yolo/config/yolov2_config.py  (+0, -0)
  85. yolo/config/yolov3_config.py  (+0, -0)
  86. yolo/config/yolov5_af_config.py  (+0, -0)
  87. yolo/config/yolov5_config.py  (+0, -0)
  88. yolo/config/yolov6_config.py  (+0, -0)
  89. yolo/config/yolov7_af_config.py  (+0, -0)
  90. yolo/config/yolov8_config.py  (+0, -0)
  91. yolo/dataset/__init__.py  (+0, -0)
  92. yolo/dataset/build.py  (+0, -0)
  93. yolo/dataset/coco.py  (+0, -0)
  94. yolo/dataset/customed.py  (+0, -0)
  95. yolo/dataset/data_augment/ssd_augment.py  (+0, -0)
  96. yolo/dataset/data_augment/strong_augment.py  (+0, -0)
  97. yolo/dataset/data_augment/yolo_augment.py  (+0, -0)
  98. yolo/dataset/scripts/COCO2017.sh  (+0, -0)
  99. yolo/dataset/scripts/VOC2007.sh  (+0, -0)
  100. yolo/dataset/scripts/VOC2012.sh  (+0, -0)

+ 0 - 16
.gitignore

@@ -1,16 +0,0 @@
-*.pt
-*.pth
-*.pkl
-*.onnx
-*.pyc
-*.zip
-weights
-__pycache__
-det_results
-.vscode
-deployment/OpenVINO/cpp/build
-cluster.json
-train_nebula.py
-train_nebula.sh
-make_data_nebula.sh
-dataset/make_dataset_nebula.py

+ 10 - 0
odlab/.gitignore

@@ -0,0 +1,10 @@
+*.pt
+*.pth
+*.pkl
+*.onnx
+*.pyc
+*.zip
+weights
+__pycache__
+det_results
+.vscode

+ 201 - 0
odlab/LICENSE

@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2020 - present, Facebook, Inc
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

+ 21 - 0
odlab/README.md

@@ -0,0 +1,21 @@
+# General Object Detection Laboratory
+The codebase for my research on general object detection.
+## Requirements
+- We recommend using Anaconda to create a conda environment:
+```Shell
+conda create -n odlab python=3.8
+```
+
+- Then, activate the environment:
+```Shell
+conda activate odlab
+```
+
+- Install the requirements:
+```Shell
+pip install -r requirements.txt 
+```
+
+My PyTorch environment:
+- PyTorch = 2.2.0+cu121
+- Torchvision = 0.17.0+cu121
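
One way to reproduce the PyTorch build listed in the README above (a hedged sketch, assuming a CUDA 12.1 machine; the commit itself does not pin these versions in a requirements file):
```Shell
pip install torch==2.2.0 torchvision==0.17.0 --index-url https://download.pytorch.org/whl/cu121
```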

+ 111 - 0
odlab/benchmark.py

@@ -0,0 +1,111 @@
+import argparse
+import numpy as np
+import time
+import os
+import torch
+
+from datasets import build_dataset, build_transform
+from utils.misc import compute_flops, fuse_conv_bn
+from utils.misc import load_weight
+
+from config import build_config
+from models.detectors import build_model
+
+
+parser = argparse.ArgumentParser(description='Benchmark')
+# Model
+parser.add_argument('-m', '--model', default='fcos_r18_1x',
+                    help='build detector')
+parser.add_argument('--fuse_conv_bn', action='store_true', default=False,
+                    help='fuse conv and bn')
+parser.add_argument('--topk', default=100, type=int,
+                    help='max number of detections to keep')
+parser.add_argument('--weight', default=None, type=str,
+                    help='Trained state_dict file path to open')
+# Data root
+parser.add_argument('--root', default='/data/datasets/COCO',
+                    help='data root')
+# cuda
+parser.add_argument('--cuda', action='store_true', default=False, 
+                    help='use cuda.')
+
+args = parser.parse_args()
+
+
+def test(cfg, model, device, dataset, transform):
+    # Step-1: Compute FLOPs and Params
+    compute_flops(
+        model=model,
+        min_size=cfg['test_min_size'],
+        max_size=cfg['test_max_size'],
+        device=device)
+
+    # Step-2: Compute FPS
+    num_images = 2002
+    total_time = 0
+    count = 0
+    with torch.no_grad():
+        for index in range(num_images):
+            if index % 500 == 0:
+                print('Testing image {:d}/{:d}....'.format(index+1, num_images))
+            image, _ = dataset[index]
+            orig_h, orig_w = image.height, image.width
+
+            # PreProcess
+            x, _ = transform(image)
+            x = x.unsqueeze(0).to(device)
+
+            # start time
+            torch.cuda.synchronize()
+            start_time = time.perf_counter()    
+
+            # inference
+            bboxes, scores, labels = model(x)
+            
+            # Rescale bboxes
+            bboxes[..., 0::2] *= orig_w
+            bboxes[..., 1::2] *= orig_h
+
+            # end time
+            torch.cuda.synchronize()
+            elapsed = time.perf_counter() - start_time
+
+            # print("detection time used ", elapsed, "s")
+            if index > 1:
+                total_time += elapsed
+                count += 1
+            
+        print('- FPS :', 1.0 / (total_time / count))
+
+
+
+if __name__ == '__main__':
+    # get device
+    if args.cuda:
+        print('use cuda')
+        device = torch.device("cuda")
+    else:
+        device = torch.device("cpu")
+
+    # Dataset & Model Config
+    cfg = build_config(args)
+
+    # Transform
+    transform = build_transform(cfg, is_train=False)
+
+    # Dataset
+    args.dataset = 'coco'
+    dataset, dataset_info = build_dataset(args, is_train=False)
+
+    # Model
+    model = build_model(args, cfg, device, dataset_info['num_classes'], False)
+    model = load_weight(model, args.weight, args.fuse_conv_bn)
+    model.to(device).eval()
+
+    # fuse conv bn
+    if args.fuse_conv_bn:
+        print('fuse conv and bn ...')
+        model = fuse_conv_bn(model)
+
+    # run
+    test(cfg, model, device, dataset, transform)

+ 21 - 0
odlab/config/__init__.py

@@ -0,0 +1,21 @@
+# ----------------------- Model Config -----------------------
+from .retinanet_config import retinanet_cfg
+from .fcos_config      import fcos_cfg
+from .yolof_config     import yolof_cfg
+from .detr_config      import detr_cfg
+
+def build_config(args):
+    # RetinaNet
+    if args.model in retinanet_cfg.keys():
+        return retinanet_cfg[args.model]
+    # FCOS
+    elif args.model in fcos_cfg.keys():
+        return fcos_cfg[args.model]
+    # YOLOF
+    elif args.model in yolof_cfg.keys():
+        return yolof_cfg[args.model]
+    # DETR
+    elif args.model in detr_cfg.keys():
+        return detr_cfg[args.model]
+    else:
+        raise NotImplementedError('Unknown Model: {}'.format(args.model))

+ 98 - 0
odlab/config/detr_config.py

@@ -0,0 +1,98 @@
+# Plain DETR
+
+detr_cfg = {
+    'detr_r50':{
+        # ---------------- Model config ----------------
+        ## Model scale
+        # Backbone
+        'backbone': 'resnet50',
+        'backbone_norm': 'FrozeBN',
+        'res5_dilation': False,
+        'pretrained': True,
+        'pretrained_weight': 'spark_resnet50',  # Cls: imagenet1k_v2; MIM: spark_resnet50
+        'freeze_at': 1,  # freeze stem layer + layer1 of the backbone
+        'max_stride': 32,
+        'out_stride': 16,
+        # Transformer Encoder
+        'hidden_dim': 256,
+        'en_num_heads': 8,
+        'en_num_layers': 6,
+        'en_ffn_dim': 2048,
+        'en_dropout': 0.1,
+        'en_act': 'gelu',
+        'en_pre_norm': True,
+        # Transformer Decoder
+        'transformer': 'detr_transformer',
+        'de_num_heads': 8,
+        'de_num_layers': 6,
+        'de_ffn_dim': 2048,
+        'de_dropout': 0.0,
+        'de_act': 'gelu',
+        'de_pre_norm': True,
+        'rpe_hidden_dim': 512,
+        'use_checkpoint': False,
+        'proposal_feature_levels': 3,
+        'proposal_tgt_strides': [8, 16, 32],
+        'num_queries_one2one': 300,
+        'num_queries_one2many': 1500,
+        # Post process
+        'train_topk': 300,
+        'train_conf_thresh': 0.001,
+        'train_nms_thresh': 0.5,
+        'test_topk': 300,
+        'test_conf_thresh': 0.001,
+        'test_nms_thresh': 0.5,
+        'nms_class_agnostic': True,  # We prefer to use class-agnostic NMS in the demo.
+        # ---------------- Assignment config ----------------
+        'matcher_hpy': {'cost_class': 2.0,
+                        'cost_bbox': 1.0,
+                        'cost_giou': 2.0,},
+        # ---------------- Loss config ----------------
+        'k_one2many': 6,
+        'lambda_one2many': 1.0,
+        'loss_coeff': {'class': 2,
+                       'bbox': 1,
+                       'giou': 2,},
+        # ----------------- Training -----------------
+        ## Optimizer
+        'optimizer': 'adamw',
+        'base_lr': 0.0002 / 16,
+        'backbone_lr_ratio': 0.1,
+        'momentum': None,
+        'weight_decay': 0.05,
+        'clip_max_norm': 0.1,
+        ## Params dict
+        'param_dict_type': 'detr',
+        'lr_backbone_names': ['backbone',],
+        'lr_linear_proj_names': ["reference_points", "sampling_offsets",],  # These two names are not required by PlainDETR
+        'lr_linear_proj_mult': 0.1,
+        'wd_norm_names': ["norm", "bias", "rpb_mlp", "cpb_mlp", "level_embed",],
+        'wd_norm_mult': 0.0,
+        ## LR Scheduler
+        'lr_scheduler': 'step',
+        'warmup': 'linear',
+        'warmup_iters': 1000,
+        'warmup_factor': 0.00066667,
+        ## Training scheduler
+        'scheduler': '1x',
+        'max_epoch': 12,      # 1x
+        'lr_epoch': [11],     # 1x
+        # ----------------- Input -----------------
+        ## Transforms
+        'train_min_size': [800],   # short edge of image
+        'train_min_size2': [400, 500, 600],
+        'train_max_size': 1333,
+        'test_min_size': [800],
+        'test_max_size': 1333,
+        'random_crop_size': [320, 600],
+        ## Pixel mean & std
+        'pixel_mean': [0.485, 0.456, 0.406],
+        'pixel_std':  [0.229, 0.224, 0.225],
+        ## Transforms
+        'detr_style': True,
+        'trans_config': None,
+        'box_format': 'xywh',
+        'normalize_coords': False,
+    },
+
+}

+ 167 - 0
odlab/config/fcos_config.py

@@ -0,0 +1,167 @@
+# Fully Convolutional One-Stage object detector
+
+
+fcos_cfg = {
+    'fcos_r18_1x':{
+        # ----------------- Model-----------------
+        ## Backbone
+        'backbone': 'resnet18',
+        'backbone_norm': 'FrozeBN',
+        'res5_dilation': False,
+        'pretrained': True,
+        'freeze_at': 1,  # freeze stem layer + layer1 of the backbone
+        'pretrained_weight': 'imagenet1k_v1',
+        'max_stride': 128,
+        'out_stride': [8, 16, 32, 64, 128],
+        ## Neck
+        'neck': 'basic_fpn',
+        'fpn_p6_feat': True,
+        'fpn_p7_feat': True,
+        'fpn_p6_from_c5': False,
+        ## Head
+        'head': 'fcos_head',
+        'head_dim': 256,
+        'num_cls_head': 4,
+        'num_reg_head': 4,
+        'head_act': 'relu',
+        'head_norm': 'GN',
+        ## Post-process
+        'train_topk': 1000,
+        'train_conf_thresh': 0.05,
+        'train_nms_thresh': 0.6,
+        'test_topk': 100,
+        'test_conf_thresh': 0.5,
+        'test_nms_thresh': 0.45,
+        'nms_class_agnostic': True,  # We prefer to use class-agnostic NMS in the demo.
+        # ----------------- Label Assignment -----------------
+        'matcher': 'fcos_matcher',
+        'matcher_hpy':{'center_sampling_radius': 1.5,
+                       'object_sizes_of_interest': [[-1, 64], [64, 128], [128, 256], [256, 512], [512, float('inf')]]
+                       },
+        # ----------------- Loss weight -----------------
+        ## Loss hyper-parameters
+        'focal_loss_alpha': 0.25,
+        'focal_loss_gamma': 2.0,
+        'loss_cls_weight': 1.0,
+        'loss_reg_weight': 1.0,
+        'loss_ctn_weight': 1.0,
+        # ----------------- Training -----------------
+        ## Training scheduler
+        'scheduler': '1x',
+        ## Optimizer
+        'optimizer': 'sgd',
+        'base_lr': 0.01 / 16,
+        'backbone_lr_ratio': 1.0 / 1.0,
+        'momentum': 0.9,
+        'weight_decay': 1e-4,
+        'clip_max_norm': -1.0,
+        'param_dict_type': 'default',
+        ## LR Scheduler
+        'lr_scheduler': 'step',
+        'warmup': 'linear',
+        'warmup_iters': 500,
+        'warmup_factor': 0.00066667,
+        ## Epoch
+        'max_epoch': 12,      # 1x
+        'lr_epoch': [8, 11],  # 1x
+        # ----------------- Input -----------------
+        ## Transforms
+        'train_min_size': [800],   # short edge of image
+        'train_max_size': 1333,
+        'test_min_size': [800],
+        'test_max_size': 1333,
+        ## Pixel mean & std
+        'pixel_mean': [0.485, 0.456, 0.406],
+        'pixel_std':  [0.229, 0.224, 0.225],
+        ## Transforms
+        'detr_style': False,
+        'trans_config': [
+            {'name': 'RandomHFlip'},
+            {'name': 'RandomResize'},
+        ],
+        'box_format': 'xyxy',
+        'normalize_coords': False,
+    },
+
+    'fcos_r50_1x':{
+        # ----------------- Model-----------------
+        ## Backbone
+        'backbone': 'resnet50',
+        'backbone_norm': 'FrozeBN',
+        'res5_dilation': False,
+        'pretrained': True,
+        'freeze_at': 1,  # freeze stem layer + layer1 of the backbone
+        'pretrained_weight': 'imagenet1k_v1',
+        'max_stride': 128,
+        'out_stride': [8, 16, 32, 64, 128],
+        ## Neck
+        'neck': 'basic_fpn',
+        'fpn_p6_feat': True,
+        'fpn_p7_feat': True,
+        'fpn_p6_from_c5': False,
+        ## Head
+        'head': 'fcos_head',
+        'head_dim': 256,
+        'num_cls_head': 4,
+        'num_reg_head': 4,
+        'head_act': 'relu',
+        'head_norm': 'GN',
+        ## Post-process
+        'train_topk': 1000,
+        'train_conf_thresh': 0.05,
+        'train_nms_thresh': 0.65,
+        'test_topk': 100,
+        'test_conf_thresh': 0.5,
+        'test_nms_thresh': 0.45,
+        'nms_class_agnostic': True,  # We prefer to use class-agnostic NMS in the demo.
+        # ----------------- Label Assignment -----------------
+        'matcher': 'fcos_matcher',
+        'matcher_hpy':{'center_sampling_radius': 1.5,
+                       'object_sizes_of_interest': [[-1, 64], [64, 128], [128, 256], [256, 512], [512, float('inf')]]
+                       },
+        # ----------------- Loss weight -----------------
+        ## Loss hyper-parameters
+        'focal_loss_alpha': 0.25,
+        'focal_loss_gamma': 2.0,
+        'loss_cls_weight': 1.0,
+        'loss_reg_weight': 1.0,
+        'loss_ctn_weight': 1.0,
+        # ----------------- Training -----------------
+        ## Training scheduler
+        'scheduler': '1x',
+        ## Optimizer
+        'optimizer': 'sgd',
+        'base_lr': 0.01 / 16,
+        'backbone_lr_ratio': 1.0 / 1.0,
+        'momentum': 0.9,
+        'weight_decay': 1e-4,
+        'clip_max_norm': -1.0,
+        'param_dict_type': 'default',
+        ## LR Scheduler
+        'lr_scheduler': 'step',
+        'warmup': 'linear',
+        'warmup_iters': 500,
+        'warmup_factor': 0.00066667,
+        ## Epoch
+        'max_epoch': 12,      # 1x
+        'lr_epoch': [8, 11],  # 1x
+        # ----------------- Input -----------------
+        ## Transforms
+        'train_min_size': [800],   # short edge of image
+        'train_max_size': 1333,
+        'test_min_size': [800],
+        'test_max_size': 1333,
+        ## Pixel mean & std
+        'pixel_mean': [0.485, 0.456, 0.406],
+        'pixel_std':  [0.229, 0.224, 0.225],
+        ## Transforms
+        'detr_style': False,
+        'trans_config': [
+            {'name': 'RandomHFlip'},
+            {'name': 'RandomResize'},
+        ],
+        'box_format': 'xyxy',
+        'normalize_coords': False,
+    },
+
+}

+ 175 - 0
odlab/config/retinanet_config.py

@@ -0,0 +1,175 @@
+# RetinaNet
+
+
+retinanet_cfg = {
+    'retinanet_r18_1x':{
+        # ----------------- Model-----------------
+        ## Backbone
+        'backbone': 'resnet18',
+        'backbone_norm': 'FrozeBN',
+        'res5_dilation': False,
+        'pretrained': True,
+        'pretrained_weight': 'imagenet1k_v1',
+        'freeze_at': 1,  # freeze stem layer + layer1 of the backbone        
+        'max_stride': 128,
+        'out_stride': [8, 16, 32, 64, 128],
+        ## Neck
+        'neck': 'basic_fpn',
+        'fpn_p6_feat': True,
+        'fpn_p7_feat': True,
+        'fpn_p6_from_c5': True,
+        ## Head
+        'head': 'retinanet_head',
+        'head_dim': 256,
+        'num_cls_head': 4,
+        'num_reg_head': 4,
+        'head_act': 'relu',
+        'head_norm': None,
+        'anchor_config': {'basic_size': [[32, 32], [64, 64], [128, 128], [256, 256], [512, 512]],
+                          'aspect_ratio': [0.5, 1.0, 2.0],
+                          'area_scale': [2 ** 0, 2 ** (1. / 3.), 2 ** (2. / 3.)]},
+        ## Post-process
+        'train_topk': 1000,
+        'train_conf_thresh': 0.05,
+        'train_nms_thresh': 0.6,
+        'test_topk': 100,
+        'test_conf_thresh': 0.3,
+        'test_nms_thresh': 0.45,
+        'nms_class_agnostic': True,  # We prefer to use class-agnostic NMS in the demo.
+        # ----------------- Label Assignment -----------------
+        'matcher': 'retinanet_matcher',
+        'matcher_hpy': {'iou_thresh': [0.4, 0.5],
+                        'iou_labels': [0, -1, 1], # [negative sample, ignored sample, positive sample]
+                        'allow_low_quality_matches': True,
+                        },
+        # ----------------- Loss weight -----------------
+        ## Loss hyper-parameters
+        'focal_loss_alpha': 0.25,
+        'focal_loss_gamma': 2.0,
+        'loss_cls_weight': 1.0,
+        'loss_reg_weight': 1.0,
+        'use_giou_loss': False,
+        # ----------------- Training -----------------
+        ## Training scheduler
+        'scheduler': '1x',
+        ## Optimizer
+        'optimizer': 'sgd',
+        'base_lr': 0.01 / 16,
+        'backbone_lr_ratio': 1.0 / 1.0,
+        'momentum': 0.9,
+        'weight_decay': 1e-4,
+        'clip_max_norm': -1.0,
+        'param_dict_type': 'default',
+        ## LR Scheduler
+        'lr_scheduler': 'step',
+        'warmup': 'linear',
+        'warmup_iters': 500,
+        'warmup_factor': 0.00066667,
+        ## Epoch
+        'max_epoch': 12,      # 1x
+        'lr_epoch': [8, 11],  # 1x
+        # ----------------- Input -----------------
+        ## Transforms
+        'train_min_size': [800],   # short edge of image
+        'train_max_size': 1333,
+        'test_min_size': [800],
+        'test_max_size': 1333,
+        ## Pixel mean & std
+        'pixel_mean': [0.485, 0.456, 0.406],
+        'pixel_std':  [0.229, 0.224, 0.225],
+        ## Transforms
+        'detr_style': False,
+        'trans_config': [
+            {'name': 'RandomHFlip'},
+            {'name': 'RandomResize'},
+        ],
+        'box_format': 'xyxy',
+        'normalize_coords': False,
+    },
+
+    'retinanet_r50_1x':{
+        # ----------------- Model-----------------
+        ## Backbone
+        'backbone': 'resnet50',
+        'backbone_norm': 'FrozeBN',
+        'res5_dilation': False,
+        'pretrained': True,
+        'pretrained_weight': 'imagenet1k_v1',
+        'freeze_at': 1,  # freeze stem layer + layer1 of the backbone        
+        'max_stride': 128,
+        'out_stride': [8, 16, 32, 64, 128],
+        ## Neck
+        'neck': 'basic_fpn',
+        'fpn_p6_feat': True,
+        'fpn_p7_feat': True,
+        'fpn_p6_from_c5': True,
+        ## Head
+        'head': 'retinanet_head',
+        'head_dim': 256,
+        'num_cls_head': 4,
+        'num_reg_head': 4,
+        'head_act': 'relu',
+        'head_norm': None,
+        'anchor_config': {'basic_size': [[32, 32], [64, 64], [128, 128], [256, 256], [512, 512]],
+                          'aspect_ratio': [0.5, 1.0, 2.0],
+                          'area_scale': [2 ** 0, 2 ** (1. / 3.), 2 ** (2. / 3.)]},
+        ## Post-process
+        'train_topk': 1000,
+        'train_conf_thresh': 0.05,
+        'train_nms_thresh': 0.6,
+        'test_topk': 100,
+        'test_conf_thresh': 0.3,
+        'test_nms_thresh': 0.45,
+        'nms_class_agnostic': True,  # We prefer to use class-agnostic NMS in the demo.
+        # ----------------- Label Assignment -----------------
+        'matcher': 'retinanet_matcher',
+        'matcher_hpy': {'iou_thresh': [0.4, 0.5],
+                        'iou_labels': [0, -1, 1], # [negative sample, ignored sample, positive sample]
+                        'allow_low_quality_matches': True,
+                        },
+        # ----------------- Loss weight -----------------
+        ## Loss hyper-parameters
+        'focal_loss_alpha': 0.25,
+        'focal_loss_gamma': 2.0,
+        'loss_cls_weight': 1.0,
+        'loss_reg_weight': 1.0,
+        'use_giou_loss': False,
+        # ----------------- Training -----------------
+        ## Training scheduler
+        'scheduler': '1x',
+        ## Optimizer
+        'optimizer': 'sgd',
+        'base_lr': 0.01 / 16,
+        'backbone_lr_ratio': 1.0 / 1.0,
+        'momentum': 0.9,
+        'weight_decay': 1e-4,
+        'clip_max_norm': -1.0,
+        'param_dict_type': 'default',
+        ## LR Scheduler
+        'lr_scheduler': 'step',
+        'warmup': 'linear',
+        'warmup_iters': 500,
+        'warmup_factor': 0.00066667,
+        ## Epoch
+        'max_epoch': 12,      # 1x
+        'lr_epoch': [8, 11],  # 1x
+        # ----------------- Input -----------------
+        ## Transforms
+        'train_min_size': [800],   # short edge of image
+        'train_max_size': 1333,
+        'test_min_size': [800],
+        'test_max_size': 1333,
+        ## Pixel mean & std
+        'pixel_mean': [0.485, 0.456, 0.406],
+        'pixel_std':  [0.229, 0.224, 0.225],
+        ## Transforms
+        'detr_style': False,
+        'trans_config': [
+            {'name': 'RandomHFlip'},
+            {'name': 'RandomResize'},
+        ],
+        'box_format': 'xyxy',
+        'normalize_coords': False,
+    },
+
+}
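
The anchor_config above (5 basic sizes x 3 aspect ratios x 3 area scales) yields 9 anchor shapes per pyramid level. Below is a standalone sketch of one common way such shapes are enumerated, assuming the usual RetinaNet convention of keeping the scaled area fixed while varying the aspect ratio; the actual generator lives in the head/matcher code, which is not shown in this excerpt:
```python
import math

def anchor_shapes(basic_size, aspect_ratios, area_scales):
    """Enumerate (w, h) anchor shapes for a single pyramid level."""
    bw, bh = basic_size
    shapes = []
    for s in area_scales:
        area = (bw * s) * (bh * s)          # scaled anchor area
        for ar in aspect_ratios:            # ar assumed to be h / w
            w = math.sqrt(area / ar)
            h = w * ar
            shapes.append((round(w, 1), round(h, 1)))
    return shapes

# 3 scales x 3 ratios = 9 shapes for the stride-8 level:
print(anchor_shapes([32, 32], [0.5, 1.0, 2.0], [2 ** 0, 2 ** (1. / 3.), 2 ** (2. / 3.)]))
```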

+ 262 - 0
odlab/config/yolof_config.py

@@ -0,0 +1,262 @@
+# YOLOF: You Only Look One-level Feature
+
+
+yolof_cfg = {
+    # --------------- C5 level ---------------
+    'yolof_r18_c5_1x':{
+        # ----------------- Model-----------------
+        ## Backbone
+        'backbone': 'resnet18',
+        'backbone_norm': 'FrozeBN',
+        'res5_dilation': False,
+        'pretrained': True,
+        'pretrained_weight': 'imagenet1k_v1',
+        'freeze_at': 1,  # freeze stem layer + layer1 of the backbone
+        'max_stride': 32,
+        'out_stride': 32,
+        ## Neck
+        'neck': 'dilated_encoder',
+        'neck_dilations': [2, 4, 6, 8],
+        'neck_expand_ratio': 0.25,
+        'neck_act': 'relu',
+        'neck_norm': 'BN',
+        ## Head
+        'head': 'yolof_head',
+        'head_dim': 512,
+        'num_cls_head': 2,
+        'num_reg_head': 4,
+        'head_act': 'relu',
+        'head_norm': 'BN',
+        'center_clamp': 32,         
+        'anchor_size': [[32, 32], [64, 64], [128, 128], [256, 256], [512, 512]],
+        ## Post-process
+        'train_topk': 1000,
+        'train_conf_thresh': 0.05,
+        'train_nms_thresh': 0.6,
+        'test_topk': 300,
+        'test_conf_thresh': 0.3,
+        'test_nms_thresh': 0.45,
+        'nms_class_agnostic': True,  # We prefer to use class-agnostic NMS in the demo.
+        # ----------------- Label Assignment -----------------
+        'matcher': 'yolof_matcher',
+        'matcher_hpy': {'topk_candidates': 4,
+                        'iou_thresh': 0.15,
+                        'ignore_thresh': 0.7,
+                        },
+        # ----------------- Loss weight -----------------
+        ## Loss hyper-parameters
+        'focal_loss_alpha': 0.25,
+        'focal_loss_gamma': 2.0,
+        'loss_cls_weight': 1.0,
+        'loss_reg_weight': 1.0,
+        # ----------------- Training -----------------
+        ## Training scheduler
+        'scheduler': '1x',
+        ## Optimizer
+        'optimizer': 'sgd',
+        'base_lr': 0.12 / 64,
+        'backbone_lr_ratio': 1.0 / 3.0,
+        'momentum': 0.9,
+        'weight_decay': 1e-4,
+        'clip_max_norm': 10.0,
+        'param_dict_type': 'default',
+        ## LR Scheduler
+        'lr_scheduler': 'step',
+        'warmup': 'linear',
+        'warmup_iters': 1500,
+        'warmup_factor': 0.00066667,
+        ## Epoch
+        'max_epoch': 12,      # 1x
+        'lr_epoch': [8, 11],  # 1x
+        # ----------------- Input -----------------
+        ## Transforms
+        'train_min_size': [800],   # short edge of image
+        'train_max_size': 1333,
+        'test_min_size': [800],
+        'test_max_size': 1333,
+        ## Pixel mean & std
+        'pixel_mean': [0.485, 0.456, 0.406],
+        'pixel_std':  [0.229, 0.224, 0.225],
+        ## Transforms
+        'detr_style': False,
+        'trans_config': [
+            {'name': 'RandomHFlip'},
+            {'name': 'RandomResize'},
+            {'name': 'RandomShift', 'max_shift': 32},
+        ],
+        'box_format': 'xyxy',
+        'normalize_coords': False,
+    },
+
+    'yolof_r50_c5_1x':{
+        # ----------------- Model-----------------
+        ## Backbone
+        'backbone': 'resnet50',
+        'backbone_norm': 'FrozeBN',
+        'res5_dilation': False,
+        'pretrained': True,
+        'pretrained_weight': 'imagenet1k_v1',
+        'freeze_at': 1,  # freeze stem layer + layer1 of the backbone
+        'max_stride': 32,
+        'out_stride': 32,
+        ## Neck
+        'neck': 'dilated_encoder',
+        'neck_dilations': [2, 4, 6, 8],
+        'neck_expand_ratio': 0.25,
+        'neck_act': 'relu',
+        'neck_norm': 'BN',
+        ## Head
+        'head': 'yolof_head',
+        'head_dim': 512,
+        'num_cls_head': 2,
+        'num_reg_head': 4,
+        'head_act': 'relu',
+        'head_norm': 'BN',
+        'center_clamp': 32,         
+        'anchor_size': [[32, 32], [64, 64], [128, 128], [256, 256], [512, 512]],
+        ## Post-process
+        'train_topk': 1000,
+        'train_conf_thresh': 0.05,
+        'train_nms_thresh': 0.6,
+        'test_topk': 300,
+        'test_conf_thresh': 0.3,
+        'test_nms_thresh': 0.45,
+        'nms_class_agnostic': True,  # We prefer to use class-agnostic NMS in the demo.
+        # ----------------- Label Assignment -----------------
+        'matcher': 'yolof_matcher',
+        'matcher_hpy': {'topk_candidates': 4,
+                        'iou_thresh': 0.15,
+                        'ignore_thresh': 0.7,
+                        },
+        # ----------------- Loss weight -----------------
+        ## Loss hyper-parameters
+        'focal_loss_alpha': 0.25,
+        'focal_loss_gamma': 2.0,
+        'loss_cls_weight': 1.0,
+        'loss_reg_weight': 1.0,
+        # ----------------- Training -----------------
+        ## Training scheduler
+        'scheduler': '1x',
+        ## Optimizer
+        'optimizer': 'sgd',
+        'base_lr': 0.12 / 64,
+        'backbone_lr_ratio': 1.0 / 3.0,
+        'momentum': 0.9,
+        'weight_decay': 1e-4,
+        'clip_max_norm': 10.0,
+        'param_dict_type': 'default',
+        ## LR Scheduler
+        'lr_scheduler': 'step',
+        'warmup': 'linear',
+        'warmup_iters': 1500,
+        'warmup_factor': 0.00066667,
+        ## Epoch
+        'max_epoch': 12,      # 1x
+        'lr_epoch': [8, 11],  # 1x
+        # ----------------- Input -----------------
+        ## Transforms
+        'train_min_size': [800],   # short edge of image
+        'train_max_size': 1333,
+        'test_min_size': [800],
+        'test_max_size': 1333,
+        ## Pixel mean & std
+        'pixel_mean': [0.485, 0.456, 0.406],
+        'pixel_std':  [0.229, 0.224, 0.225],
+        ## Transforms
+        'detr_style': False,
+        'trans_config': [
+            {'name': 'RandomHFlip'},
+            {'name': 'RandomResize'},
+            {'name': 'RandomShift', 'max_shift': 32},
+        ],
+        'box_format': 'xyxy',
+        'normalize_coords': False,
+    },
+
+    # --------------- Dilated C5 level ---------------
+    'yolof_r50_dc5_1x':{
+        # ----------------- Model-----------------
+        ## Backbone
+        'backbone': 'resnet50',
+        'backbone_norm': 'FrozeBN',
+        'res5_dilation': True,
+        'pretrained': True,
+        'pretrained_weight': 'imagenet1k_v1',
+        'freeze_at': 1,  # freeze stem layer + layer1 of the backbone
+        'max_stride': 16,
+        'out_stride': 16,
+        ## Neck
+        'neck': 'dilated_encoder',
+        'neck_dilations': [4, 8, 12, 16],
+        'neck_expand_ratio': 0.25,
+        'neck_act': 'relu',
+        'neck_norm': 'BN',
+        ## Head
+        'head': 'yolof_head',
+        'head_dim': 512,
+        'num_cls_head': 2,
+        'num_reg_head': 4,
+        'head_act': 'relu',
+        'head_norm': 'BN',
+        'center_clamp': 32,         
+        'anchor_size': [[16, 16], [32, 32], [64, 64], [128, 128], [256, 256], [512, 512]],
+        ## Post-process
+        'train_topk': 1000,
+        'train_conf_thresh': 0.05,
+        'train_nms_thresh': 0.6,
+        'test_topk': 300,
+        'test_conf_thresh': 0.3,
+        'test_nms_thresh': 0.45,
+        'nms_class_agnostic': True,  # We prefer to use class-agnostic NMS in the demo.
+        # ----------------- Label Assignment -----------------
+        'matcher': 'yolof_matcher',
+        'matcher_hpy': {'topk_candidates': 8,
+                        'iou_thresh': 0.1,
+                        'ignore_thresh': 0.7,
+                        },
+        # ----------------- Loss weight -----------------
+        ## Loss hyper-parameters
+        'focal_loss_alpha': 0.25,
+        'focal_loss_gamma': 2.0,
+        'loss_cls_weight': 1.0,
+        'loss_reg_weight': 1.0,
+        # ----------------- Training -----------------
+        ## Training scheduler
+        'scheduler': '1x',
+        ## Optimizer
+        'optimizer': 'sgd',
+        'base_lr': 0.12 / 64,
+        'backbone_lr_ratio': 1.0 / 3.0,
+        'momentum': 0.9,
+        'weight_decay': 1e-4,
+        'clip_max_norm': 10.0,
+        'param_dict_type': 'default',
+        ## LR Scheduler
+        'lr_scheduler': 'step',
+        'warmup': 'linear',
+        'warmup_iters': 1500,
+        'warmup_factor': 0.00066667,
+        ## Epoch
+        'max_epoch': 12,      # 1x
+        'lr_epoch': [8, 11],  # 1x
+        # ----------------- Input -----------------
+        ## Transforms
+        'train_min_size': [800],   # short edge of image
+        'train_max_size': 1333,
+        'test_min_size': [800],
+        'test_max_size': 1333,
+        ## Pixel mean & std
+        'pixel_mean': [0.485, 0.456, 0.406],
+        'pixel_std':  [0.229, 0.224, 0.225],
+        ## Transforms
+        'detr_style': False,
+        'trans_config': [
+            {'name': 'RandomHFlip'},
+            {'name': 'RandomResize'},
+            {'name': 'RandomShift', 'max_shift': 32},
+        ],
+        'box_format': 'xyxy',
+        'normalize_coords': False,
+    },
+
+}
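
Note that the learning rates in these configs are written per sample ('base_lr': 0.12 / 64 for YOLOF, 0.01 / 16 for RetinaNet/FCOS). A hedged sketch of how such a value is presumably scaled by the global batch size to get the actual optimizer LR; the real logic lives in utils/optimizer.py, which is not shown in this excerpt:
```python
import torch

from config.yolof_config import yolof_cfg

cfg = yolof_cfg['yolof_r50_c5_1x']
model = torch.nn.Linear(8, 8)              # stand-in module for illustration
global_batch_size = 16                     # hypothetical total batch size
lr = cfg['base_lr'] * global_batch_size    # 0.12 / 64 * 16 = 0.03
optimizer = torch.optim.SGD(model.parameters(), lr=lr,
                            momentum=cfg['momentum'],
                            weight_decay=cfg['weight_decay'])
print(lr)
```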

+ 34 - 0
odlab/datasets/__init__.py

@@ -0,0 +1,34 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import torch.utils.data
+from torch.utils.data import DataLoader, DistributedSampler
+
+from .coco import build_coco, coco_labels, coco_indexs
+from .transforms import build_transform
+
+
+def build_dataset(args, transform=None, is_train=False):
+    if args.dataset == 'coco':
+        dataset = build_coco(args, transform, is_train)
+        dataset_info = {
+            'class_labels': dataset.coco_labels,
+            'num_classes': 80
+        }
+
+    return dataset, dataset_info
+
+def build_dataloader(args, dataset, batch_size, collate_fn, is_train=False):
+    if args.distributed:
+        sampler = DistributedSampler(dataset) if is_train else DistributedSampler(dataset, shuffle=False)
+    else:
+        sampler = torch.utils.data.RandomSampler(dataset) if is_train else torch.utils.data.SequentialSampler(dataset)
+
+    if is_train:
+        batch_sampler = torch.utils.data.BatchSampler(sampler, batch_size, drop_last=True)
+        dataloader = DataLoader(dataset, batch_sampler=batch_sampler, collate_fn=collate_fn, num_workers=args.num_workers)
+    else:
+        dataloader = DataLoader(dataset, batch_size, sampler=sampler, drop_last=False, collate_fn=collate_fn, num_workers=args.num_workers)
+    
+    return dataloader
+
+
+coco_labels = {1: 'person', 2: 'bicycle', 3: 'car', 4: 'motorcycle', 5: 'airplane', 6: 'bus', 7: 'train', 8: 'truck', 9: 'boat', 10: 'traffic light', 11: 'fire hydrant', 13: 'stop sign', 14: 'parking meter', 15: 'bench', 16: 'bird', 17: 'cat', 18: 'dog', 19: 'horse', 20: 'sheep', 21: 'cow', 22: 'elephant', 23: 'bear', 24: 'zebra', 25: 'giraffe', 27: 'backpack', 28: 'umbrella', 31: 'handbag', 32: 'tie', 33: 'suitcase', 34: 'frisbee', 35: 'skis', 36: 'snowboard', 37: 'sports ball', 38: 'kite', 39: 'baseball bat', 40: 'baseball glove', 41: 'skateboard', 42: 'surfboard', 43: 'tennis racket', 44: 'bottle', 46: 'wine glass', 47: 'cup', 48: 'fork', 49: 'knife', 50: 'spoon', 51: 'bowl', 52: 'banana', 53: 'apple', 54: 'sandwich', 55: 'orange', 56: 'broccoli', 57: 'carrot', 58: 'hot dog', 59: 'pizza', 60: 'donut', 61: 'cake', 62: 'chair', 63: 'couch', 64: 'potted plant', 65: 'bed', 67: 'dining table', 70: 'toilet', 72: 'tv', 73: 'laptop', 74: 'mouse', 75: 'remote', 76: 'keyboard', 77: 'cell phone', 78: 'microwave', 79: 'oven', 80: 'toaster', 81: 'sink', 82: 'refrigerator', 84: 'book', 85: 'clock', 86: 'vase', 87: 'scissors', 88: 'teddy bear', 89: 'hair drier', 90: 'toothbrush'}
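
A sketch of how build_transform, build_dataset, and build_dataloader above compose in a training script (run from odlab/; the data root is a placeholder and collate_fn is a trivial stand-in for whatever the repo actually ships):
```python
import argparse

from config import build_config
from datasets import build_dataset, build_dataloader, build_transform

args = argparse.Namespace(model='fcos_r18_1x', dataset='coco',
                          root='/data/datasets/COCO',   # placeholder path
                          distributed=False, num_workers=4)
cfg = build_config(args)

transform = build_transform(cfg, is_train=True)
dataset, dataset_info = build_dataset(args, transform, is_train=True)

def collate_fn(batch):
    # trivial stand-in: keep images and targets as tuples
    return tuple(zip(*batch))

train_loader = build_dataloader(args, dataset, batch_size=16,
                                collate_fn=collate_fn, is_train=True)
print(dataset_info['num_classes'], len(dataset))
```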

+ 175 - 0
odlab/datasets/coco.py

@@ -0,0 +1,175 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+COCO dataset which returns image_id for evaluation.
+
+Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py
+"""
+from pathlib import Path
+
+import torch
+import torch.utils.data
+import torchvision
+
+try:
+    from .transforms import build_transform
+except:
+    from transforms import build_transform
+
+
+# coco_labels = ('background', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'street sign', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'hat', 'backpack', 'umbrella', 'shoe', 'eye glasses', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'plate', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'mirror', 'dining table', 'window', 'desk', 'toilet', 'door', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'blender', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush')
+coco_labels = ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat',  'traffic light',  'fire hydrant',  'stop sign',  'parking meter',  'bench',  'bird',  'cat',  'dog',  'horse',  'sheep',  'cow',  'elephant',  'bear',  'zebra',  'giraffe',  'backpack',  'umbrella',  'handbag',  'tie',  'suitcase',  'frisbee',  'skis',  'snowboard',  'sports ball',  'kite',  'baseball bat',  'baseball glove',  'skateboard',  'surfboard',  'tennis racket',  'bottle',  'wine glass',  'cup',  'fork',  'knife',  'spoon',  'bowl',  'banana',  'apple',  'sandwich',  'orange',  'broccoli',  'carrot',  'hot dog',  'pizza',  'donut',  'cake',  'chair',  'couch',  'potted plant',  'bed',  'dining table',  'toilet',  'tv',  'laptop',  'mouse',  'remote',  'keyboard',  'cell phone',  'microwave',  'oven',  'toaster',  'sink',  'refrigerator',  'book',  'clock',  'vase',  'scissors',  'teddy bear',  'hair drier',  'toothbrush')
+coco_indexs = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90]
+
+
+class CocoDetection(torchvision.datasets.CocoDetection):
+    def __init__(self, img_folder, ann_file, transforms):
+        super(CocoDetection, self).__init__(img_folder, ann_file)
+        self.coco_labels = coco_labels  # 80 coco labels for detection task
+        self.coco_indexs = coco_indexs  # all original coco label index
+        self._transforms = transforms
+
+    def prepare(self, image, target):
+        w, h = image.size
+        # load an image
+        image_id = target["image_id"]
+        image_id = torch.tensor([image_id])
+
+        # load an annotation
+        anno = target["annotations"]
+        anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0]
+
+        # bbox target
+        boxes = [obj["bbox"] for obj in anno]
+        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
+        boxes[:, 2:] += boxes[:, :2]
+        boxes[:, 0::2].clamp_(min=0, max=w)
+        boxes[:, 1::2].clamp_(min=0, max=h)
+
+        # class target
+        classes = [self.coco_indexs.index(obj["category_id"]) for obj in anno]
+        classes = torch.tensor(classes, dtype=torch.int64)
+
+        # filter invalid bbox
+        keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
+        boxes = boxes[keep]
+        classes = classes[keep]
+
+        target = {}
+        target["boxes"] = boxes
+        target["labels"] = classes
+        target["image_id"] = image_id
+
+        # for conversion to coco api
+        area = torch.tensor([obj["area"] for obj in anno])
+        iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno])
+        target["area"] = area[keep]
+        target["iscrowd"] = iscrowd[keep]
+
+        target["orig_size"] = torch.as_tensor([int(h), int(w)])
+        target["size"] = torch.as_tensor([int(h), int(w)])
+
+        return image, target
+
+    def __getitem__(self, idx):
+        img, target = super(CocoDetection, self).__getitem__(idx)
+        image_id = self.ids[idx]
+        target = {'image_id': image_id, 'annotations': target}
+        img, target = self.prepare(img, target)
+        if self._transforms is not None:
+            img, target = self._transforms(img, target)
+
+        return img, target
+
+
+def build_coco(args, transform=None, is_train=False):
+    root = Path(args.root)
+    assert root.exists(), f'provided COCO path {root} does not exist'
+    PATHS = {
+        "train": (root / "train2017", root / "annotations" / 'instances_train2017.json'),
+        "val":   (root / "val2017",   root / "annotations" / 'instances_val2017.json'),
+    }
+
+    image_set = "train" if is_train else "val"
+    img_folder, ann_file = PATHS[image_set]
+
+    # build transform
+    dataset = CocoDetection(img_folder, ann_file, transform)
+
+    return dataset
+
+
+if __name__ == "__main__":
+    import argparse
+    import cv2
+    import numpy as np
+    
+    parser = argparse.ArgumentParser(description='COCO-Dataset')
+
+    # opt
+    parser.add_argument('--root', default='/Users/liuhaoran/Desktop/python_work/object-detection/dataset/COCO/',
+                        help='data root')
+    parser.add_argument('--is_train', action="store_true", default=False,
+                        help='mixup augmentation.')    
+    args = parser.parse_args()
+
+    np.random.seed(0)
+    class_colors = [(np.random.randint(255),
+                     np.random.randint(255),
+                     np.random.randint(255)) for _ in range(80)]
+
+    # config
+    cfg = {
+        # input size
+        'train_min_size': [800],
+        'train_max_size': 1333,
+        'test_min_size': 800,
+        'test_max_size': 1333,
+        'pixel_mean': [0.485, 0.456, 0.406],
+        'pixel_std':  [0.229, 0.224, 0.225],
+        # trans config
+        'detr_style': False,
+        'trans_config': [
+            {'name': 'RandomResize', 'random_sizes': [400, 500, 600, 700, 800], 'max_size': 1333},
+            {'name': 'RandomHFlip'},
+            {'name': 'RandomShift', 'max_shift': 100}
+        ],
+        'box_format': 'xywh',
+        'normalize_coords': False,
+    }
+    
+    # build dataset
+    transform = build_transform(cfg, is_train=True)
+    dataset = build_coco(args, transform, is_train=args.is_train)
+
+    for index, (image, target) in enumerate(dataset):
+        print("{} / {}".format(index, len(dataset)))
+        # to numpy
+        image = image.permute(1, 2, 0).numpy()
+        # denormalize
+        image = (image * cfg['pixel_std'] + cfg['pixel_mean']) * 255
+        image = image.astype(np.uint8)[..., (2, 1, 0)].copy()
+        orig_h, orig_w = image.shape[:2]
+
+        tgt_bboxes = target["boxes"]
+        tgt_labels = target["labels"]
+        for box, label in zip(tgt_bboxes, tgt_labels):
+            if cfg['normalize_coords']:
+                box[..., [0, 2]] *= orig_w
+                box[..., [1, 3]] *= orig_h
+            if cfg['box_format'] == 'xywh':
+                box_x1y1 = box[..., :2] - box[..., 2:] * 0.5
+                box_x2y2 = box[..., :2] + box[..., 2:] * 0.5
+                box = torch.cat([box_x1y1, box_x2y2], dim=-1)
+            # get box target
+            x1, y1, x2, y2 = box.long()
+            # get class label
+            cls_name = coco_labels[label.item()]
+            color = class_colors[label.item()]
+            # draw bbox
+            image = cv2.rectangle(image, (int(x1), int(y1)), (int(x2), int(y2)), color, 2)
+            # put the class name text on the bbox
+            cv2.putText(image, cls_name, (int(x1), int(y1 - 5)), 0, 0.5, color, 1, lineType=cv2.LINE_AA)
+
+        cv2.imshow("data", image)
+        cv2.waitKey(0)
+

BIN
odlab/datasets/demo/images/000000000632.jpg


BIN
odlab/datasets/demo/images/000000000785.jpg


BIN
odlab/datasets/demo/images/000000000872.jpg


BIN
odlab/datasets/demo/images/000000000885.jpg


BIN
odlab/datasets/demo/images/000000001000.jpg


BIN
odlab/datasets/demo/images/000000001268.jpg


BIN
odlab/datasets/demo/images/000000001296.jpg


BIN
odlab/datasets/demo/images/000000001532.jpg


BIN
odlab/datasets/demo/videos/000006.mp4


+ 359 - 0
odlab/datasets/transforms.py

@@ -0,0 +1,359 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+Transforms and data augmentation for both image + bbox.
+"""
+import PIL
+import random
+
+import torch
+import torchvision
+import torchvision.transforms as T
+import torchvision.transforms.functional as F
+
+
+# ----------------- Basic transform functions -----------------
+def box_xyxy_to_cxcywh(x):
+    x0, y0, x1, y1 = x.unbind(-1)
+    b = [(x0 + x1) / 2, (y0 + y1) / 2,
+         (x1 - x0), (y1 - y0)]
+    return torch.stack(b, dim=-1)
+
+def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corners=None):
+    return torchvision.ops.misc.interpolate(input, size, scale_factor, mode, align_corners)
+
+def crop(image, target, region):
+    cropped_image = F.crop(image, *region)
+
+    target = target.copy()
+    i, j, h, w = region
+
+    # should we do something wrt the original size?
+    target["size"] = torch.tensor([h, w])
+
+    fields = ["labels", "area", "iscrowd"]
+
+    if "boxes" in target:
+        boxes = target["boxes"]
+        max_size = torch.as_tensor([w, h], dtype=torch.float32)
+        cropped_boxes = boxes - torch.as_tensor([j, i, j, i])
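+        # clip both corners of every box to the crop window [0, w] x [0, h]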
+        cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size)
+        cropped_boxes = cropped_boxes.clamp(min=0)
+        area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1)
+        target["boxes"] = cropped_boxes.reshape(-1, 4)
+        target["area"] = area
+        fields.append("boxes")
+
+    if "masks" in target:
+        # FIXME should we update the area here if there are no boxes?
+        target['masks'] = target['masks'][:, i:i + h, j:j + w]
+        fields.append("masks")
+
+    # remove elements whose boxes or masks have zero area
+    if "boxes" in target or "masks" in target:
+        # favor boxes selection when defining which elements to keep
+        # this is compatible with previous implementation
+        if "boxes" in target:
+            cropped_boxes = target['boxes'].reshape(-1, 2, 2)
+            keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1)
+        else:
+            keep = target['masks'].flatten(1).any(1)
+
+        for field in fields:
+            target[field] = target[field][keep]
+
+    return cropped_image, target
+
+def hflip(image, target):
+    flipped_image = F.hflip(image)
+
+    w, h = image.size
+
+    target = target.copy()
+    if "boxes" in target:
+        boxes = target["boxes"]
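+        # mirror the boxes horizontally: new x1 = w - old x2, new x2 = w - old x1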
+        boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0])
+        target["boxes"] = boxes
+
+    if "masks" in target:
+        target['masks'] = target['masks'].flip(-1)
+
+    return flipped_image, target
+
+def resize(image, target, size, max_size=None):
+    # size can be min_size (scalar) or (w, h) tuple
+
+    def get_size_with_aspect_ratio(image_size, size, max_size=None):
+        w, h = image_size
+        if max_size is not None:
+            min_original_size = float(min((w, h)))
+            max_original_size = float(max((w, h)))
+            if max_original_size / min_original_size * size > max_size:
+                size = int(round(max_size * min_original_size / max_original_size))
+
+        if (w <= h and w == size) or (h <= w and h == size):
+            return (h, w)
+
+        if w < h:
+            ow = size
+            oh = int(size * h / w)
+        else:
+            oh = size
+            ow = int(size * w / h)
+
+        return (oh, ow)
+
+    def get_size(image_size, size, max_size=None):
+        if isinstance(size, (list, tuple)):
+            return size[::-1]
+        else:
+            return get_size_with_aspect_ratio(image_size, size, max_size)
+
+    size = get_size(image.size, size, max_size)
+    rescaled_image = F.resize(image, size)
+
+    if target is None:
+        return rescaled_image, None
+
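+    # PIL sizes are (width, height), so the ratios come out as (ratio_width, ratio_height)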
+    ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size))
+    ratio_width, ratio_height = ratios
+
+    target = target.copy()
+    if "boxes" in target:
+        boxes = target["boxes"]
+        scaled_boxes = boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height])
+        target["boxes"] = scaled_boxes
+
+    if "area" in target:
+        area = target["area"]
+        scaled_area = area * (ratio_width * ratio_height)
+        target["area"] = scaled_area
+
+    h, w = size
+    target["size"] = torch.tensor([h, w])
+
+    if "masks" in target:
+        target['masks'] = interpolate(
+            target['masks'][:, None].float(), size, mode="nearest")[:, 0] > 0.5
+
+    return rescaled_image, target
+
+def pad(image, target, padding):
+    # assumes that we only pad on the bottom right corners
+    padded_image = F.pad(image, (0, 0, padding[0], padding[1]))
+    if target is None:
+        return padded_image, None
+    target = target.copy()
+    # should we do something wrt the original size?
+    target["size"] = torch.tensor(padded_image.size[::-1])
+    if "masks" in target:
+        target['masks'] = torch.nn.functional.pad(target['masks'], (0, padding[0], 0, padding[1]))
+    return padded_image, target
+
+
+# ----------------- Basic transform  -----------------
+class RandomCrop(object):
+    def __init__(self, size):
+        self.size = size
+
+    def __call__(self, img, target=None):
+        region = T.RandomCrop.get_params(img, self.size)
+        return crop(img, target, region)
+
+class RandomSizeCrop(object):
+    def __init__(self, min_size: int, max_size: int):
+        self.min_size = min_size
+        self.max_size = max_size
+
+    def __call__(self, img: PIL.Image.Image, target: dict = None):
+        w = random.randint(self.min_size, min(img.width, self.max_size))
+        h = random.randint(self.min_size, min(img.height, self.max_size))
+        region = T.RandomCrop.get_params(img, [h, w])
+        return crop(img, target, region)
+
+class RandomHorizontalFlip(object):
+    def __init__(self, p=0.5):
+        self.p = p
+
+    def __call__(self, img, target=None):
+        if random.random() < self.p:
+            return hflip(img, target)
+        return img, target
+
+class RandomResize(object):
+    def __init__(self, sizes, max_size=None):
+        assert isinstance(sizes, (list, tuple))
+        self.sizes = sizes
+        self.max_size = max_size
+
+    def __call__(self, img, target=None):
+        size = random.choice(self.sizes)
+        return resize(img, target, size, self.max_size)
+
+class RandomShift(object):
+    def __init__(self, p=0.5, max_shift=32):
+        self.p = p
+        self.max_shift = max_shift
+
+    def __call__(self, image, target=None):
+        if random.random() < self.p:
+            img_h, img_w = image.height, image.width
+            shift_x = random.randint(-self.max_shift, self.max_shift)
+            shift_y = random.randint(-self.max_shift, self.max_shift)
+            shifted_image = F.affine(image, translate=[shift_x, shift_y], angle=0, scale=1.0, shear=0)
+
+            target = target.copy()
+            target["boxes"][..., [0, 2]] += shift_x
+            target["boxes"][..., [1, 3]] += shift_y
+            target["boxes"][..., [0, 2]] = target["boxes"][..., [0, 2]].clip(0, img_w)
+            target["boxes"][..., [1, 3]] = target["boxes"][..., [1, 3]].clip(0, img_h)
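+            # boxes pushed outside the image collapse to zero width/height after clipping;
+            # a 'RefineBBox' entry in the transform config can filter such boxes out afterwards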
+
+            return shifted_image, target
+
+        return image, target
+
+class RandomSelect(object):
+    """
+    Randomly selects between transforms1 and transforms2,
+    with probability p for transforms1 and (1 - p) for transforms2
+    """
+    def __init__(self, transforms1, transforms2, p=0.5):
+        self.transforms1 = transforms1
+        self.transforms2 = transforms2
+        self.p = p
+
+    def __call__(self, img, target=None):
+        if random.random() < self.p:
+            return self.transforms1(img, target)
+        return self.transforms2(img, target)
+
+class ToTensor(object):
+    def __call__(self, img, target=None):
+        return F.to_tensor(img), target
+
+class Normalize(object):
+    def __init__(self, mean, std, normalize_coords=False):
+        self.mean = mean
+        self.std = std
+        self.normalize_coords = normalize_coords
+
+    def __call__(self, image, target=None):
+        image = F.normalize(image, mean=self.mean, std=self.std)
+        if target is None:
+            return image, None
+        if self.normalize_coords:
+            target = target.copy()
+            h, w = image.shape[-2:]
+            if "boxes" in target:
+                boxes = target["boxes"]
+                boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32)
+                target["boxes"] = boxes
+        return image, target
+
+class RefineBBox(object):
+    def __init__(self, min_box_size=1):
+        self.min_box_size = min_box_size
+
+    def __call__(self, img, target):
+        boxes  = target["boxes"].clone()
+        labels = target["labels"].clone()
+
+        tgt_boxes_wh = boxes[..., 2:] - boxes[..., :2]
+        min_tgt_size = torch.min(tgt_boxes_wh, dim=-1)[0]
+
+        keep = (min_tgt_size >= self.min_box_size)
+
+        target["boxes"] = boxes[keep]
+        target["labels"] = labels[keep]
+
+        return img, target
+
+class ConvertBoxFormat(object):
+    def __init__(self, box_format="xyxy"):
+        self.box_format = box_format
+
+    def __call__(self, image, target=None):
+        # convert box format
+        if self.box_format == "xyxy" or target is None:
+            pass
+        elif self.box_format == "xywh":
+            target = target.copy()
+            if "boxes" in target:
+                boxes_xyxy = target["boxes"]
+                boxes_xywh = torch.zeros_like(boxes_xyxy)
+                boxes_xywh[..., :2] = (boxes_xyxy[..., :2] + boxes_xyxy[..., 2:]) * 0.5   # cxcy
+                boxes_xywh[..., 2:] = boxes_xyxy[..., 2:] - boxes_xyxy[..., :2]           # bwbh
+                target["boxes"] = boxes_xywh
+        else:
+            raise NotImplementedError("Unknown box format: {}".format(self.box_format))
+
+        return image, target
+
+class Compose(object):
+    def __init__(self, transforms):
+        self.transforms = transforms
+
+    def __call__(self, image, target=None):
+        for t in self.transforms:
+            image, target = t(image, target)
+        return image, target
+
+    def __repr__(self):
+        format_string = self.__class__.__name__ + "("
+        for t in self.transforms:
+            format_string += "\n"
+            format_string += "    {0}".format(t)
+        format_string += "\n)"
+        return format_string
+
+
+# build transforms
+def build_transform(cfg=None, is_train=False):
+    # ---------------- Transform for Training ----------------
+    if is_train:
+        transforms = []
+        trans_config = cfg['trans_config']
+        # build transform
+        if not cfg['detr_style']:
+            for t in trans_config:
+                if t['name'] == 'RandomHFlip':
+                    transforms.append(RandomHorizontalFlip())
+                if t['name'] == 'RandomResize':
+                    transforms.append(RandomResize(cfg['train_min_size'], max_size=cfg['train_max_size']))
+                if t['name'] == 'RandomSizeCrop':
+                    transforms.append(RandomSizeCrop(t['min_crop_size'], max_size=t['max_crop_size']))
+                if t['name'] == 'RandomShift':
+                    transforms.append(RandomShift(max_shift=t['max_shift']))
+                if t['name'] == 'RefineBBox':
+                    transforms.append(RefineBBox(min_box_size=t['min_box_size']))
+            transforms.extend([
+                ToTensor(),
+                Normalize(cfg['pixel_mean'], cfg['pixel_std'], cfg['normalize_coords']),
+                ConvertBoxFormat(cfg['box_format'])
+            ])
+        # build transform for DETR-style detector
+        else:
+            transforms = [
+                RandomHorizontalFlip(),
+                RandomSelect(
+                    RandomResize(cfg['train_min_size'], max_size=cfg['train_max_size']),
+                    Compose([
+                        RandomResize(cfg['train_min_size2']),
+                        RandomSizeCrop(*cfg['random_crop_size']),
+                        RandomResize(cfg['train_min_size'], max_size=cfg['train_max_size']),
+                    ])
+                ),
+                ToTensor(),
+                Normalize(cfg['pixel_mean'], cfg['pixel_std'], cfg['normalize_coords']),
+                ConvertBoxFormat(cfg['box_format'])
+            ]
+
+    # ---------------- Transform for Evaluating ----------------
+    else:
+        transforms = [
+            RandomResize(cfg['test_min_size'], max_size=cfg['test_max_size']),
+            ToTensor(),
+            Normalize(cfg['pixel_mean'], cfg['pixel_std'], cfg['normalize_coords']),
+            ConvertBoxFormat(cfg['box_format'])
+        ]
+    
+    return Compose(transforms)
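+
+
+# A quick, self-contained sanity check (a sketch, not part of the training pipeline):
+# it builds the non-DETR-style training transform from a minimal config and runs it
+# on a dummy PIL image and target. The config keys mirror the ones read above.
+if __name__ == "__main__":
+    from PIL import Image
+
+    cfg = {
+        'train_min_size': [800], 'train_max_size': 1333,
+        'test_min_size': 800,    'test_max_size': 1333,
+        'pixel_mean': [0.485, 0.456, 0.406],
+        'pixel_std':  [0.229, 0.224, 0.225],
+        'detr_style': False,
+        'trans_config': [{'name': 'RandomResize'}, {'name': 'RandomHFlip'}],
+        'box_format': 'xyxy',
+        'normalize_coords': False,
+    }
+    transform = build_transform(cfg, is_train=True)
+
+    dummy_image  = Image.new("RGB", (640, 480))
+    dummy_target = {"boxes":  torch.tensor([[100., 120., 300., 360.]]),
+                    "labels": torch.tensor([1])}
+    image, target = transform(dummy_image, dummy_target)
+    print(image.shape, target["boxes"])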

+ 256 - 0
odlab/demo.py

@@ -0,0 +1,256 @@
+import cv2
+import os
+import time
+import numpy as np
+import imageio
+import argparse
+from PIL import Image
+
+import torch
+
+# load transform
+from datasets import coco_labels, build_transform
+
+# load some utils
+from utils.misc import load_weight
+from utils.vis_tools import visualize
+
+from config import build_config
+from models.detectors import build_model
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='General Object Detection Demo')
+    # Basic
+    parser.add_argument('--mode', default='image',
+                        type=str, help='Use the data from image, video or camera')
+    parser.add_argument('--cuda', action='store_true', default=False,
+                        help='Use cuda')
+    parser.add_argument('--path_to_img', default='./datasets/demo/images/',
+                        type=str, help='The path to image files')
+    parser.add_argument('--path_to_vid', default='datasets/demo/videos/',
+                        type=str, help='The path to video files')
+    parser.add_argument('--path_to_save', default='det_results/demos/',
+                        type=str, help='The path to save the detection results')
+    parser.add_argument('-vt', '--visual_threshold', default=0.3, type=float,
+                        help='Final confidence threshold')
+    parser.add_argument('--show', action='store_true', default=False,
+                        help='show visualization')
+    parser.add_argument('--gif', action='store_true', default=False, 
+                        help='generate gif.')
+    # Model
+    parser.add_argument('-m', '--model', default='fcos_r18_1x', type=str,
+                        help='build detector')
+    parser.add_argument('-nc', '--num_classes', default=80, type=int,
+                        help='number of classes.')
+    parser.add_argument('--weight', default=None,
+                        type=str, help='Trained state_dict file path to open')
+    parser.add_argument('-ct', '--conf_thresh', default=0.1, type=float,
+                        help='confidence threshold')
+    parser.add_argument('-nt', '--nms_thresh', default=0.5, type=float,
+                        help='NMS threshold')
+    parser.add_argument('--topk', default=100, type=int,
+                        help='topk candidates for testing')
+    parser.add_argument("--deploy", action="store_true", default=False,
+                        help="deploy mode or not")
+    parser.add_argument('--fuse_conv_bn', action='store_true', default=False,
+                        help='fuse Conv & BN')
+
+    return parser.parse_args()
+                    
+
+def detect(args, model, device, transform, class_names, class_colors):
+    # path to save
+    save_path = os.path.join(args.path_to_save, args.mode)
+    os.makedirs(save_path, exist_ok=True)
+
+    # ------------------------- Camera ----------------------------
+    if args.mode == 'camera':
+        print('use camera !!!')
+        fourcc = cv2.VideoWriter_fourcc(*'XVID')
+        save_size = (640, 480)
+        cur_time = time.strftime('%Y-%m-%d-%H-%M-%S',time.localtime(time.time()))
+        save_video_name = os.path.join(save_path, cur_time+'.avi')
+        fps = 15.0
+        out = cv2.VideoWriter(save_video_name, fourcc, fps, save_size)
+        print(save_video_name)
+        image_list = []
+
+        cap = cv2.VideoCapture(0, cv2.CAP_DSHOW)
+        while True:
+            ret, frame = cap.read()
+            if ret:
+                if cv2.waitKey(1) == ord('q'):
+                    break
+                orig_h, orig_w, _ = frame.shape
+
+                # to PIL
+                image = Image.fromarray(cv2.cvtColor(frame,cv2.COLOR_BGR2RGB))
+
+                # prepare
+                x = transform(image)[0]
+                x = x.unsqueeze(0).to(device)
+                
+                # Inference
+                t0 = time.time()
+                bboxes, scores, labels = model(x)
+                print("Infer. time: {:.3f} s".format(time.time() - t0))
+                
+                # Rescale bboxes
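+                # (boxes are assumed to come out of the detector normalized to [0, 1])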
+                bboxes[..., 0::2] *= orig_w
+                bboxes[..., 1::2] *= orig_h
+
+                # vis detection
+                frame_vis = visualize(frame, bboxes, scores, labels, args.visual_threshold, class_colors, class_names)
+                frame_resized = cv2.resize(frame_vis, save_size)
+                out.write(frame_resized)
+
+                if args.gif:
+                    gif_resized = cv2.resize(frame, (640, 480))
+                    gif_resized_rgb = gif_resized[..., (2, 1, 0)]
+                    image_list.append(gif_resized_rgb)
+
+                if args.show:
+                    cv2.imshow('detection', frame_resized)
+                    cv2.waitKey(1)
+            else:
+                break
+        cap.release()
+        out.release()
+        cv2.destroyAllWindows()
+
+        # generate GIF
+        if args.gif:
+            save_gif_path =  os.path.join(save_path, 'gif_files')
+            os.makedirs(save_gif_path, exist_ok=True)
+            save_gif_name = os.path.join(save_gif_path, '{}.gif'.format(cur_time))
+            print('generating GIF ...')
+            imageio.mimsave(save_gif_name, image_list, fps=fps)
+            print('GIF done: {}'.format(save_gif_name))
+
+    # ------------------------- Video ---------------------------
+    elif args.mode == 'video':
+        video = cv2.VideoCapture(args.path_to_vid)
+        fourcc = cv2.VideoWriter_fourcc(*'XVID')
+        save_size = (640, 480)
+        cur_time = time.strftime('%Y-%m-%d-%H-%M-%S',time.localtime(time.time()))
+        save_video_name = os.path.join(save_path, cur_time+'.avi')
+        fps = 15.0
+        out = cv2.VideoWriter(save_video_name, fourcc, fps, save_size)
+        print(save_video_name)
+        image_list = []
+
+        while(True):
+            ret, frame = video.read()
+            
+            if ret:
+                # ------------------------- Detection ---------------------------
+                orig_h, orig_w, _ = frame.shape
+
+                # to PIL
+                image = Image.fromarray(cv2.cvtColor(frame,cv2.COLOR_BGR2RGB))
+
+                # prepare
+                x = transform(image)[0]
+                x = x.unsqueeze(0).to(device)
+                
+                # Inference
+                t0 = time.time()
+                bboxes, scores, labels = model(x)
+                print("Infer. time: {:.3f} s".format(time.time() - t0))
+                
+                # Rescale bboxes
+                bboxes[..., 0::2] *= orig_w
+                bboxes[..., 1::2] *= orig_h
+
+                # vis detection
+                frame_vis = visualize(frame, bboxes, scores, labels, args.visual_threshold, class_colors, class_names)
+                frame_resized = cv2.resize(frame_vis, save_size)
+                out.write(frame_resized)
+
+                if args.gif:
+                    gif_resized = cv2.resize(frame, (640, 480))
+                    gif_resized_rgb = gif_resized[..., (2, 1, 0)]
+                    image_list.append(gif_resized_rgb)
+
+                if args.show:
+                    cv2.imshow('detection', frame_resized)
+                    cv2.waitKey(1)
+            else:
+                break
+        video.release()
+        out.release()
+        cv2.destroyAllWindows()
+
+        # generate GIF
+        if args.gif:
+            save_gif_path =  os.path.join(save_path, 'gif_files')
+            os.makedirs(save_gif_path, exist_ok=True)
+            save_gif_name = os.path.join(save_gif_path, '{}.gif'.format(cur_time))
+            print('generating GIF ...')
+            imageio.mimsave(save_gif_name, image_list, fps=fps)
+            print('GIF done: {}'.format(save_gif_name))
+
+    # ------------------------- Image ----------------------------
+    elif args.mode == 'image':
+        for i, img_id in enumerate(os.listdir(args.path_to_img)):
+            cv2_image = cv2.imread((args.path_to_img + '/' + img_id), cv2.IMREAD_COLOR)
+            orig_h, orig_w, _ = cv2_image.shape
+
+            # to PIL
+            image = Image.fromarray(cv2.cvtColor(cv2_image,cv2.COLOR_BGR2RGB))
+
+            # prepare
+            x = transform(image)[0]
+            x = x.unsqueeze(0).to(device)
+            
+            # Inference
+            t0 = time.time()
+            bboxes, scores, labels = model(x)
+            print("Infer. time: {:.3f} s".format(time.time() - t0))
+            
+            # Rescale bboxes
+            bboxes[..., 0::2] *= orig_w
+            bboxes[..., 1::2] *= orig_h
+
+            # vis detection
+            img_processed = visualize(cv2_image, bboxes, scores, labels, args.visual_threshold, class_colors, class_names)
+            cv2.imwrite(os.path.join(save_path, str(i).zfill(6)+'.jpg'), img_processed)
+            if args.show:
+                cv2.imshow('detection', img_processed)
+                cv2.waitKey(0)
+
+
+def run():
+    args = parse_args()
+    # cuda
+    if args.cuda:
+        print('use cuda')
+        device = torch.device("cuda")
+    else:
+        device = torch.device("cpu")
+
+    # Dataset & Model Config
+    cfg = build_config(args)
+
+    # Transform
+    transform = build_transform(cfg, is_train=False)
+
+    np.random.seed(0)
+    class_colors = [(np.random.randint(255),
+                     np.random.randint(255),
+                     np.random.randint(255))
+                     for _ in range(args.num_classes)]
+
+    # Model
+    model = build_model(args, cfg, device, args.num_classes, False)
+    model = load_weight(model, args.weight, args.fuse_conv_bn)
+    model.to(device).eval()
+
+    print("================= DETECT =================")
+    # run
+    detect(args, model, device, transform, coco_labels, class_colors)
+
+
+if __name__ == '__main__':
+    run()

+ 100 - 0
odlab/engine.py

@@ -0,0 +1,100 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+Train and eval functions used in main.py
+"""
+import math
+import sys
+from typing import Iterable
+
+import torch
+from utils import distributed_utils
+from utils.misc import MetricLogger, SmoothedValue
+from utils.vis_tools import vis_data
+
+
+def train_one_epoch(cfg,
+                    model       : torch.nn.Module,
+                    criterion   : torch.nn.Module,
+                    data_loader : Iterable,
+                    optimizer   : torch.optim.Optimizer,
+                    device      : torch.device,
+                    epoch       : int,
+                    vis_target  : bool,
+                    warmup_lr_scheduler,
+                    class_labels = None,
+                    model_ema    = None,
+                    debug       :bool = False
+                    ):
+    model.train()
+    criterion.train()
+    metric_logger = MetricLogger(delimiter="  ")
+    metric_logger.add_meter('lr', SmoothedValue(window_size=1, fmt='{value:.6f}'))
+    header = 'Epoch: [{} / {}]'.format(epoch, cfg['max_epoch'])
+    epoch_size = len(data_loader)
+    print_freq = 10
+
+    iteration = 0
+    for samples, targets in metric_logger.log_every(data_loader, print_freq, header):
+        ni = iteration + epoch * epoch_size
+        # WarmUp
+        if ni < cfg['warmup_iters']:
+            warmup_lr_scheduler(ni, optimizer)
+        elif ni == cfg['warmup_iters']:
+            print('Warmup stage is over.')
+            warmup_lr_scheduler.set_lr(optimizer, cfg['base_lr'])
+
+        # To device
+        images, masks = samples
+        images = images.to(device)
+        masks  = masks.to(device)
+        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
+
+        # Visualize train targets
+        if vis_target:
+            vis_data(images, targets, masks, class_labels, cfg['normalize_coords'], cfg['box_format'])
+
+        # Inference
+        outputs = model(images, masks, targets)
+
+        # Compute loss
+        loss_dict = criterion(outputs, targets)
+        loss_weight_dict = criterion.weight_dict
+        losses = sum(loss_dict[k] * loss_weight_dict[k] for k in loss_dict.keys() if k in loss_weight_dict)
+
+        # Reduce losses over all GPUs for logging purposes
+        loss_dict_reduced = distributed_utils.reduce_dict(loss_dict)
+        loss_dict_reduced_scaled = {k: v * loss_weight_dict[k] for k, v in loss_dict_reduced.items() if k in loss_weight_dict}
+        losses_reduced_scaled = sum(loss_dict_reduced_scaled.values())
+
+        loss_value = losses_reduced_scaled.item()
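+        # Note: backprop below uses the local, unreduced `losses`; the reduced/scaled
+        # values are only used for logging and the finite-loss check.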
+
+        # Check loss
+        if not math.isfinite(loss_value):
+            print("Loss is {}, stopping training".format(loss_value))
+            print(loss_dict_reduced)
+            sys.exit(1)
+
+        # Backward
+        optimizer.zero_grad()
+        losses.backward()
+        if cfg['clip_max_norm'] > 0:
+            torch.nn.utils.clip_grad_norm_(model.parameters(), cfg['clip_max_norm'])
+        optimizer.step()
+        iteration += 1
+
+        # ema
+        if model_ema is not None:
+            model_ema.update(model)
+
+        metric_logger.update(loss=loss_value, **loss_dict_reduced_scaled)
+        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
+
+        if debug:
+            print("In debug mode, we only train for one iteration.")
+            break
+    
+    # gather the stats from all processes
+    metric_logger.synchronize_between_processes()
+    print("Averaged stats:", metric_logger)
+
+    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}

+ 10 - 0
odlab/evaluator/__init__.py

@@ -0,0 +1,10 @@
+from evaluator.coco_evaluator import COCOAPIEvaluator
+
+
+def build_evluator(args, cfg, device, testset=False):
+    evaluator = None
+    # COCO Evaluator
+    if args.dataset == 'coco':
+        evaluator = COCOAPIEvaluator(args, cfg, device, testset)
+
+    return evaluator

+ 98 - 0
odlab/evaluator/coco_evaluator.py

@@ -0,0 +1,98 @@
+import json
+import os
+import contextlib
+import torch
+from pycocotools.cocoeval import COCOeval
+
+from datasets import build_dataset, build_transform
+
+
+class COCOAPIEvaluator():
+    def __init__(self, args, cfg, device, testset=False):
+        # ----------------- Basic parameters -----------------
+        self.ddp_mode = True if args.distributed else False
+        self.image_set = 'test2017' if testset else 'val2017'
+        self.device = device
+        self.testset = testset
+        # ----------------- Metrics -----------------
+        self.map = 0.
+        self.ap50_95 = 0.
+        self.ap50 = 0.
+        # ----------------- Dataset -----------------
+        self.transform = build_transform(cfg, is_train=False)
+        self.dataset, self.dataset_info = build_dataset(args, self.transform, is_train=False)
+
+
+    @torch.no_grad()
+    def evaluate(self, model):
+        ids = []
+        coco_results = []
+        model.eval()
+        model.trainable = False
+
+        # start testing
+        for index, (image, target) in enumerate(self.dataset):
+            if index % 500 == 0:
+                print('[Eval: %d / %d]'%(index, len(self.dataset)))
+            # image id
+            id_ = int(target['image_id'])
+            ids.append(id_)
+            
+            # inference
+            image = image.unsqueeze(0).to(self.device)
+            outputs = model(image)
+            bboxes, scores, cls_inds = outputs
+
+            # rescale bbox
+            orig_h, orig_w = target["orig_size"].tolist()
+            bboxes[..., 0::2] *= orig_w
+            bboxes[..., 1::2] *= orig_h
+            
+            # reformat results
+            for i, box in enumerate(bboxes):
+                x1 = float(box[0])
+                y1 = float(box[1])
+                x2 = float(box[2])
+                y2 = float(box[3])
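+                # `coco_indexs` (built by the dataset) is expected to map the contiguous
+                # class index back to the original COCO category id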
+                label = self.dataset.coco_indexs[int(cls_inds[i])]
+                
+                # COCO json format
+                bbox = [x1, y1, x2 - x1, y2 - y1]
+                score = float(scores[i])
+                A = {"image_id": id_,
+                     "category_id": label,
+                     "bbox": bbox,
+                     "score": score}
+                coco_results.append(A)
+
+        model.train()
+        model.trainable = True
+        annType = ['segm', 'bbox', 'keypoints']
+        # Evaluate the Dt (detection) json comparing with the ground truth
+        if len(coco_results) > 0:
+            print('evaluating ......')
+            cocoGt = self.dataset.coco
+            if self.testset:
+                json.dump(coco_results, open('coco_test-dev.json', 'w'))
+                cocoDt = cocoGt.loadRes('coco_test-dev.json')
+            else:
+                # suppress pycocotools prints
+                with open(os.devnull, 'w') as devnull:
+                    with contextlib.redirect_stdout(devnull):
+                        cocoDt = cocoGt.loadRes(coco_results)
+                        cocoEval = COCOeval(self.dataset.coco, cocoDt, annType[1])
+                        cocoEval.params.imgIds = ids
+                cocoEval.evaluate()
+                cocoEval.accumulate()
+                cocoEval.summarize()
+                # update mAP
+                ap50_95, ap50 = cocoEval.stats[0], cocoEval.stats[1]
+                print('ap50_95 : ', ap50_95)
+                print('ap50 : ', ap50)
+                self.map = ap50_95
+                self.ap50_95 = ap50_95
+                self.ap50 = ap50
+            del coco_results
+        else:
+            print('No coco detection results !')
+

+ 242 - 0
odlab/main.py

@@ -0,0 +1,242 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import os
+import random
+import argparse
+import numpy as np
+from copy import deepcopy
+
+import torch
+import torch.distributed as dist
+from torch.nn.parallel import DistributedDataParallel as DDP
+
+from utils import distributed_utils
+from utils.misc import compute_flops, collate_fn
+from utils.misc import get_param_dict, ModelEMA
+from utils.optimizer import build_optimizer
+from utils.lr_scheduler import build_wp_lr_scheduler, build_lr_scheduler
+
+from config import build_config
+from evaluator import build_evluator
+from datasets import build_dataset, build_dataloader, build_transform
+
+from models.detectors import build_model
+from engine import train_one_epoch
+
+
+def parse_args():
+    parser = argparse.ArgumentParser('General 2D Object Detection', add_help=False)
+    # Random seed
+    parser.add_argument('--seed', default=42, type=int)
+    # GPU
+    parser.add_argument('--cuda', action='store_true', default=False,
+                        help='use cuda.')
+    # Batch size
+    parser.add_argument('-bs', '--batch_size', default=16, type=int, 
+                        help='total batch size on all GPUs.')
+    # Model
+    parser.add_argument('-m', '--model', default='yolof_r18_c5_1x',
+                        help='build object detector')
+    parser.add_argument('-p', '--pretrained', default=None, type=str,
+                        help='load pretrained weight')
+    parser.add_argument('-r', '--resume', default=None, type=str,
+                        help='keep training')
+    parser.add_argument('--ema', default=None, type=str,
+                        help='use Model EMA trick.')
+    # Dataset
+    parser.add_argument('--root', default='/Users/liuhaoran/Desktop/python_work/object-detection/dataset/COCO/',
+                        help='data root')
+    parser.add_argument('-d', '--dataset', default='coco',
+                        help='coco, voc, widerface, crowdhuman')
+    parser.add_argument('--vis_tgt', action="store_true", default=False,
+                        help="visualize input data.")
+    # Dataloader
+    parser.add_argument('--num_workers', default=2, type=int, 
+                        help='Number of workers used in dataloading')
+    # Epoch
+    parser.add_argument('--eval_epoch', default=2, type=int,
+                        help='interval between evaluations')
+    parser.add_argument('--save_folder', default='weights/', type=str, 
+                        help='path to save weight')
+    parser.add_argument('--eval_first', action="store_true", default=False,
+                        help="evaluate the model once before training starts.")
+    # DDP train
+    parser.add_argument('-dist', '--distributed', action='store_true', default=False,
+                        help='distributed training')
+    parser.add_argument('--dist_url', default='env://', 
+                        help='url used to set up distributed training')
+    parser.add_argument('--world_size', default=1, type=int,
+                        help='number of distributed processes')
+    parser.add_argument('--sybn', action='store_true', default=False, 
+                        help='use sybn.')
+    parser.add_argument('--find_unused_parameters', action='store_true', default=False, 
+                        help='set find_unused_parameters as True.')
+    # Debug setting
+    parser.add_argument('--debug', action='store_true', default=False, 
+                        help='debug codes.')
+
+    return parser.parse_args()
+
+
+def fix_random_seed(args):
+    seed = args.seed + distributed_utils.get_rank()
+    torch.manual_seed(seed)
+    np.random.seed(seed)
+    random.seed(seed)
+
+
+def main():
+    args = parse_args()
+    print("Setting Arguments.. : ", args)
+    print("----------------------------------------------------------")
+
+    # path to save model
+    path_to_save = os.path.join(args.save_folder, args.dataset, args.model)
+    os.makedirs(path_to_save, exist_ok=True)
+
+
+    # ---------------------------- Build DDP ----------------------------
+    distributed_utils.init_distributed_mode(args)
+    print("git:\n  {}\n".format(distributed_utils.get_sha()))
+    world_size = distributed_utils.get_world_size()
+    print('World size: {}'.format(world_size))
+    per_gpu_batch = args.batch_size // world_size
+
+
+    # ---------------------------- Build CUDA ----------------------------
+    if args.cuda and torch.cuda.is_available():
+        print('use cuda')
+        device = torch.device("cuda")
+    else:
+        device = torch.device("cpu")
+
+
+    # ---------------------------- Fix random seed ----------------------------
+    fix_random_seed(args)
+
+
+    # ---------------------------- Build config ----------------------------
+    cfg = build_config(args)
+    print('Model config: ', cfg)
+
+
+    # ---------------------------- Build Dataset ----------------------------
+    transforms = build_transform(cfg, is_train=True)
+    dataset, dataset_info = build_dataset(args, transforms, is_train=True)
+
+
+    # ---------------------------- Build Dataloader ----------------------------
+    train_loader = build_dataloader(args, dataset, per_gpu_batch, collate_fn, is_train=True)
+
+
+    # ---------------------------- Build model ----------------------------
+    ## Build model
+    model, criterion = build_model(args, cfg, dataset_info['num_classes'], is_val=True)
+    model.to(device)
+    model_without_ddp = model
+    ## Calculate Params & GFLOPs
+    if distributed_utils.is_main_process():
+        model_copy = deepcopy(model_without_ddp)
+        model_copy.trainable = False
+        model_copy.eval()
+        compute_flops(model=model_copy,
+                      min_size=cfg['test_min_size'],
+                      max_size=cfg['test_max_size'],
+                      device=device)
+        del model_copy
+    if args.distributed:
+        dist.barrier()
+
+
+    # ---------------------------- Build Optimizer ----------------------------
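+    # Linear LR scaling: cfg['base_lr'] is assumed to be a per-image learning rate,
+    # so the effective rate below grows with the total batch size.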
+    cfg['base_lr'] = cfg['base_lr'] * args.batch_size
+    param_dicts = None
+    if 'param_dict_type' in cfg.keys() and cfg['param_dict_type'] != 'default':
+        print("- Param dict type: {}".format(cfg['param_dict_type']))
+        param_dicts = get_param_dict(model_without_ddp, cfg)
+    optimizer, start_epoch = build_optimizer(cfg, model_without_ddp, param_dicts, args.resume)
+
+
+    # ---------------------------- Build LR Scheduler ----------------------------
+    wp_lr_scheduler = build_wp_lr_scheduler(cfg, cfg['base_lr'])
+    lr_scheduler    = build_lr_scheduler(cfg, optimizer, args.resume)
+
+
+    # ---------------------------- Build Model EMA ----------------------------
+    model_ema = None
+    if 'use_ema' in cfg.keys() and cfg['use_ema']:
+        print("Build Model EMA for {}".format(args.model))
+        model_ema = ModelEMA(cfg, model, start_epoch * len(train_loader))
+
+
+    # ---------------------------- Build DDP model ----------------------------
+    if args.distributed:
+        model = DDP(model, device_ids=[args.gpu], find_unused_parameters=args.find_unused_parameters)
+        model_without_ddp = model.module
+
+
+    # ---------------------------- Build Evaluator ----------------------------
+    evaluator = build_evluator(args, cfg, device)
+
+
+    # ----------------------- Eval before training -----------------------
+    if args.eval_first and distributed_utils.is_main_process():
+        evaluator.evaluate(model_without_ddp)
+        return
+
+
+    # ----------------------- Training -----------------------
+    print("Start training")
+    best_map = -1.
+    for epoch in range(start_epoch, cfg['max_epoch']):
+        if args.distributed:
+            train_loader.batch_sampler.sampler.set_epoch(epoch)
+
+        # Train one epoch
+        train_one_epoch(cfg,
+                        model,
+                        criterion,
+                        train_loader,
+                        optimizer,
+                        device,
+                        epoch,
+                        args.vis_tgt,
+                        wp_lr_scheduler,
+                        dataset_info['class_labels'],
+                        model_ema=model_ema,
+                        debug=args.debug)
+        
+        # LR Scheduler
+        lr_scheduler.step()
+
+        # Evaluate
+        if distributed_utils.is_main_process():
+            model_eval = model_ema.ema if model_ema is not None else model_without_ddp
+            if (epoch % args.eval_epoch) == 0 or (epoch == cfg['max_epoch'] - 1):
+                if evaluator is None:
+                    cur_map = 0.
+                else:
+                    evaluator.evaluate(model_eval)
+                    cur_map = evaluator.map
+                # Save model
+                if cur_map > best_map:
+                    # update best-map
+                    best_map = cur_map
+                    # save model
+                    print('Saving state, epoch:', epoch + 1)
+                    torch.save({'model':        model_eval.state_dict(),
+                                'optimizer':    optimizer.state_dict(),
+                                'lr_scheduler': lr_scheduler.state_dict(),
+                                'mAP':          round(cur_map*100, 1),
+                                'epoch':        epoch,
+                                'args':         args}, 
+                                os.path.join(path_to_save, '{}_best.pth'.format(args.model)))
+        if args.distributed:
+            dist.barrier()
+
+        if args.debug:
+            print("In debug mode, we only train for one epoch.")
+            exit(0)
+
+
+if __name__ == '__main__':
+    main()

+ 15 - 0
odlab/models/backbone/__init__.py

@@ -0,0 +1,15 @@
+from .resnet           import build_resnet
+from .swin_transformer import build_swin_transformer
+
+
+def build_backbone(cfg):
+    print('==============================')
+    print('Backbone: {}'.format(cfg['backbone']))
+    # ResNet
+    if cfg['backbone'] in ['resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152']:
+        return build_resnet(cfg)
+    # SwinTransformer
+    elif cfg['backbone'] in ['swin_T_224_1k', 'swin_S_224_22k', 'swin_B_224_22k', 'swin_B_384_22k', 'swin_L_224_22k', 'swin_L_384_22k']:
+        return build_swin_transformer(cfg)
+    # Unknown backbone
+    else:
+        raise NotImplementedError("Unknown backbone: {}".format(cfg['backbone']))

+ 254 - 0
odlab/models/backbone/resnet.py

@@ -0,0 +1,254 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+Backbone modules.
+"""
+
+import torch
+import torchvision
+from torch import nn
+from torchvision.models._utils import IntermediateLayerGetter
+from torchvision.models.resnet import (ResNet18_Weights,
+                                       ResNet34_Weights,
+                                       ResNet50_Weights,
+                                       ResNet101_Weights)
+
+model_urls = {
+    # IN1K-Cls pretrained weights
+    'resnet18':  ResNet18_Weights,
+    'resnet34':  ResNet34_Weights,
+    'resnet50':  ResNet50_Weights,
+    'resnet101': ResNet101_Weights,
+}
+spark_model_urls = {
+    # SparK's IN1K-MAE pretrained weights
+    'spark_resnet18': None,
+    'spark_resnet34': None,
+    'spark_resnet50': "https://github.com/yjh0410/RT-ODLab/releases/download/backbone_weight/resnet50_in1k_spark_pretrained_timm_style.pth",
+    'spark_resnet101': None,
+}
+
+
+# Frozen BatchNormalization
+class FrozenBatchNorm2d(torch.nn.Module):
+    """
+    BatchNorm2d where the batch statistics and the affine parameters are fixed.
+    Copy-paste from torchvision.misc.ops with added eps before rsqrt,
+    without which any other models than torchvision.models.resnet[18,34,50,101]
+    produce nans.
+    """
+
+    def __init__(self, n):
+        super(FrozenBatchNorm2d, self).__init__()
+        self.register_buffer("weight", torch.ones(n))
+        self.register_buffer("bias", torch.zeros(n))
+        self.register_buffer("running_mean", torch.zeros(n))
+        self.register_buffer("running_var", torch.ones(n))
+
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+        num_batches_tracked_key = prefix + 'num_batches_tracked'
+        if num_batches_tracked_key in state_dict:
+            del state_dict[num_batches_tracked_key]
+
+        super(FrozenBatchNorm2d, self)._load_from_state_dict(
+            state_dict, prefix, local_metadata, strict,
+            missing_keys, unexpected_keys, error_msgs)
+
+    def forward(self, x):
+        # move reshapes to the beginning
+        # to make it fuser-friendly
+        w = self.weight.reshape(1, -1, 1, 1)
+        b = self.bias.reshape(1, -1, 1, 1)
+        rv = self.running_var.reshape(1, -1, 1, 1)
+        rm = self.running_mean.reshape(1, -1, 1, 1)
+        eps = 1e-5
+        scale = w * (rv + eps).rsqrt()
+        bias = b - rm * scale
+        return x * scale + bias
+
+
+# -------------------- ResNet series --------------------
+class ResNet(nn.Module):
+    """Standard ResNet backbone."""
+    def __init__(self,
+                 name               :str  = "resnet50",
+                 res5_dilation      :bool = False,
+                 norm_type          :str  = "BN",
+                 freeze_at          :int  = 0,
+                 pretrained_weights :str  = "imagenet1k_v1"):
+        super().__init__()
+        # Pretrained
+        assert pretrained_weights in [None, "imagenet1k_v1", "imagenet1k_v2"]
+        if pretrained_weights is not None:
+            if name in ('resnet18', 'resnet34'):
+                pretrained_weights = model_urls[name].IMAGENET1K_V1
+            else:
+                if pretrained_weights == "imagenet1k_v1":
+                    pretrained_weights = model_urls[name].IMAGENET1K_V1
+                else:
+                    pretrained_weights = model_urls[name].IMAGENET1K_V2
+        else:
+            pretrained_weights = None
+        print('- Backbone pretrained weight: ', pretrained_weights)
+
+        # Norm layer
+        print("- Norm layer of backbone: {}".format(norm_type))
+        if norm_type == 'BN':
+            norm_layer = nn.BatchNorm2d
+        elif norm_type == 'FrozeBN':
+            norm_layer = FrozenBatchNorm2d
+        else:
+            raise NotImplementedError("Unknown norm type: {}".format(norm_type))
+
+        # Backbone
+        backbone = getattr(torchvision.models, name)(
+            replace_stride_with_dilation=[False, False, res5_dilation],
+            norm_layer=norm_layer, weights=pretrained_weights)
+        return_layers = {"layer2": "0", "layer3": "1", "layer4": "2"}
+        self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
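+        # channel dims of the C3/C4/C5 feature maps returned by layer2/3/4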
+        self.feat_dims = [128, 256, 512] if name in ('resnet18', 'resnet34') else [512, 1024, 2048]
+ 
+        # Freeze
+        print("- Freeze at {}".format(freeze_at))
+        if freeze_at >= 0:
+            for name, parameter in backbone.named_parameters():
+                if freeze_at == 0: # Only freeze stem layer
+                    if 'layer1' not in name and 'layer2' not in name and 'layer3' not in name and 'layer4' not in name:
+                        parameter.requires_grad_(False)
+                elif freeze_at == 1: # Freeze stem layer + layer1
+                    if 'layer2' not in name and 'layer3' not in name and 'layer4' not in name:
+                        parameter.requires_grad_(False)
+                elif freeze_at == 2: # Freeze stem layer + layer1 + layer2
+                    if 'layer3' not in name and 'layer4' not in name:
+                        parameter.requires_grad_(False)
+                elif freeze_at == 3: # Freeze stem layer + layer1 + layer2 + layer3
+                    if 'layer4' not in name:
+                        parameter.requires_grad_(False)
+                else: # Freeze all resnet's layers
+                    parameter.requires_grad_(False)
+
+    def forward(self, x):
+        xs = self.body(x)
+        fmp_list = []
+        for name, fmp in xs.items():
+            fmp_list.append(fmp)
+
+        return fmp_list
+
+class SparkResNet(nn.Module):
+    """ResNet backbone with SparK pretrained."""
+    def __init__(self,
+                 name          :str  = "resnet50",
+                 res5_dilation :bool = False,
+                 norm_type     :str  = "BN",
+                 freeze_at     :int  = 0,
+                 pretrained    :bool = False):
+        super().__init__()
+        # Norm layer
+        print("- Norm layer of backbone: {}".format(norm_type))
+        if norm_type == 'BN':
+            norm_layer = nn.BatchNorm2d
+        elif norm_type == 'FrozeBN':
+            norm_layer = FrozenBatchNorm2d
+        else:
+            raise NotImplementedError("Unknown norm type: {}".format(norm_type))
+
+        # Backbone
+        backbone = getattr(torchvision.models, name)(
+            replace_stride_with_dilation=[False, False, res5_dilation], norm_layer=norm_layer)
+        return_layers = {"layer2": "0", "layer3": "1", "layer4": "2"}
+        self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
+        self.feat_dims = [128, 256, 512] if name in ('resnet18', 'resnet34') else [512, 1024, 2048]
+
+        # Load pretrained
+        if pretrained:
+            self.load_pretrained(name)
+
+        # Freeze
+        print("- Freeze at {}".format(freeze_at))
+        if freeze_at >= 0:
+            for name, parameter in backbone.named_parameters():
+                if freeze_at == 0: # Only freeze stem layer
+                    if 'layer1' not in name and 'layer2' not in name and 'layer3' not in name and 'layer4' not in name:
+                        parameter.requires_grad_(False)
+                elif freeze_at == 1: # Freeze stem layer + layer1
+                    if 'layer2' not in name and 'layer3' not in name and 'layer4' not in name:
+                        parameter.requires_grad_(False)
+                elif freeze_at == 2: # Freeze stem layer + layer1 + layer2
+                    if 'layer3' not in name and 'layer4' not in name:
+                        parameter.requires_grad_(False)
+                elif freeze_at == 3: # Freeze stem layer + layer1 + layer2 + layer3
+                    if 'layer4' not in name:
+                        parameter.requires_grad_(False)
+                else: # Freeze all resnet's layers
+                    parameter.requires_grad_(False)
+
+    def load_pretrained(self, name):
+        url = spark_model_urls["spark_" + name]
+        if url is not None:
+            print('Loading backbone pretrained weight from : {}'.format(url))
+            # checkpoint state dict
+            checkpoint_state_dict = torch.hub.load_state_dict_from_url(
+                url=url, map_location="cpu", check_hash=True)
+            # model state dict
+            model_state_dict = self.body.state_dict()
+            # check
+            for k in list(checkpoint_state_dict.keys()):
+                if k in model_state_dict:
+                    shape_model = tuple(model_state_dict[k].shape)
+                    shape_checkpoint = tuple(checkpoint_state_dict[k].shape)
+                    if shape_model != shape_checkpoint:
+                        checkpoint_state_dict.pop(k)
+                else:
+                    checkpoint_state_dict.pop(k)
+                    print('Unused key: ', k)
+            # load the weight
+            self.body.load_state_dict(checkpoint_state_dict)
+        else:
+            print('No backbone pretrained for {}.'.format(name))
+
+    def forward(self, x):
+        xs = self.body(x)
+        fmp_list = []
+        for name, fmp in xs.items():
+            fmp_list.append(fmp)
+
+        return fmp_list
+
+
+# build backbone
+def build_resnet(cfg):
+    # ResNet series
+    if cfg['pretrained_weight'] in spark_model_urls.keys():
+        backbone = SparkResNet(
+            name           = cfg['backbone'],
+            res5_dilation  = cfg['res5_dilation'],
+            norm_type      = cfg['backbone_norm'],
+            pretrained     = cfg['pretrained'],
+            freeze_at      = cfg['freeze_at'])
+    else:
+        backbone = ResNet(
+            name               = cfg['backbone'],
+            res5_dilation      = cfg['res5_dilation'],
+            norm_type          = cfg['backbone_norm'],
+            pretrained_weights = cfg['pretrained_weight'],
+            freeze_at          = cfg['freeze_at'])
+
+    return backbone, backbone.feat_dims
+
+
+if __name__ == '__main__':
+    cfg = {
+        'backbone':      'resnet50',
+        'backbone_norm': 'FrozeBN',
+        'pretrained_weight': 'imagenet1k_v1',
+        'res5_dilation': False,
+        'freeze_at': 0,
+    }
+    model, feat_dim = build_resnet(cfg)
+    print(feat_dim)
+
+    x = torch.randn(2, 3, 320, 320)
+    output = model(x)
+    for y in output:
+        print(y.size())

+ 95 - 0
odlab/models/basic/attn.py

@@ -0,0 +1,95 @@
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+# ----------------- BoxRPM Cross Attention Ops -----------------
+class GlobalCrossAttention(nn.Module):
+    def __init__(
+        self,
+        dim            :int   = 256,
+        num_heads      :int   = 8,
+        qkv_bias       :bool  = True,
+        qk_scale       :float = None,
+        attn_drop      :float = 0.0,
+        proj_drop      :float = 0.0,
+        rpe_hidden_dim :int   = 512,
+        feature_stride :int   = 16,
+    ):
+        super().__init__()
+        # --------- Basic parameters ---------
+        self.dim = dim
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = qk_scale or head_dim ** -0.5
+        self.feature_stride = feature_stride
+
+        # --------- Network parameters ---------
+        self.cpb_mlp1 = self.build_cpb_mlp(2, rpe_hidden_dim, num_heads)
+        self.cpb_mlp2 = self.build_cpb_mlp(2, rpe_hidden_dim, num_heads)
+        self.q = nn.Linear(dim, dim, bias=qkv_bias)
+        self.k = nn.Linear(dim, dim, bias=qkv_bias)
+        self.v = nn.Linear(dim, dim, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+        self.softmax = nn.Softmax(dim=-1)
+
+    def build_cpb_mlp(self, in_dim, hidden_dim, out_dim):
+        cpb_mlp = nn.Sequential(nn.Linear(in_dim, hidden_dim, bias=True),
+                                nn.ReLU(inplace=True),
+                                nn.Linear(hidden_dim, out_dim, bias=False))
+        return cpb_mlp
+
+    def forward(self,
+                query,
+                reference_points,
+                k_input_flatten,
+                v_input_flatten,
+                input_spatial_shapes,
+                input_padding_mask=None,
+                ):
+        assert input_spatial_shapes.size(0) == 1, 'This is designed for single-scale decoder.'
+        h, w = input_spatial_shapes[0]
+        stride = self.feature_stride
+
+        ref_pts = torch.cat([
+            reference_points[:, :, :, :2] - reference_points[:, :, :, 2:] / 2,
+            reference_points[:, :, :, :2] + reference_points[:, :, :, 2:] / 2,
+        ], dim=-1)  # B, nQ, 1, 4
+
+        pos_x = torch.linspace(0.5, w - 0.5, w, dtype=torch.float32, device=w.device)[None, None, :, None] * stride  # 1, 1, w, 1
+        pos_y = torch.linspace(0.5, h - 0.5, h, dtype=torch.float32, device=h.device)[None, None, :, None] * stride  # 1, 1, h, 1
+
+        delta_x = ref_pts[..., 0::2] - pos_x  # B, nQ, w, 2
+        delta_y = ref_pts[..., 1::2] - pos_y  # B, nQ, h, 2
+
+        rpe_x, rpe_y = self.cpb_mlp1(delta_x), self.cpb_mlp2(delta_y)  # B, nQ, w/h, nheads
+        rpe = (rpe_x[:, :, None] + rpe_y[:, :, :, None]).flatten(2, 3) # B, nQ, h, w, nheads ->  B, nQ, h*w, nheads
+        rpe = rpe.permute(0, 3, 1, 2)
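+        # rpe is a box-conditioned relative position bias with one value per head,
+        # broadcast over the key positions and added to the attention logits below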
+
+        B_, N, C = k_input_flatten.shape
+        k = self.k(k_input_flatten).reshape(B_, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
+        v = self.v(v_input_flatten).reshape(B_, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
+        B_, N, C = query.shape
+        q = self.q(query).reshape(B_, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
+        q = q * self.scale
+
+        attn = q @ k.transpose(-2, -1)
+        attn += rpe
+        if input_padding_mask is not None:
+            attn += input_padding_mask[:, None, None] * -100
+
+        fmin, fmax = torch.finfo(attn.dtype).min, torch.finfo(attn.dtype).max
+        torch.clip_(attn, min=fmin, max=fmax)
+
+        attn = self.softmax(attn)
+        attn = self.attn_drop(attn)
+        x = attn @ v
+
+        x = x.transpose(1, 2).reshape(B_, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+
+        return x

+ 297 - 0
odlab/models/basic/conv.py

@@ -0,0 +1,297 @@
+import math
+from typing import List
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .norm import LayerNorm2D
+
+
+def get_conv2d(c1, c2, k, p, s, d, g):
+    conv = nn.Conv2d(c1, c2, k, stride=s, padding=p, dilation=d, groups=g)
+
+    return conv
+
+def get_activation(act_type=None):
+    if act_type is None:
+        return nn.Identity()
+    elif act_type == 'relu':
+        return nn.ReLU(inplace=True)
+    elif act_type == 'lrelu':
+        return nn.LeakyReLU(0.1, inplace=True)
+    elif act_type == 'mish':
+        return nn.Mish(inplace=True)
+    elif act_type == 'silu':
+        return nn.SiLU(inplace=True)
+    elif act_type == 'gelu':
+        return nn.GELU()
+    else:
+        raise NotImplementedError(act_type)
+
+def get_norm(norm_type, dim):
+    if norm_type == 'BN':
+        return nn.BatchNorm2d(dim)
+    elif norm_type == 'GN':
+        return nn.GroupNorm(num_groups=32, num_channels=dim)
+    elif norm_type is None:
+        return nn.Identity()
+    else:
+        raise NotImplementedError(norm_type)
+
+
+# ----------------- CNN ops -----------------
+class ConvModule(nn.Module):
+    def __init__(self,
+                 c1,
+                 c2,
+                 k=1,
+                 p=0,
+                 s=1,
+                 d=1,
+                 act_type='relu',
+                 norm_type='BN', 
+                 depthwise=False):
+        super(ConvModule, self).__init__()
+        convs = []
+        if depthwise:
+            # depthwise conv
+            convs.append(get_conv2d(c1, c1, k=k, p=p, s=s, d=d, g=c1))
+            if norm_type:
+                convs.append(get_norm(norm_type, c1))
+            if act_type:
+                convs.append(get_activation(act_type))
+            # pointwise conv
+            convs.append(get_conv2d(c1, c2, k=1, p=0, s=1, d=d, g=1))
+            if norm_type:
+                convs.append(get_norm(norm_type, c2))
+            if act_type:
+                convs.append(get_activation(act_type))
+
+        else:
+            convs.append(get_conv2d(c1, c2, k=k, p=p, s=s, d=d, g=1))
+            if norm_type:
+                convs.append(get_norm(norm_type, c2))
+            if act_type:
+                convs.append(get_activation(act_type))
+            
+        self.convs = nn.Sequential(*convs)
+
+
+    def forward(self, x):
+        return self.convs(x)
+
+class BasicConv(nn.Module):
+    def __init__(self, 
+                 in_dim,                   # in channels
+                 out_dim,                  # out channels 
+                 kernel_size=1,            # kernel size 
+                 padding=0,                # padding
+                 stride=1,                 # stride
+                 dilation=1,               # dilation
+                 act_type  :str = 'lrelu', # activation
+                 norm_type :str = 'BN',    # normalization
+                 depthwise :bool = False
+                ):
+        super(BasicConv, self).__init__()
+        self.depthwise = depthwise
+        if not depthwise:
+            self.conv = get_conv2d(in_dim, out_dim, k=kernel_size, p=padding, s=stride, d=dilation, g=1)
+            self.norm = get_norm(norm_type, out_dim)
+        else:
+            self.conv1 = get_conv2d(in_dim, in_dim, k=kernel_size, p=padding, s=stride, d=dilation, g=in_dim)
+            self.norm1 = get_norm(norm_type, in_dim)
+            self.conv2 = get_conv2d(in_dim, out_dim, k=kernel_size, p=padding, s=stride, d=dilation, g=1)
+            self.norm2 = get_norm(norm_type, out_dim)
+        self.act  = get_activation(act_type)
+
+    def forward(self, x):
+        if not self.depthwise:
+            return self.act(self.norm(self.conv(x)))
+        else:
+            # Depthwise conv (apply the activation here as well, mirroring the
+            # standard-conv branch above; it was otherwise created but never used)
+            x = self.act(self.norm1(self.conv1(x)))
+            # Pointwise conv
+            x = self.act(self.norm2(self.conv2(x)))
+            return x
+
+class UpSampleWrapper(nn.Module):
+    """Upsample last feat map to specific stride."""
+    def __init__(self, in_dim, upsample_factor):
+        super(UpSampleWrapper, self).__init__()
+        # ---------- Basic parameters ----------
+        self.upsample_factor = upsample_factor
+
+        # ---------- Network parameters ----------
+        if upsample_factor == 1:
+            self.upsample = nn.Identity()
+            self.out_dim = in_dim
+        else:
+            scale = int(math.log2(upsample_factor))
+            dim = in_dim
+            layers = []
+            for _ in range(scale-1):
+                layers += [
+                    nn.ConvTranspose2d(dim, dim, kernel_size=2, stride=2),
+                    LayerNorm2D(dim),
+                    nn.GELU()
+                ]
+            layers += [nn.ConvTranspose2d(dim, dim, kernel_size=2, stride=2)]
+            self.upsample = nn.Sequential(*layers)
+            self.out_dim = dim
+
+    def forward(self, x):
+        x = self.upsample(x)
+
+        return x
+
+
+# ----------------- RepCNN module -----------------
+class RepVggBlock(nn.Module):
+    def __init__(self, in_dim, out_dim, act_type='relu', norm_type='BN'):
+        super().__init__()
+        # ----------------- Basic parameters -----------------
+        self.in_dim = in_dim
+        self.out_dim = out_dim
+        # ----------------- Network parameters -----------------
+        self.conv1 = BasicConv(in_dim, out_dim, kernel_size=3, padding=1, act_type=None, norm_type=norm_type)
+        self.conv2 = BasicConv(in_dim, out_dim, kernel_size=1, padding=0, act_type=None, norm_type=norm_type)
+        self.act   = get_activation(act_type) 
+
+    def forward(self, x):
+        if hasattr(self, 'conv'):
+            y = self.conv(x)
+        else:
+            y = self.conv1(x) + self.conv2(x)
+
+        return self.act(y)
+
+    def convert_to_deploy(self):
+        if not hasattr(self, 'conv'):
+            self.conv = nn.Conv2d(self.in_dim, self.out_dim, 3, 1, padding=1)
+
+        kernel, bias = self.get_equivalent_kernel_bias()
+        self.conv.weight.data = kernel
+        self.conv.bias.data = bias 
+        # self.__delattr__('conv1')
+        # self.__delattr__('conv2')
+
+    def get_equivalent_kernel_bias(self):
+        kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1)
+        kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2)
+        
+        return kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1), bias3x3 + bias1x1
+
+    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
+        if kernel1x1 is None:
+            return 0
+        else:
+            return F.pad(kernel1x1, [1, 1, 1, 1])
+
+    def _fuse_bn_tensor(self, branch: BasicConv):
+        if branch is None:
+            return 0, 0
+        kernel = branch.conv.weight
+        running_mean = branch.norm.running_mean
+        running_var = branch.norm.running_var
+        gamma = branch.norm.weight
+        beta = branch.norm.bias
+        eps = branch.norm.eps
+        std = (running_var + eps).sqrt()
+        t = (gamma / std).reshape(-1, 1, 1, 1)
+        return kernel * t, beta - running_mean * gamma / std
+
+class RepCSPLayer(nn.Module):
+    def __init__(self,
+                 in_dim     :int   = 256,
+                 out_dim    :int   = 256,
+                 num_blocks :int   = 3,
+                 expansion  :float = 1.0,
+                 act_type   :str   = "relu",
+                 norm_type  :str   = "GN",):
+        super(RepCSPLayer, self).__init__()
+        # ----------------- Basic parameters -----------------
+        inter_dim = int(out_dim * expansion)
+        # ----------------- Network parameters -----------------
+        self.conv1 = BasicConv(in_dim, inter_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
+        self.conv2 = BasicConv(in_dim, inter_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
+        self.bottlenecks = nn.Sequential(*[
+            RepVggBlock(inter_dim, inter_dim, act_type, norm_type) for _ in range(num_blocks)
+        ])
+        if inter_dim != out_dim:
+            self.conv3 = BasicConv(inter_dim, out_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
+        else:
+            self.conv3 = nn.Identity()
+
+    def forward(self, x):
+        x_1 = self.conv1(x)
+        x_1 = self.bottlenecks(x_1)
+        x_2 = self.conv2(x)
+
+        return self.conv3(x_1 + x_2)
+
+
+# ----------------- CNN module -----------------
+class YoloBottleneck(nn.Module):
+    def __init__(self,
+                 in_dim       :int,
+                 out_dim      :int,
+                 kernel_size  :List  = [1, 3],
+                 expand_ratio :float = 0.5,
+                 shortcut     :bool  = False,
+                 act_type     :str   = 'silu',
+                 norm_type    :str   = 'BN',
+                 depthwise    :bool  = False,
+                 ) -> None:
+        super(YoloBottleneck, self).__init__()
+        inter_dim = int(out_dim * expand_ratio)
+        # ----------------- Network setting -----------------
+        self.conv_layer1 = BasicConv(in_dim, inter_dim,
+                                     kernel_size=kernel_size[0], padding=kernel_size[0]//2, stride=1,
+                                     act_type=act_type, norm_type=norm_type)
+        self.conv_layer2 = BasicConv(inter_dim, out_dim,
+                                     kernel_size=kernel_size[1], padding=kernel_size[1]//2, stride=1,
+                                     act_type=act_type, norm_type=norm_type, depthwise=depthwise)
+        self.shortcut = shortcut and in_dim == out_dim
+
+    def forward(self, x):
+        h = self.conv_layer2(self.conv_layer1(x))
+
+        return x + h if self.shortcut else h
+
+class ELANLayer(nn.Module):
+    def __init__(self,
+                 in_dim,
+                 out_dim,
+                 expand_ratio :float = 0.5,
+                 num_blocks   :int   = 1,
+                 shortcut     :bool  = False,
+                 act_type     :str   = 'silu',
+                 norm_type    :str   = 'BN',
+                 depthwise    :bool  = False,
+                 ) -> None:
+        super(ELANLayer, self).__init__()
+        self.inter_dim = round(out_dim * expand_ratio)
+        self.input_proj  = BasicConv(in_dim, self.inter_dim * 2, kernel_size=1, act_type=act_type, norm_type=norm_type)
+        self.output_proj = BasicConv((2 + num_blocks) * self.inter_dim, out_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
+        self.module = nn.ModuleList([YoloBottleneck(self.inter_dim,
+                                                    self.inter_dim,
+                                                    kernel_size  = [3, 3],
+                                                    expand_ratio = 1.0,
+                                                    shortcut     = shortcut,
+                                                    act_type     = act_type,
+                                                    norm_type    = norm_type,
+                                                    depthwise    = depthwise)
+                                                    for _ in range(num_blocks)])
+
+    def forward(self, x):
+        # Input proj
+        x1, x2 = torch.chunk(self.input_proj(x), 2, dim=1)
+        out = [x1, x2]
+
+        # Bottleneck
+        out.extend(m(out[-1]) for m in self.module)
+
+        # Output proj
+        out = self.output_proj(torch.cat(out, dim=1))
+
+        return out
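A quick way to convince yourself that `RepVggBlock.convert_to_deploy()` is behavior-preserving is to compare the two-branch and fused outputs. The sketch below assumes the module is importable as `odlab.models.basic.conv` (adjust the import to your setup). Note that `get_equivalent_kernel_bias()` folds only the BatchNorm statistics, so the conv biases are zeroed here to match that assumption, and the block is put in `eval()` mode so BN uses its running statistics.

```python
import torch
from odlab.models.basic.conv import RepVggBlock  # import path assumed

block = RepVggBlock(in_dim=64, out_dim=64, act_type='relu', norm_type='BN').eval()
with torch.no_grad():
    # The fusion ignores the conv biases, so zero them for an exact comparison.
    block.conv1.conv.bias.zero_()
    block.conv2.conv.bias.zero_()

    x = torch.randn(2, 64, 32, 32)
    y_two_branch = block(x)        # conv1(x) + conv2(x), then ReLU
    block.convert_to_deploy()      # fuse 3x3+BN and 1x1+BN into one 3x3 conv
    y_fused = block(x)             # fused conv, then ReLU

print(torch.allclose(y_two_branch, y_fused, atol=1e-5))  # expected: True
```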

+ 54 - 0
odlab/models/basic/mlp.py

@@ -0,0 +1,54 @@
+import torch.nn as nn
+
+
+def get_activation(act_type=None):
+    if act_type == 'relu':
+        return nn.ReLU(inplace=True)
+    elif act_type == 'lrelu':
+        return nn.LeakyReLU(0.1, inplace=True)
+    elif act_type == 'gelu':
+        return nn.GELU()
+    elif act_type == 'mish':
+        return nn.Mish(inplace=True)
+    elif act_type == 'silu':
+        return nn.SiLU(inplace=True)
+    elif act_type is None:
+        return nn.Identity()
+    else:
+        raise NotImplementedError(act_type)
+
+class MLP(nn.Module):
+    def __init__(self, in_dim, hidden_dim, out_dim, num_layers):
+        super().__init__()
+        self.num_layers = num_layers
+        h = [hidden_dim] * (num_layers - 1)
+        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([in_dim] + h, h + [out_dim]))
+
+    def forward(self, x):
+        for i, layer in enumerate(self.layers):
+            x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
+        return x
+
+class FFN(nn.Module):
+    def __init__(self, d_model=256, ffn_dim=1024, dropout=0., act_type='relu', pre_norm=False):
+        super().__init__()
+        # ----------- Basic parameters -----------
+        self.pre_norm = pre_norm
+        self.ffn_dim = ffn_dim
+        # ----------- Network parameters -----------
+        self.linear1 = nn.Linear(d_model, self.ffn_dim)
+        self.activation = get_activation(act_type)
+        self.dropout2 = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(self.ffn_dim, d_model)
+        self.dropout3 = nn.Dropout(dropout)
+        self.norm = nn.LayerNorm(d_model)
+
+    def forward(self, src):
+        if self.pre_norm:
+            src = self.norm(src)
+            src2 = self.linear2(self.dropout2(self.activation(self.linear1(src))))
+            src = src + self.dropout3(src2)
+        else:
+            src2 = self.linear2(self.dropout2(self.activation(self.linear1(src))))
+            src = src + self.dropout3(src2)
+            src = self.norm(src)
+        
+        return src
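A small usage sketch for the two modules above (the import path is an assumption; adjust it to your layout). `MLP` is what `detr.py` later uses as its 3-layer box head, while `FFN` keeps the token dimension unchanged and only expands it internally.

```python
import torch
from odlab.models.basic.mlp import MLP, FFN  # import path assumed

# 3-layer box head: hidden_dim -> hidden_dim -> 4 box parameters per query.
bbox_embed = MLP(in_dim=256, hidden_dim=256, out_dim=4, num_layers=3)
queries = torch.randn(2, 300, 256)           # [B, num_queries, C]
print(bbox_embed(queries).shape)             # torch.Size([2, 300, 4])

# FFN: d_model -> ffn_dim -> d_model, with residual and LayerNorm.
ffn = FFN(d_model=256, ffn_dim=1024, dropout=0.1, act_type='relu', pre_norm=False)
print(ffn(queries).shape)                    # torch.Size([2, 300, 256])
```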

+ 55 - 0
odlab/models/basic/norm.py

@@ -0,0 +1,55 @@
+import torch
+import torch.nn as nn
+
+
+class FrozenBatchNorm2d(torch.nn.Module):
+    """
+    BatchNorm2d where the batch statistics and the affine parameters are fixed.
+    Copy-paste from torchvision.misc.ops with added eps before rsqrt,
+    without which any other models than torchvision.models.resnet[18,34,50,101]
+    produce nans.
+    """
+
+    def __init__(self, n):
+        super(FrozenBatchNorm2d, self).__init__()
+        self.register_buffer("weight", torch.ones(n))
+        self.register_buffer("bias", torch.zeros(n))
+        self.register_buffer("running_mean", torch.zeros(n))
+        self.register_buffer("running_var", torch.ones(n))
+
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+        num_batches_tracked_key = prefix + 'num_batches_tracked'
+        if num_batches_tracked_key in state_dict:
+            del state_dict[num_batches_tracked_key]
+
+        super(FrozenBatchNorm2d, self)._load_from_state_dict(
+            state_dict, prefix, local_metadata, strict,
+            missing_keys, unexpected_keys, error_msgs)
+
+    def forward(self, x):
+        # move reshapes to the beginning
+        # to make it fuser-friendly
+        w = self.weight.reshape(1, -1, 1, 1)
+        b = self.bias.reshape(1, -1, 1, 1)
+        rv = self.running_var.reshape(1, -1, 1, 1)
+        rm = self.running_mean.reshape(1, -1, 1, 1)
+        eps = 1e-5
+        scale = w * (rv + eps).rsqrt()
+        bias = b - rm * scale
+        return x * scale + bias
+
+class LayerNorm2D(nn.Module):
+    def __init__(self, normalized_shape, norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.ln = norm_layer(normalized_shape) if norm_layer is not None else nn.Identity()
+
+    def forward(self, x):
+        """
+        x: N C H W
+        """
+        x = x.permute(0, 2, 3, 1)
+        x = self.ln(x)
+        x = x.permute(0, 3, 1, 2)
+        return x
+    
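`FrozenBatchNorm2d` is typically swapped in for the `nn.BatchNorm2d` layers of a pretrained backbone so that the statistics stay fixed during detection training. The helper below is a sketch of that common pattern (it is not necessarily how this repository's `resnet.py` does it); the import path and the torchvision model are assumptions used only for illustration.

```python
import torch
import torch.nn as nn
import torchvision
from odlab.models.basic.norm import FrozenBatchNorm2d  # import path assumed

def freeze_bn(module: nn.Module) -> nn.Module:
    """Recursively replace every nn.BatchNorm2d with FrozenBatchNorm2d,
    copying its affine parameters and running statistics."""
    if isinstance(module, nn.BatchNorm2d):
        frozen = FrozenBatchNorm2d(module.num_features)
        frozen.weight.copy_(module.weight.detach())
        frozen.bias.copy_(module.bias.detach())
        frozen.running_mean.copy_(module.running_mean)
        frozen.running_var.copy_(module.running_var)
        return frozen
    for name, child in list(module.named_children()):
        setattr(module, name, freeze_bn(child))
    return module

with torch.no_grad():
    resnet = torchvision.models.resnet18()   # random weights, just to show the swap
    resnet = freeze_bn(resnet)
    print(resnet(torch.randn(1, 3, 224, 224)).shape)  # torch.Size([1, 1000])
```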

+ 246 - 0
odlab/models/basic/transformer.py

@@ -0,0 +1,246 @@
+import math
+import copy
+import warnings
+from typing import List
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ..basic.mlp import FFN, MLP
+from ..basic.conv import LayerNorm2D, BasicConv
+
+
+# ----------------- Basic Ops -----------------
+def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
+    """Copy from timm"""
+    with torch.no_grad():
+        """Copy from timm"""
+        def norm_cdf(x):
+            return (1. + math.erf(x / math.sqrt(2.))) / 2.
+
+        if (mean < a - 2 * std) or (mean > b + 2 * std):
+            warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
+                        "The distribution of values may be incorrect.",
+                        stacklevel=2)
+
+        l = norm_cdf((a - mean) / std)
+        u = norm_cdf((b - mean) / std)
+
+        tensor.uniform_(2 * l - 1, 2 * u - 1)
+        tensor.erfinv_()
+
+        tensor.mul_(std * math.sqrt(2.))
+        tensor.add_(mean)
+
+        tensor.clamp_(min=a, max=b)
+
+        return tensor
+    
+def get_clones(module, N):
+    if N <= 0:
+        return None
+    else:
+        return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
+
+def inverse_sigmoid(x, eps=1e-5):
+    x = x.clamp(min=0., max=1.)
+    return torch.log(x.clamp(min=eps) / (1 - x).clamp(min=eps))
+
+def build_transformer(cfg, num_classes=80, return_intermediate=False):
+    if cfg['transformer'] == 'plain_detr_transformer':
+        return PlainDETRTransformer(d_model             = cfg['hidden_dim'],
+                                    num_heads           = cfg['de_num_heads'],
+                                    ffn_dim             = cfg['de_ffn_dim'],
+                                    dropout             = cfg['de_dropout'],
+                                    act_type            = cfg['de_act'],
+                                    pre_norm            = cfg['de_pre_norm'],
+                                    rpe_hidden_dim      = cfg['rpe_hidden_dim'],
+                                    feature_stride      = cfg['out_stride'],
+                                    num_layers          = cfg['de_num_layers'],
+                                    return_intermediate = return_intermediate,
+                                    use_checkpoint      = cfg['use_checkpoint'],
+                                    num_queries_one2one = cfg['num_queries_one2one'],
+                                    num_queries_one2many    = cfg['num_queries_one2many'],
+                                    proposal_feature_levels = cfg['proposal_feature_levels'],
+                                    proposal_in_stride      = cfg['out_stride'],
+                                    proposal_tgt_strides    = cfg['proposal_tgt_strides'],
+                                    )
+    elif cfg['transformer'] == 'rtdetr_transformer':
+        return RTDETRTransformer(in_dims             = cfg['backbone_feat_dims'],
+                                 hidden_dim          = cfg['hidden_dim'],
+                                 strides             = cfg['out_stride'],
+                                 num_classes         = num_classes,
+                                 num_queries         = cfg['num_queries'],
+                                 num_heads           = cfg['de_num_heads'],
+                                 num_layers          = cfg['de_num_layers'],
+                                 num_levels          = 3,
+                                 num_points          = cfg['de_num_points'],
+                                 ffn_dim             = cfg['de_ffn_dim'],
+                                 dropout             = cfg['de_dropout'],
+                                 act_type            = cfg['de_act'],
+                                 pre_norm            = cfg['de_pre_norm'],
+                                 return_intermediate = return_intermediate,
+                                 num_denoising       = cfg['dn_num_denoising'],
+                                 label_noise_ratio   = cfg['dn_label_noise_ratio'],
+                                 box_noise_scale     = cfg['dn_box_noise_scale'],
+                                 learnt_init_query   = cfg['learnt_init_query'],
+                                 )
+
+
+# ----------------- Transformer Encoder -----------------
+class TransformerEncoderLayer(nn.Module):
+    def __init__(self,
+                 d_model   :int   = 256,
+                 num_heads :int   = 8,
+                 ffn_dim   :int   = 1024,
+                 dropout   :float = 0.1,
+                 act_type  :str   = "relu",
+                 pre_norm  :bool = False,
+                 ):
+        super().__init__()
+        # ----------- Basic parameters -----------
+        self.d_model = d_model
+        self.num_heads = num_heads
+        self.ffn_dim = ffn_dim
+        self.dropout = dropout
+        self.act_type = act_type
+        self.pre_norm = pre_norm
+        # ----------- Network parameters -----------
+        # Multi-head Self-Attn
+        self.self_attn = nn.MultiheadAttention(d_model, num_heads, dropout=dropout, batch_first=True)
+        self.dropout = nn.Dropout(dropout)
+        self.norm = nn.LayerNorm(d_model)
+
+        # Feedforward Network
+        self.ffn = FFN(d_model, ffn_dim, dropout, act_type)
+
+    def with_pos_embed(self, tensor, pos):
+        return tensor if pos is None else tensor + pos
+
+    def forward_pre_norm(self, src, pos_embed):
+        """
+        Input:
+            src:       [torch.Tensor] -> [B, N, C]
+            pos_embed: [torch.Tensor] -> [B, N, C]
+        Output:
+            src:       [torch.Tensor] -> [B, N, C]
+        """
+        src = self.norm(src)
+        q = k = self.with_pos_embed(src, pos_embed)
+
+        # -------------- MHSA --------------
+        src2 = self.self_attn(q, k, value=src)[0]
+        src = src + self.dropout(src2)
+
+        # -------------- FFN --------------
+        src = self.ffn(src)
+        
+        return src
+
+    def forward_post_norm(self, src, pos_embed):
+        """
+        Input:
+            src:       [torch.Tensor] -> [B, N, C]
+            pos_embed: [torch.Tensor] -> [B, N, C]
+        Output:
+            src:       [torch.Tensor] -> [B, N, C]
+        """
+        q = k = self.with_pos_embed(src, pos_embed)
+
+        # -------------- MHSA --------------
+        src2 = self.self_attn(q, k, value=src)[0]
+        src = src + self.dropout(src2)
+        src = self.norm(src)
+
+        # -------------- FFN --------------
+        src = self.ffn(src)
+        
+        return src
+
+    def forward(self, src, pos_embed):
+        if self.pre_norm:
+            return self.forward_pre_norm(src, pos_embed)
+        else:
+            return self.forward_post_norm(src, pos_embed)
+
+class TransformerEncoder(nn.Module):
+    def __init__(self,
+                 d_model        :int   = 256,
+                 num_heads      :int   = 8,
+                 num_layers     :int   = 1,
+                 ffn_dim        :int   = 1024,
+                 pe_temperature :float = 10000.,
+                 dropout        :float = 0.1,
+                 act_type       :str   = "relu",
+                 pre_norm       :bool  = False,
+                 ):
+        super().__init__()
+        # ----------- Basic parameters -----------
+        self.d_model = d_model
+        self.num_heads = num_heads
+        self.num_layers = num_layers
+        self.ffn_dim = ffn_dim
+        self.dropout = dropout
+        self.act_type = act_type
+        self.pre_norm = pre_norm
+        self.pe_temperature = pe_temperature
+        self.pos_embed = None
+        # ----------- Network parameters -----------
+        self.encoder_layers = get_clones(
+            TransformerEncoderLayer(d_model, num_heads, ffn_dim, dropout, act_type, pre_norm), num_layers)
+
+    def build_2d_sincos_position_embedding(self, device, w, h, embed_dim=256, temperature=10000.):
+        assert embed_dim % 4 == 0, \
+            'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
+        
+        # ----------- Check cached pos_embed -----------
+        # The cached embedding has shape [1, N, C] with N = H x W,
+        # so compare the number of positions rather than shape[2:].
+        if self.pos_embed is not None and \
+            self.pos_embed.shape[1] == int(h) * int(w):
+            return self.pos_embed
+        
+        # ----------- Generate grid coords -----------
+        grid_w = torch.arange(int(w), dtype=torch.float32)
+        grid_h = torch.arange(int(h), dtype=torch.float32)
+        grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing='ij')  # shape: [W, H]
+
+        pos_dim = embed_dim // 4
+        omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
+        omega = 1. / (temperature**omega)
+
+        out_w = grid_w.flatten()[..., None] @ omega[None] # shape: [N, C]
+        out_h = grid_h.flatten()[..., None] @ omega[None] # shape: [N, C]
+
+        # shape: [1, N, C]
+        pos_embed = torch.cat([torch.sin(out_w), torch.cos(out_w), torch.sin(out_h), torch.cos(out_h)], dim=1)[None, :, :]
+        pos_embed = pos_embed.to(device)
+        self.pos_embed = pos_embed
+
+        return pos_embed
+
+    def forward(self, src):
+        """
+        Input:
+            src:  [torch.Tensor] -> [B, C, H, W]
+        Output:
+            src:  [torch.Tensor] -> [B, C, H, W]
+        """
+        # -------- Transformer encoder --------
+        channels, fmp_h, fmp_w = src.shape[1:]
+        # [B, C, H, W] -> [B, N, C], N=HxW
+        src_flatten = src.flatten(2).permute(0, 2, 1).contiguous()
+        memory = src_flatten
+
+        # PosEmbed: [1, N, C]
+        pos_embed = self.build_2d_sincos_position_embedding(
+            src.device, fmp_w, fmp_h, channels, self.pe_temperature)
+        
+        # Transformer Encoder layer
+        for encoder in self.encoder_layers:
+            memory = encoder(memory, pos_embed=pos_embed)
+
+        # Output: [B, N, C] -> [B, C, N] -> [B, C, H, W]
+        src = memory.permute(0, 2, 1).contiguous()
+        src = src.view([-1, channels, fmp_h, fmp_w])
+
+        return src
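A minimal usage sketch for `TransformerEncoder` (import path assumed): it flattens a `[B, C, H, W]` map into tokens, adds the cached 2D sin-cos positional embedding, runs the encoder layers, and reshapes back, so the output shape matches the input.

```python
import torch
from odlab.models.basic.transformer import TransformerEncoder  # import path assumed

encoder = TransformerEncoder(d_model=256, num_heads=8, num_layers=1, ffn_dim=1024)
feat = torch.randn(2, 256, 20, 20)   # e.g. a C5 feature map projected to 256 channels
out = encoder(feat)
print(out.shape)                     # torch.Size([2, 256, 20, 20])
```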

+ 40 - 0
odlab/models/detectors/__init__.py

@@ -0,0 +1,40 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import torch
+
+from .retinanet.build import build_retinanet
+from .fcos.build      import build_fcos
+from .yolof.build     import build_yolof
+from .detr.build      import build_detr
+
+
+def build_model(args, cfg, num_classes=80, is_val=False):
+    # ------------ build object detector ------------
+    ## RetinaNet    
+    if 'retinanet' in args.model:
+        model, criterion = build_retinanet(cfg, num_classes, is_val)
+    ## FCOS    
+    elif 'fcos' in args.model:
+        model, criterion = build_fcos(cfg, num_classes, is_val)
+    ## YOLOF    
+    elif 'yolof' in args.model:
+        model, criterion = build_yolof(cfg, num_classes, is_val)
+    ## DETR    
+    elif 'detr' in args.model:
+        model, criterion = build_detr(cfg, num_classes, is_val)
+    else:
+        raise NotImplementedError("Unknown detector: {}".args.model)
+    
+    if is_val:
+        # ------------ Keep training from the given weight ------------
+        if args.resume is not None:
+            print('keep training: ', args.resume)
+            checkpoint = torch.load(args.resume, map_location='cpu')
+            # checkpoint state dict
+            checkpoint_state_dict = checkpoint.pop("model")
+            model.load_state_dict(checkpoint_state_dict)
+
+        return model, criterion
+
+    else:      
+        return model
+    

+ 57 - 0
odlab/models/detectors/detr/README.md

@@ -0,0 +1,57 @@
+# PlainDETR
+
+Our `PlainDETR-R50-1x` baseline on COCO-val:
+```Shell
+```
+
+## Results on COCO
+
+| Model           |  Scale     |  Pretrained  |  FPS  | AP<sup>val<br>0.5:0.95 | AP<sup>val<br>0.5 | Weight | Logs  |
+| --------------- | ---------- | ------------ | ----- | ---------------------- |  ---------------  | ------ | ----- |
+| PlainDETR-R50   |  800,1333  |   IN1K-Cls   |       |                        |                   |  |  |
+| PlainDETR-R50   |  800,1333  |   IN1K-MIM   |       |                        |                   |  |  |
+
+- We explore whether PlainDETR remains strong when a plain ResNet is used as the backbone.
+- We set up two comparative experiments, using as the PlainDETR backbone a ResNet-50 pretrained on IN1K classification and a ResNet-50 pretrained on IN1K with MIM. For the latter, we use the MIM-pretrained ResNet-50 provided by [SparK](https://github.com/keyu-tian/SparK).
+
+
+## Train PlainDETR
+### Single GPU
+Taking training **PlainDETR** on COCO as the example,
+```Shell
+python main.py --cuda -d coco --root path/to/coco -m plain_detr_r50 --batch_size 16 --eval_epoch 2
+```
+
+### Multi GPU
+Taking training **PlainDETR** on COCO as the example,
+```Shell
+python -m torch.distributed.run --nproc_per_node=8 main.py --cuda -dist -d coco --root path/to/coco -m plain_detr_r50 --batch_size 16 --eval_epoch 2
+```
+
+## Test PlainDETR
+Taking testing **PlainDETR** on COCO-val as the example,
+```Shell
+python test.py --cuda -d coco --root path/to/coco -m plain_detr_r50 --weight path/to/plain_detr_r50.pth -vt 0.4 --show 
+```
+
+## Evaluate PlainDETR
+Taking evaluating **PlainDETR** on COCO-val as the example,
+```Shell
+python main.py --cuda -d coco --root path/to/coco -m plain_detr_r50 --resume path/to/plain_detr_r50.pth --eval_first
+```
+
+## Demo
+### Detect with Image
+```Shell
+python demo.py --mode image --path_to_img path/to/image_dirs/ --cuda -m plain_detr_r50 --weight path/to/weight -vt 0.4 --show
+```
+
+### Detect with Video
+```Shell
+python demo.py --mode video --path_to_vid path/to/video --cuda -m plain_detr_r50 --weight path/to/weight -vt 0.4 --show --gif
+```
+
+### Detect with Camera
+```Shell
+python demo.py --mode camera --cuda -m plain_detr_r50 --weight path/to/weight -vt 0.4 --show --gif
+```

+ 25 - 0
odlab/models/detectors/detr/build.py

@@ -0,0 +1,25 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+
+from .criterion import build_criterion
+from .detr import DETR
+
+
+# build object detector
+def build_detr(cfg, num_classes=80, is_val=False):
+    # -------------- Build PlainDETR --------------
+    model = DETR(cfg         = cfg,
+                 num_classes = num_classes,
+                 conf_thresh = cfg['train_conf_thresh'] if is_val else cfg['test_conf_thresh'],
+                 nms_thresh  = cfg['train_nms_thresh']  if is_val else cfg['test_nms_thresh'],
+                 topk        = cfg['train_topk']        if is_val else cfg['test_topk'],
+                 use_nms     = False,
+                 )
+            
+    # -------------- Build criterion --------------
+    criterion = None
+    if is_val:
+        # build criterion for training
+        criterion = build_criterion(cfg, num_classes, aux_loss=True)
+        
+    return model, criterion

+ 212 - 0
odlab/models/detectors/detr/criterion.py

@@ -0,0 +1,212 @@
+import copy
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .matcher import HungarianMatcher
+
+from utils.misc import sigmoid_focal_loss
+from utils.box_ops import box_cxcywh_to_xyxy, generalized_box_iou, bbox2delta
+from utils.distributed_utils import is_dist_avail_and_initialized, get_world_size
+
+
+# build criterion
+def build_criterion(cfg, num_classes, aux_loss=True):
+    criterion = Criterion(cfg, num_classes, aux_loss)
+
+    return criterion
+    
+    
+class Criterion(nn.Module):
+    def __init__(self, cfg, num_classes=80, aux_loss=False):
+        super().__init__()
+        # ------------ Basic parameters ------------
+        self.cfg = cfg
+        self.num_classes = num_classes
+        self.k_one2many = cfg['k_one2many']
+        self.lambda_one2many = cfg['lambda_one2many']
+        self.aux_loss = aux_loss
+        self.losses = ['labels', 'boxes']
+        # ------------- Focal loss -------------
+        self.alpha = 0.25
+        self.gamma = 2.0
+        # ------------ Matcher ------------
+        self.matcher = HungarianMatcher(cost_class = cfg['matcher_hpy']['cost_class'],
+                                        cost_bbox  = cfg['matcher_hpy']['cost_bbox'],
+                                        cost_giou  = cfg['matcher_hpy']['cost_giou']
+                                        )
+        # ------------- Loss weight -------------
+        weight_dict = {'loss_cls':  cfg['loss_coeff']['class'],
+                       'loss_box':  cfg['loss_coeff']['bbox'],
+                       'loss_giou': cfg['loss_coeff']['giou']}
+        if aux_loss:
+            aux_weight_dict = {}
+            for i in range(cfg['de_num_layers'] - 1):
+                aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()})
+            aux_weight_dict.update({k + "_enc": v for k, v in weight_dict.items()})
+            weight_dict.update(aux_weight_dict)
+        new_dict = dict()
+        for key, value in weight_dict.items():
+            new_dict[key] = value
+            new_dict[key + "_one2many"] = value
+        self.weight_dict = new_dict
+
+
+    def _get_src_permutation_idx(self, indices):
+        # permute predictions following indices
+        batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
+        src_idx = torch.cat([src for (src, _) in indices])
+        return batch_idx, src_idx
+
+    def _get_tgt_permutation_idx(self, indices):
+        # permute targets following indices
+        batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
+        tgt_idx = torch.cat([tgt for (_, tgt) in indices])
+        return batch_idx, tgt_idx
+
+    def loss_labels(self, outputs, targets, indices, num_boxes):
+        """Classification loss (NLL)
+        targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes]
+        """
+        assert 'pred_logits' in outputs
+        src_logits = outputs['pred_logits']
+        # prepare class targets
+        idx = self._get_src_permutation_idx(indices)
+        target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]).to(src_logits.device)
+        target_classes = torch.full(src_logits.shape[:2],
+                                    self.num_classes,
+                                    dtype=torch.int64,
+                                    device=src_logits.device)
+        target_classes[idx] = target_classes_o
+
+        # to one-hot labels
+        target_classes_onehot = torch.zeros([*src_logits.shape[:2], self.num_classes + 1],
+                                            dtype=src_logits.dtype,
+                                            layout=src_logits.layout,
+                                            device=src_logits.device)
+        target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1)
+        target_classes_onehot = target_classes_onehot[..., :-1]
+
+        # focal loss
+        loss_cls = sigmoid_focal_loss(src_logits, target_classes_onehot, self.alpha, self.gamma)
+
+        losses = {}
+        losses['loss_cls'] = loss_cls.sum() / num_boxes
+
+        return losses
+
+    def loss_boxes(self, outputs, targets, indices, num_boxes):
+        """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss
+           targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
+           The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
+        """
+        assert 'pred_boxes' in outputs
+        # prepare bbox targets
+        idx = self._get_src_permutation_idx(indices)
+        src_boxes = outputs['pred_boxes'][idx]
+        target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0).to(src_boxes.device)
+        
+        # compute L1 loss
+        src_deltas = outputs["pred_deltas"][idx]
+        src_boxes_old = outputs["pred_boxes_old"][idx]
+        target_deltas = bbox2delta(src_boxes_old, target_boxes)
+        loss_bbox = F.l1_loss(src_deltas, target_deltas, reduction="none")
+
+        # compute GIoU loss
+        bbox_giou = generalized_box_iou(box_cxcywh_to_xyxy(src_boxes),
+                                        box_cxcywh_to_xyxy(target_boxes))
+        loss_giou = 1 - torch.diag(bbox_giou)
+        
+        losses = {}
+        losses['loss_box'] = loss_bbox.sum() / num_boxes
+        losses['loss_giou'] = loss_giou.sum() / num_boxes
+
+        return losses
+
+    def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs):
+        loss_map = {
+            'labels': self.loss_labels,
+            'boxes': self.loss_boxes,
+        }
+        assert loss in loss_map, f'do you really want to compute {loss} loss?'
+        return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs)
+
+    def compute_loss(self, outputs, targets):
+        """ This performs the loss computation.
+        Parameters:
+             outputs: dict of tensors, see the output specification of the model for the format
+             targets: list of dicts, such that len(targets) == batch_size.
+                      The expected keys in each dict depends on the losses applied, see each loss' doc
+        """
+        outputs_without_aux = {
+            k: v
+            for k, v in outputs.items()
+            if k != "aux_outputs" and k != "enc_outputs"
+        }
+
+        # Retrieve the matching between the outputs of the last layer and the targets
+        indices = self.matcher(outputs_without_aux, targets)
+
+        # Compute the average number of target boxes across all nodes, for normalization purposes
+        num_boxes = sum(len(t["labels"]) for t in targets)
+        num_boxes = torch.as_tensor(
+            [num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device
+        )
+        if is_dist_avail_and_initialized():
+            torch.distributed.all_reduce(num_boxes)
+        num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item()
+
+        # Compute all the requested losses
+        losses = {}
+        for loss in self.losses:
+            kwargs = {}
+            l_dict = self.get_loss(loss, outputs, targets, indices, num_boxes, **kwargs)
+            losses.update(l_dict)
+
+        # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
+        if "aux_outputs" in outputs:
+            for i, aux_outputs in enumerate(outputs["aux_outputs"]):
+                indices = self.matcher(aux_outputs, targets)
+                for loss in self.losses:
+                    kwargs = {}
+                    l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_boxes, **kwargs)
+                    l_dict = {k + f"_{i}": v for k, v in l_dict.items()}
+                    losses.update(l_dict)
+
+        if "enc_outputs" in outputs:
+            enc_outputs = outputs["enc_outputs"]
+            bin_targets = copy.deepcopy(targets)
+            for bt in bin_targets:
+                bt["labels"] = torch.zeros_like(bt["labels"])
+            indices = self.matcher(enc_outputs, bin_targets)
+            for loss in self.losses:
+                kwargs = {}
+                l_dict = self.get_loss(loss, enc_outputs, bin_targets, indices, num_boxes, **kwargs)
+                l_dict = {k + "_enc": v for k, v in l_dict.items()}
+                losses.update(l_dict)
+
+        return losses
+
+    def forward(self, outputs, targets):
+        # --------------------- One-to-one losses ---------------------
+        outputs_one2one = {k: v for k, v in outputs.items() if "one2many" not in k}
+        loss_dict = self.compute_loss(outputs_one2one, targets)
+
+        # --------------------- One-to-many losses ---------------------
+        outputs_one2many = {k[:-9]: v for k, v in outputs.items() if "one2many" in k}
+        if len(outputs_one2many) > 0:
+            # Copy targets
+            multi_targets = copy.deepcopy(targets)
+            for target in multi_targets:
+                target["boxes"] = target["boxes"].repeat(self.k_one2many, 1)
+                target["labels"] = target["labels"].repeat(self.k_one2many)
+            # Compute one-to-many losses
+            one2many_loss_dict = self.compute_loss(outputs_one2many, multi_targets)
+            # add one2many losses in to the final loss_dict
+            for k, v in one2many_loss_dict.items():
+                if k + "_one2many" in loss_dict.keys():
+                    loss_dict[k + "_one2many"] += v * self.lambda_one2many
+                else:
+                    loss_dict[k + "_one2many"] = v * self.lambda_one2many
+
+        return loss_dict
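The criterion returns a dict of unweighted loss terms (including the `_one2many` and per-layer auxiliary variants). A training step typically scales each term by `criterion.weight_dict` and sums the matches; the snippet below only illustrates that pattern with dummy numbers, since the actual reduction used by this project lives in its training engine.

```python
import torch

# Dummy stand-ins for criterion(outputs, targets) and criterion.weight_dict.
loss_dict = {"loss_cls": torch.tensor(0.8), "loss_box": torch.tensor(0.4),
             "loss_giou": torch.tensor(0.6), "loss_cls_one2many": torch.tensor(1.1)}
weight_dict = {"loss_cls": 2.0, "loss_box": 5.0, "loss_giou": 2.0, "loss_cls_one2many": 2.0}

# Weighted sum over the terms that have a coefficient.
total_loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict if k in weight_dict)
print(total_loss)  # tensor(7.)
```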

+ 347 - 0
odlab/models/detectors/detr/detr.py

@@ -0,0 +1,347 @@
+import math
+import torch
+import torch.nn as nn
+
+from ...backbone          import build_backbone
+from ...basic.mlp         import MLP
+from ...basic.conv        import BasicConv, UpSampleWrapper
+from ...basic.transformer import TransformerEncoder, PlainDETRTransformer, get_clones
+
+from utils.misc import multiclass_nms
+
+
+# DETR
+class DETR(nn.Module):
+    def __init__(self,
+                 cfg,
+                 num_classes = 80,
+                 conf_thresh = 0.1,
+                 nms_thresh  = 0.5,
+                 topk        = 300,
+                 use_nms     = False,
+                 ca_nms      = False,
+                 ):
+        super().__init__()
+        # ---------------- Basic setting ----------------
+        self.stride = cfg['out_stride']
+        self.upsample_factor = cfg['max_stride'] // cfg['out_stride']
+        self.num_classes = num_classes
+        ## Transformer parameters
+        self.num_queries_one2one = cfg['num_queries_one2one']
+        self.num_queries_one2many = cfg['num_queries_one2many']
+        self.num_queries = self.num_queries_one2one + self.num_queries_one2many
+        ## Post-process parameters
+        self.ca_nms = ca_nms
+        self.use_nms = use_nms
+        self.num_topk = topk
+        self.nms_thresh = nms_thresh
+        self.conf_thresh = conf_thresh
+
+        # ---------------- Network setting ----------------
+        ## Backbone Network
+        self.backbone, feat_dims = build_backbone(cfg)
+
+        ## Input projection
+        self.input_proj = BasicConv(feat_dims[-1], cfg['hidden_dim'], kernel_size=1, act_type=None, norm_type='GN')
+
+        ## Transformer Encoder
+        self.transformer_encoder = TransformerEncoder(d_model    = cfg['hidden_dim'],
+                                                      num_heads  = cfg['en_num_heads'],
+                                                      num_layers = cfg['en_num_layers'],
+                                                      ffn_dim    = cfg['en_ffn_dim'],
+                                                      dropout    = cfg['en_dropout'],
+                                                      act_type   = cfg['en_act'],
+                                                      pre_norm   = cfg['en_pre_norm'],
+                                                      )
+
+        ## Upsample layer
+        self.upsample = UpSampleWrapper(cfg['hidden_dim'], self.upsample_factor)
+        
+        ## Output projection
+        self.output_proj = BasicConv(cfg['hidden_dim'], cfg['hidden_dim'], kernel_size=3, padding=1, act_type='silu', norm_type='BN')
+        
+        ## Transformer
+        self.query_embed = nn.Embedding(self.num_queries, cfg['hidden_dim'])
+        self.transformer = PlainDETRTransformer(d_model             = cfg['hidden_dim'],
+                                                num_heads           = cfg['de_num_heads'],
+                                                ffn_dim             = cfg['de_ffn_dim'],
+                                                dropout             = cfg['de_dropout'],
+                                                act_type            = cfg['de_act'],
+                                                pre_norm            = cfg['de_pre_norm'],
+                                                rpe_hidden_dim      = cfg['rpe_hidden_dim'],
+                                                feature_stride      = cfg['out_stride'],
+                                                num_layers          = cfg['de_num_layers'],
+                                                use_checkpoint      = cfg['use_checkpoint'],
+                                                num_queries_one2one = cfg['num_queries_one2one'],
+                                                num_queries_one2many    = cfg['num_queries_one2many'],
+                                                proposal_feature_levels = cfg['proposal_feature_levels'],
+                                                proposal_in_stride      = cfg['out_stride'],
+                                                proposal_tgt_strides    = cfg['proposal_tgt_strides'],
+                                                return_intermediate = True,
+                                                )
+    
+        ## Detect Head
+        class_embed = nn.Linear(cfg['hidden_dim'], num_classes)
+        bbox_embed = MLP(cfg['hidden_dim'], cfg['hidden_dim'], 4, 3)
+
+        prior_prob = 0.01
+        bias_value = -math.log((1 - prior_prob) / prior_prob)
+        class_embed.bias.data = torch.ones(num_classes) * bias_value
+        nn.init.constant_(bbox_embed.layers[-1].weight.data, 0)
+        nn.init.constant_(bbox_embed.layers[-1].bias.data, 0)
+
+        self.class_embed = get_clones(class_embed, cfg['de_num_layers'] + 1)
+        self.bbox_embed  = get_clones(bbox_embed, cfg['de_num_layers'] + 1)
+        nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:], -2.0)
+
+        self.transformer.decoder.bbox_embed = self.bbox_embed
+        self.transformer.decoder.class_embed = self.class_embed
+
+    def get_posembed(self, d_model, mask, temperature=10000, normalize=False):
+        not_mask = ~mask
+        scale = 2 * torch.pi
+        num_pos_feats = d_model // 2
+
+        # -------------- Generate XY coords --------------
+        ## [B, H, W]
+        y_embed = not_mask.cumsum(1, dtype=torch.float32)
+        x_embed = not_mask.cumsum(2, dtype=torch.float32)
+        ## Normalize coords
+        if normalize:
+            y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + 1e-6)
+            x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + 1e-6)
+        else:
+            y_embed = y_embed - 0.5
+            x_embed = x_embed - 0.5
+        # [H, W] -> [B, H, W, 2]
+        pos = torch.stack([x_embed, y_embed], dim=-1)
+
+        # -------------- Sine-PosEmbedding --------------
+        dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos.device)
+        dim_t_ = torch.div(dim_t, 2, rounding_mode='floor') / num_pos_feats
+        dim_t = temperature ** (2 * dim_t_)
+
+        x_embed = pos[..., 0] * scale
+        y_embed = pos[..., 1] * scale
+        pos_x = x_embed[..., None] / dim_t
+        pos_y = y_embed[..., None] / dim_t
+        pos_x = torch.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()), dim=-1).flatten(-2)
+        pos_y = torch.stack((pos_y[..., 0::2].sin(), pos_y[..., 1::2].cos()), dim=-1).flatten(-2)
+        pos_embed = torch.cat((pos_y, pos_x), dim=-1)
+        
+        # [B, H, W, C] -> [B, C, H, W]
+        pos_embed = pos_embed.permute(0, 3, 1, 2)
+        
+        return pos_embed
+
+    def post_process(self, box_pred, cls_pred):
+        # Top-k select
+        cls_pred = cls_pred[0].flatten().sigmoid_()
+        box_pred = box_pred[0]
+
+        # Keep top k top scoring indices only.
+        num_topk = min(self.num_topk, box_pred.size(0))
+
+        # Topk candidates
+        predicted_prob, topk_idxs = cls_pred.sort(descending=True)
+        topk_scores = predicted_prob[:num_topk]
+        topk_idxs = topk_idxs[:self.num_topk]
+
+        # Filter out the proposals with low confidence score
+        keep_idxs = topk_scores > self.conf_thresh
+        topk_scores = topk_scores[keep_idxs]
+        topk_idxs = topk_idxs[keep_idxs]
+        topk_box_idxs = torch.div(topk_idxs, self.num_classes, rounding_mode='floor')
+
+        ## Top-k results
+        topk_labels = topk_idxs % self.num_classes
+        topk_bboxes = box_pred[topk_box_idxs]
+
+        topk_scores = topk_scores.cpu().numpy()
+        topk_labels = topk_labels.cpu().numpy()
+        topk_bboxes = topk_bboxes.cpu().numpy()
+
+        # nms
+        if self.use_nms:
+            topk_scores, topk_labels, topk_bboxes = multiclass_nms(
+                topk_scores, topk_labels, topk_bboxes, self.nms_thresh, self.num_classes, self.ca_nms)
+
+        return topk_bboxes, topk_scores, topk_labels
+
+    def resize_mask(self, src, mask=None):
+        bs, c, h, w = src.shape
+        if mask is not None:
+            # [B, H, W]
+            mask = nn.functional.interpolate(mask[None].float(), size=[h, w]).bool()[0]
+        else:
+            mask = torch.zeros([bs, h, w], device=src.device, dtype=torch.bool)
+
+        return mask
+    
+    @torch.jit.unused
+    def _set_aux_loss(self, outputs_class, outputs_coord, outputs_coord_old, outputs_deltas):
+        # this is a workaround to make torchscript happy, as torchscript
+        # doesn't support dictionary with non-homogeneous values, such
+        # as a dict having both a Tensor and a list.
+        return [
+            {"pred_logits": a, "pred_boxes": b, "pred_boxes_old": c, "pred_deltas": d, }
+            for a, b, c, d in zip(outputs_class[:-1], outputs_coord[:-1], outputs_coord_old[:-1], outputs_deltas[:-1])
+        ]
+
+    def inference_single_image(self, x):
+        # ----------- Image Encoder -----------
+        pyramid_feats = self.backbone(x)
+        src = self.input_proj(pyramid_feats[-1])
+        src = self.transformer_encoder(src)
+        src = self.upsample(src)
+        src = self.output_proj(src)
+
+        # ----------- Prepare inputs for Transformer -----------
+        mask = self.resize_mask(src)
+        pos_embed = self.get_posembed(src.shape[1], mask, normalize=False)
+        query_embeds = self.query_embed.weight[:self.num_queries_one2one]
+        self_attn_mask = None
+
+        # -----------Transformer -----------
+        (
+            hs,
+            init_reference,
+            inter_references,
+            _,
+            _,
+            _,
+            _,
+            max_shape
+        ) = self.transformer(src, mask, pos_embed, query_embeds, self_attn_mask)
+
+        # ----------- Process outputs -----------
+        outputs_classes_one2one = []
+        outputs_coords_one2one = []
+        outputs_deltas_one2one = []
+
+        for lid in range(hs.shape[0]):
+            if lid == 0:
+                reference = init_reference
+            else:
+                reference = inter_references[lid - 1]
+            outputs_class = self.class_embed[lid](hs[lid])
+            tmp = self.bbox_embed[lid](hs[lid])
+            outputs_coord = self.transformer.decoder.delta2bbox(reference, tmp, max_shape)  # xyxy
+
+            outputs_classes_one2one.append(outputs_class[:, :self.num_queries_one2one])
+            outputs_coords_one2one.append(outputs_coord[:, :self.num_queries_one2one])
+            outputs_deltas_one2one.append(tmp[:, :self.num_queries_one2one])
+
+        outputs_classes_one2one = torch.stack(outputs_classes_one2one)
+        outputs_coords_one2one = torch.stack(outputs_coords_one2one)
+
+        # ------------ Post process ------------
+        cls_pred = outputs_classes_one2one[-1]
+        box_pred = outputs_coords_one2one[-1]
+        
+        # post-process
+        bboxes, scores, labels = self.post_process(box_pred, cls_pred)
+        # normalize bbox
+        bboxes[..., 0::2] /= x.shape[-1]
+        bboxes[..., 1::2] /= x.shape[-2]
+        bboxes = bboxes.clip(0., 1.)
+
+        return bboxes, scores, labels
+        
+    def forward(self, x, src_mask=None, targets=None):
+        if not self.training:
+            return self.inference_single_image(x)
+
+        # ----------- Image Encoder -----------
+        pyramid_feats = self.backbone(x)
+        src = self.input_proj(pyramid_feats[-1])
+        src = self.transformer_encoder(src)
+        src = self.upsample(src)
+        src = self.output_proj(src)
+
+        # ----------- Prepare inputs for Transformer -----------
+        mask = self.resize_mask(src, src_mask)
+        pos_embed = self.get_posembed(src.shape[1], mask, normalize=False)
+        query_embeds = self.query_embed.weight
+        self_attn_mask = torch.zeros(
+            [self.num_queries, self.num_queries, ]).bool().to(src.device)
+        self_attn_mask[self.num_queries_one2one:, 0: self.num_queries_one2one, ] = True
+        self_attn_mask[0: self.num_queries_one2one, self.num_queries_one2one:, ] = True
+
+        # -----------Transformer -----------
+        (
+            hs,
+            init_reference,
+            inter_references,
+            enc_outputs_class,
+            enc_outputs_coord_unact,
+            enc_outputs_delta,
+            output_proposals,
+            max_shape
+        ) = self.transformer(src, mask, pos_embed, query_embeds, self_attn_mask)
+
+        # ----------- Process outputs -----------
+        outputs_classes_one2one = []
+        outputs_coords_one2one = []
+        outputs_coords_old_one2one = []
+        outputs_deltas_one2one = []
+
+        outputs_classes_one2many = []
+        outputs_coords_one2many = []
+        outputs_coords_old_one2many = []
+        outputs_deltas_one2many = []
+
+        for lid in range(hs.shape[0]):
+            if lid == 0:
+                reference = init_reference
+            else:
+                reference = inter_references[lid - 1]
+            outputs_class = self.class_embed[lid](hs[lid])
+            tmp = self.bbox_embed[lid](hs[lid])
+            outputs_coord = self.transformer.decoder.box_xyxy_to_cxcywh(
+                self.transformer.decoder.delta2bbox(reference, tmp, max_shape))
+
+            outputs_classes_one2one.append(outputs_class[:, 0: self.num_queries_one2one])
+            outputs_classes_one2many.append(outputs_class[:, self.num_queries_one2one:])
+
+            outputs_coords_one2one.append(outputs_coord[:, 0: self.num_queries_one2one])
+            outputs_coords_one2many.append(outputs_coord[:, self.num_queries_one2one:])
+
+            outputs_coords_old_one2one.append(reference[:, :self.num_queries_one2one])
+            outputs_coords_old_one2many.append(reference[:, self.num_queries_one2one:])
+            outputs_deltas_one2one.append(tmp[:, :self.num_queries_one2one])
+            outputs_deltas_one2many.append(tmp[:, self.num_queries_one2one:])
+
+        outputs_classes_one2one = torch.stack(outputs_classes_one2one)
+        outputs_coords_one2one = torch.stack(outputs_coords_one2one)
+
+        outputs_classes_one2many = torch.stack(outputs_classes_one2many)
+        outputs_coords_one2many = torch.stack(outputs_coords_one2many)
+
+        out = {
+            "pred_logits": outputs_classes_one2one[-1],
+            "pred_boxes": outputs_coords_one2one[-1],
+            "pred_logits_one2many": outputs_classes_one2many[-1],
+            "pred_boxes_one2many": outputs_coords_one2many[-1],
+
+            "pred_boxes_old": outputs_coords_old_one2one[-1],
+            "pred_deltas": outputs_deltas_one2one[-1],
+            "pred_boxes_old_one2many": outputs_coords_old_one2many[-1],
+            "pred_deltas_one2many": outputs_deltas_one2many[-1],
+        }
+
+        out["aux_outputs"] = self._set_aux_loss(
+            outputs_classes_one2one, outputs_coords_one2one, outputs_coords_old_one2one, outputs_deltas_one2one
+        )
+        out["aux_outputs_one2many"] = self._set_aux_loss(
+            outputs_classes_one2many, outputs_coords_one2many, outputs_coords_old_one2many, outputs_deltas_one2many
+        )
+
+        out["enc_outputs"] = {
+            "pred_logits": enc_outputs_class,
+            "pred_boxes": enc_outputs_coord_unact,
+            "pred_boxes_old": output_proposals,
+            "pred_deltas": enc_outputs_delta,
+        }
+
+        return out
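`post_process()` above picks detections from the scores flattened over (query, class): each kept flat index is split back into a query index (`flat // num_classes`) and a class label (`flat % num_classes`). Here is a standalone sketch of that indexing with dummy tensors.

```python
import torch

num_queries, num_classes, num_topk, conf_thresh = 300, 80, 100, 0.1
cls_pred = torch.randn(num_queries, num_classes).sigmoid().flatten()   # [nQ * nC]
box_pred = torch.rand(num_queries, 4)                                  # [nQ, 4], xyxy

# Top-k over the flattened (query, class) scores, then confidence filtering.
scores, idxs = cls_pred.sort(descending=True)
scores, idxs = scores[:num_topk], idxs[:num_topk]
keep = scores > conf_thresh
scores, idxs = scores[keep], idxs[keep]

box_idxs = torch.div(idxs, num_classes, rounding_mode='floor')  # which query
labels = idxs % num_classes                                     # which class
bboxes = box_pred[box_idxs]
print(bboxes.shape, scores.shape, labels.shape)
```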

+ 99 - 0
odlab/models/detectors/detr/matcher.py

@@ -0,0 +1,99 @@
+# ------------------------------------------------------------------------
+# Plain-DETR
+# Copyright (c) 2023 Xi'an Jiaotong University & Microsoft Research Asia.
+# Licensed under The MIT License [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Modified from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# ------------------------------------------------------------------------
+
+"""
+Modules to compute the matching cost and solve the corresponding LSAP.
+"""
+import torch
+from scipy.optimize import linear_sum_assignment
+from torch import nn
+
+from utils.box_ops import box_cxcywh_to_xyxy, generalized_box_iou, bbox2delta
+
+
+class HungarianMatcher(nn.Module):
+    def __init__(self,
+                 cost_class: float = 1,
+                 cost_bbox:  float = 1,
+                 cost_giou:  float = 1,
+                 ):
+        super().__init__()
+        self.cost_class = cost_class
+        self.cost_bbox = cost_bbox
+        self.cost_giou = cost_giou
+        assert (
+            cost_class != 0 or cost_bbox != 0 or cost_giou != 0
+        ), "all costs cant be 0"
+
+    def forward(self, outputs, targets):
+        """ Performs the matching
+
+        Params:
+            outputs: This is a dict that contains at least these entries:
+                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
+                 "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates
+
+            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
+                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
+                           objects in the target) containing the class labels
+                 "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates
+
+        Returns:
+            A list of size batch_size, containing tuples of (index_i, index_j) where:
+                - index_i is the indices of the selected predictions (in order)
+                - index_j is the indices of the corresponding selected targets (in order)
+            For each batch element, it holds:
+                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
+        """
+        with torch.no_grad():
+            bs, num_queries = outputs["pred_logits"].shape[:2]
+
+            # We flatten to compute the cost matrices in a batch
+            out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid()
+            out_bbox = outputs["pred_boxes"].flatten(0, 1)
+
+            # Also concat the target labels and boxes
+            tgt_ids = torch.cat([v["labels"] for v in targets]).to(out_prob.device)
+            tgt_bbox = torch.cat([v["boxes"] for v in targets]).to(out_prob.device)
+
+            # Compute the classification cost.
+            alpha = 0.25
+            gamma = 2.0
+            neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log())
+            pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
+            cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids]
+
+            # Compute the L1 cost between boxes
+            out_delta = outputs["pred_deltas"].flatten(0, 1)
+            out_bbox_old = outputs["pred_boxes_old"].flatten(0, 1)
+            tgt_delta = bbox2delta(out_bbox_old, tgt_bbox)
+            cost_bbox = torch.cdist(out_delta[:, None], tgt_delta, p=1).squeeze(1)
+
+            # Compute the giou cost between boxes
+            cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox),
+                                             box_cxcywh_to_xyxy(tgt_bbox)
+            )
+
+            # Final cost matrix
+            C = self.cost_bbox  * cost_bbox + \
+                self.cost_class * cost_class + \
+                self.cost_giou  * cost_giou
+            C = C.view(bs, num_queries, -1).cpu()
+
+            sizes = [len(v["boxes"]) for v in targets]
+            indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
+            
+            return [(torch.as_tensor(i, dtype=torch.int64),  # indices of the matched predictions (queries)
+                     torch.as_tensor(j, dtype=torch.int64))  # indices of the matched targets
+                     for i, j in indices]
+        
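As a self-contained toy illustration of the final step above (not part of the diff): `linear_sum_assignment` turns one image's slice of the cost matrix into one (prediction index, target index) pair per ground-truth box, which is the tuple format documented in `forward`.

```python
import torch
from scipy.optimize import linear_sum_assignment

num_queries, num_targets = 5, 2
cost = torch.rand(num_queries, num_targets)            # stand-in for one image's slice of C
pred_idx, tgt_idx = linear_sum_assignment(cost.numpy())
indices = (torch.as_tensor(pred_idx, dtype=torch.int64),
           torch.as_tensor(tgt_idx, dtype=torch.int64))
print(indices)  # two index tensors, each of length min(num_queries, num_targets) = 2
```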

+ 72 - 0
odlab/models/detectors/fcos/README.md

@@ -0,0 +1,72 @@
+# FCOS: Fully Convolutional One-Stage Object Detector
+
+Our `FCOS-R50-1x` baseline on COCO-val:
+```Shell
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.391
+ Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.579
+ Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.422
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.236
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.428
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.501
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.326
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.559
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.625
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.450
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.685
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.758
+```
+
+- FCOS
+
+| Model        |  scale     |  FPS  | AP<sup>val<br>0.5:0.95 | AP<sup>val<br>0.5 | Weight | Logs  |
+| -------------| ---------- | ----- | ---------------------- |  ---------------  | ------ | ----- |
+| FCOS_R18_1x  |  800,1333  |       |          34.1          |        52.2       | [ckpt](https://github.com/yjh0410/ODLab/releases/download/detection_weights/fcos_r18_1x_coco.pth) | [Logs](https://github.com/yjh0410/ODLab/releases/download/detection_weights/FCOS-R18-1x.txt) |
+| FCOS_R50_1x  |  800,1333  |       |          39.1          |        57.9       | [ckpt](https://github.com/yjh0410/ODLab/releases/download/detection_weights/fcos_r50_1x_coco.pth) | [Logs](https://github.com/yjh0410/ODLab/releases/download/detection_weights/FCOS-R50-1x.txt) |
+
+- Real-time FCOS
+
+| Model          |  scale     |  FPS  | AP<sup>val<br>0.5:0.95 | AP<sup>val<br>0.5 | Weight | Logs  |
+| ---------------| ---------- | ----- | ---------------------- |  ---------------  | ------ | ----- |
+| FCOS_RT_R18_4x |  512,736   |       |                        |                   |        |  |
+| FCOS_RT_R50_4x |  512,736   |       |          43.9          |        60.2       |        |  |
+
+## Train FCOS
+### Single GPU
+For example, to train **FCOS_R18_1x** on COCO:
+```Shell
+python main.py --cuda -d coco --root path/to/coco -m fcos_r18_1x --batch_size 16 --eval_epoch 2
+```
+
+### Multi GPU
+For example, to train **FCOS_R18_1x** on COCO with 8 GPUs:
+```Shell
+python -m torch.distributed.run --nproc_per_node=8 main.py --cuda -dist -d coco --root path/to/coco -m fcos_r18_1x --batch_size 16 --eval_epoch 2
+```
+
+## Test FCOS
+For example, to test **FCOS_R18_1x** on COCO-val:
+```Shell
+python test.py --cuda -d coco --root path/to/coco -m fcos_r18_1x --weight path/to/fcos_r18_1x.pth -vt 0.4 --show 
+```
+
+## Evaluate FCOS
+For example, to evaluate **FCOS_R18_1x** on COCO-val:
+```Shell
+python main.py --cuda -d coco --root path/to/coco -m fcos_r18_1x --resume path/to/fcos_r18_1x.pth --eval_first
+```
+
+## Demo
+### Detect with Image
+```Shell
+python demo.py --mode image --path_to_img path/to/image_dirs/ --cuda -m fcos_r18_1x --weight path/to/weight -vt 0.4 --show
+```
+
+### Detect with Video
+```Shell
+python demo.py --mode video --path_to_vid path/to/video --cuda -m fcos_r18_1x --weight path/to/weight -vt 0.4 --show --gif
+```
+
+### Detect with Camera
+```Shell
+python demo.py --mode camera --cuda -m fcos_r18_1x --weight path/to/weight -vt 0.4 --show --gif
+```

+ 24 - 0
odlab/models/detectors/fcos/build.py

@@ -0,0 +1,24 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+
+from .criterion import build_criterion
+from .fcos import FCOS
+
+
+# build FCOS
+def build_fcos(cfg, num_classes=80, is_val=False):
+    # -------------- Build FCOS --------------
+    model = FCOS(cfg         = cfg,
+                 num_classes = num_classes,
+                 conf_thresh = cfg['train_conf_thresh'] if is_val else cfg['test_conf_thresh'],
+                 nms_thresh  = cfg['train_nms_thresh']  if is_val else cfg['test_nms_thresh'],
+                 topk        = cfg['train_topk']        if is_val else cfg['test_topk'],
+                 )
+            
+    # -------------- Build Criterion --------------
+    criterion = None
+    if is_val:
+        # build criterion for training
+        criterion = build_criterion(cfg, num_classes)
+
+    return model, criterion
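A hedged sketch of the `is_val` switch above, using placeholder numbers rather than the tutorial's actual config values: during training/validation the `train_*` post-processing settings are used, otherwise the `test_*` settings.

```python
cfg = {'train_conf_thresh': 0.05, 'test_conf_thresh': 0.30,
       'train_nms_thresh':  0.60, 'test_nms_thresh':  0.50,
       'train_topk':        1000, 'test_topk':         300}

is_val = True  # True when building the model for training/validation
conf_thresh = cfg['train_conf_thresh'] if is_val else cfg['test_conf_thresh']
nms_thresh  = cfg['train_nms_thresh']  if is_val else cfg['test_nms_thresh']
topk        = cfg['train_topk']        if is_val else cfg['test_topk']
print(conf_thresh, nms_thresh, topk)  # 0.05 0.6 1000
```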

+ 267 - 0
odlab/models/detectors/fcos/criterion.py

@@ -0,0 +1,267 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from utils.box_ops import get_ious
+from utils.misc import sigmoid_focal_loss
+from utils.distributed_utils import get_world_size, is_dist_avail_and_initialized
+
+from .matcher import FcosMatcher, SimOtaMatcher
+
+
+class Criterion(nn.Module):
+    def __init__(self, cfg, num_classes=90):
+        super().__init__()
+        # ------------- Basic parameters -------------
+        self.cfg = cfg
+        self.num_classes = num_classes
+        # ------------- Focal loss -------------
+        self.alpha = cfg['focal_loss_alpha']
+        self.gamma = cfg['focal_loss_gamma']
+        # ------------- Loss weight -------------
+        self.weight_dict = {'loss_cls': cfg['loss_cls_weight'],
+                            'loss_reg': cfg['loss_reg_weight'],
+                            'loss_ctn': cfg['loss_ctn_weight']}
+        # ------------- Matcher -------------
+        self.matcher_cfg = cfg['matcher_hpy']
+        if cfg['matcher'] == 'fcos_matcher':
+            self.matcher = FcosMatcher(num_classes,
+                                       self.matcher_cfg['center_sampling_radius'],
+                                       self.matcher_cfg['object_sizes_of_interest'],
+                                       [1., 1., 1., 1.]
+                                       )
+        elif cfg['matcher'] == 'simota':
+            self.matcher = SimOtaMatcher(num_classes,
+                                         self.matcher_cfg['soft_center_radius'],
+                                         self.matcher_cfg['topk_candidates'])
+        else:
+            raise NotImplementedError("Unknown matcher: {}.".format(cfg['matcher']))
+
+    def loss_labels(self, pred_cls, tgt_cls, num_boxes=1.0):
+        """
+            pred_cls: (Tensor) [N, C]
+            tgt_cls:  (Tensor) [N, C]
+        """
+        # cls loss: [V, C]
+        loss_cls = sigmoid_focal_loss(pred_cls, tgt_cls, self.alpha, self.gamma)
+
+        return loss_cls.sum() / num_boxes
+
+    def loss_bboxes_ltrb(self, pred_delta, tgt_delta, bbox_quality=None, num_boxes=1.0):
+        """
+            pred_delta: (Tensor) [N, 4]
+            tgt_delta:  (Tensor) [N, 4]
+        """
+        pred_delta = torch.cat((-pred_delta[..., :2], pred_delta[..., 2:]), dim=-1)
+        tgt_delta = torch.cat((-tgt_delta[..., :2], tgt_delta[..., 2:]), dim=-1)
+
+        eps = torch.finfo(torch.float32).eps
+
+        pred_area = (pred_delta[..., 2] - pred_delta[..., 0]).clamp_(min=0) \
+            * (pred_delta[..., 3] - pred_delta[..., 1]).clamp_(min=0)
+        tgt_area = (tgt_delta[..., 2] - tgt_delta[..., 0]).clamp_(min=0) \
+            * (tgt_delta[..., 3] - tgt_delta[..., 1]).clamp_(min=0)
+
+        w_intersect = (torch.min(pred_delta[..., 2], tgt_delta[..., 2])
+                    - torch.max(pred_delta[..., 0], tgt_delta[..., 0])).clamp_(min=0)
+        h_intersect = (torch.min(pred_delta[..., 3], tgt_delta[..., 3])
+                    - torch.max(pred_delta[..., 1], tgt_delta[..., 1])).clamp_(min=0)
+
+        area_intersect = w_intersect * h_intersect
+        area_union = tgt_area + pred_area - area_intersect
+        ious = area_intersect / area_union.clamp(min=eps)
+
+        # giou
+        g_w_intersect = torch.max(pred_delta[..., 2], tgt_delta[..., 2]) \
+            - torch.min(pred_delta[..., 0], tgt_delta[..., 0])
+        g_h_intersect = torch.max(pred_delta[..., 3], tgt_delta[..., 3]) \
+            - torch.min(pred_delta[..., 1], tgt_delta[..., 1])
+        ac_union = g_w_intersect * g_h_intersect
+        gious = ious - (ac_union - area_union) / ac_union.clamp(min=eps)
+        loss_box = 1 - gious
+
+        if bbox_quality is not None:
+            loss_box = loss_box * bbox_quality.view(loss_box.size())
+
+        return loss_box.sum() / num_boxes
+
+    def loss_bboxes_xyxy(self, pred_box, gt_box, num_boxes=1.0):
+        ious = get_ious(pred_box, gt_box, box_mode="xyxy", iou_type='giou')
+        loss_box = 1.0 - ious
+
+        return loss_box.sum() / num_boxes
+    
+    def fcos_loss(self, outputs, targets):
+        """
+            outputs['pred_cls']: (Tensor) [B, M, C]
+            outputs['pred_reg']: (Tensor) [B, M, 4]
+            outputs['pred_ctn']: (Tensor) [B, M, 1]
+            outputs['strides']: (List) [8, 16, 32, ...] stride of the model output
+            targets: (List) [dict{'boxes': [...], 
+                                 'labels': [...], 
+                                 'orig_size': ...}, ...]
+        """
+        # -------------------- Pre-process --------------------
+        device = outputs['pred_cls'][0].device
+        fpn_strides = outputs['strides']
+        anchors = outputs['anchors']
+        pred_cls = torch.cat(outputs['pred_cls'], dim=1).view(-1, self.num_classes)
+        pred_delta = torch.cat(outputs['pred_reg'], dim=1).view(-1, 4)
+        pred_ctn = torch.cat(outputs['pred_ctn'], dim=1).view(-1, 1)
+        masks = ~torch.cat(outputs['mask'], dim=1).view(-1)
+
+        # -------------------- Label Assignment --------------------
+        gt_classes, gt_deltas, gt_centerness = self.matcher(fpn_strides, anchors, targets)
+        gt_classes = gt_classes.flatten().to(device)
+        gt_deltas = gt_deltas.view(-1, 4).to(device)
+        gt_centerness = gt_centerness.view(-1, 1).to(device)
+
+        foreground_idxs = (gt_classes >= 0) & (gt_classes != self.num_classes)
+        num_foreground = foreground_idxs.sum()
+
+        if is_dist_avail_and_initialized():
+            torch.distributed.all_reduce(num_foreground)
+        num_foreground = torch.clamp(num_foreground / get_world_size(), min=1).item()
+
+        num_foreground_centerness = gt_centerness[foreground_idxs].sum()
+        if is_dist_avail_and_initialized():
+            torch.distributed.all_reduce(num_foreground_centerness)
+        num_targets = torch.clamp(num_foreground_centerness / get_world_size(), min=1).item()
+
+        # -------------------- classification loss --------------------
+        gt_classes_target = torch.zeros_like(pred_cls)
+        gt_classes_target[foreground_idxs, gt_classes[foreground_idxs]] = 1
+        valid_idxs = (gt_classes >= 0) & masks
+        loss_labels = self.loss_labels(
+            pred_cls[valid_idxs], gt_classes_target[valid_idxs], num_foreground)
+
+        # -------------------- regression loss --------------------
+        loss_bboxes = self.loss_bboxes_ltrb(
+            pred_delta[foreground_idxs], gt_deltas[foreground_idxs], gt_centerness[foreground_idxs], num_targets)
+
+        # -------------------- centerness loss --------------------
+        loss_centerness = F.binary_cross_entropy_with_logits(
+            pred_ctn[foreground_idxs],  gt_centerness[foreground_idxs], reduction='none')
+        loss_centerness = loss_centerness.sum() / num_foreground
+
+        loss_dict = dict(
+                loss_cls = loss_labels,
+                loss_reg = loss_bboxes,
+                loss_ctn = loss_centerness,
+        )
+
+        return loss_dict
+    
+    def ota_loss(self, outputs, targets):
+        """
+            outputs['pred_cls']: (Tensor) [B, M, C]
+            outputs['pred_reg']: (Tensor) [B, M, 4]
+            outputs['pred_box']: (Tensor) [B, M, 4]
+            outputs['pred_ctn']: (Tensor) [B, M, 1]
+            outputs['strides']: (List) [8, 16, 32, ...] stride of the model output
+            targets: (List) [dict{'boxes': [...], 
+                                 'labels': [...], 
+                                 'orig_size': ...}, ...]
+        """
+        # -------------------- Pre-process --------------------
+        device = outputs['pred_cls'][0].device
+        batch_size =  outputs['pred_cls'][0].shape[0]
+        fpn_strides = outputs['strides']
+        anchors = outputs['anchors']
+        pred_cls = torch.cat(outputs['pred_cls'], dim=1)   # [B, M, C]
+        pred_box = torch.cat(outputs['pred_box'], dim=1)   # [B, M, 4]
+        pred_ctn = torch.cat(outputs['pred_ctn'], dim=1)   # [B, M, 1]
+        masks = ~torch.cat(outputs['mask'], dim=1).view(-1)
+
+        # -------------------- Label Assignment --------------------
+        gt_classes = []
+        gt_bboxes = []
+        gt_centerness = []
+        for batch_idx in range(batch_size):
+            tgt_labels = targets[batch_idx]["labels"].to(device)  # [N,]
+            tgt_bboxes = targets[batch_idx]["boxes"].to(device)   # [N, 4]
+            # refine target
+            tgt_boxes_wh = tgt_bboxes[..., 2:] - tgt_bboxes[..., :2]
+            min_tgt_size = torch.min(tgt_boxes_wh, dim=-1)[0]
+            keep = (min_tgt_size >= 8)
+            tgt_bboxes = tgt_bboxes[keep]
+            tgt_labels = tgt_labels[keep]
+            # label assignment
+            assigned_result = self.matcher(fpn_strides=fpn_strides,
+                                           anchors=anchors,
+                                           pred_cls=pred_cls[batch_idx].detach(),
+                                           pred_box=pred_box[batch_idx].detach(),
+                                           pred_iou=pred_ctn[batch_idx].detach(),
+                                           gt_labels=tgt_labels,
+                                           gt_bboxes=tgt_bboxes
+                                           )
+            gt_classes.append(assigned_result['assigned_labels'])
+            gt_bboxes.append(assigned_result['assigned_bboxes'])
+            gt_centerness.append(assigned_result['assign_metrics'])
+
+        # List[B, M, C] -> Tensor[BM, C]
+        gt_classes = torch.cat(gt_classes, dim=0)         # [BM,]
+        gt_bboxes = torch.cat(gt_bboxes, dim=0)           # [BM, 4]
+        gt_centerness = torch.cat(gt_centerness, dim=0)   # [BM,]
+
+        valid_idxs = (gt_classes >= 0) & masks
+        foreground_idxs = (gt_classes >= 0) & (gt_classes != self.num_classes)
+        num_foreground = foreground_idxs.sum()
+
+        if is_dist_avail_and_initialized():
+            torch.distributed.all_reduce(num_foreground)
+        num_foreground = torch.clamp(num_foreground / get_world_size(), min=1).item()
+
+        # -------------------- classification loss --------------------
+        pred_cls = pred_cls.view(-1, self.num_classes)
+        gt_classes_target = torch.zeros_like(pred_cls)
+        gt_classes_target[foreground_idxs, gt_classes[foreground_idxs]] = 1
+        loss_labels = self.loss_labels(pred_cls[valid_idxs], gt_classes_target[valid_idxs], num_foreground)
+
+        # -------------------- regression loss --------------------
+        pred_box = pred_box.view(-1, 4)
+        pred_box_pos = pred_box[foreground_idxs]
+        gt_box_pos = gt_bboxes[foreground_idxs]
+        loss_bboxes = self.loss_bboxes_xyxy(pred_box_pos, gt_box_pos, num_foreground)
+
+        # -------------------- centerness loss --------------------
+        pred_ctn = pred_ctn.view(-1)
+        pred_ctn_pos = pred_ctn[foreground_idxs]
+        gt_ctn_pos = gt_centerness[foreground_idxs]
+        loss_centerness = F.binary_cross_entropy_with_logits(pred_ctn_pos, gt_ctn_pos, reduction='none')
+        loss_centerness = loss_centerness.sum() / num_foreground
+
+        loss_dict = dict(
+                loss_cls = loss_labels,
+                loss_reg = loss_bboxes,
+                loss_ctn = loss_centerness,
+        )
+
+        return loss_dict
+    
+    def forward(self, outputs, targets):
+        """
+            outputs['pred_cls']: (Tensor) [B, M, C]
+            outputs['pred_reg']: (Tensor) [B, M, 4]
+            outputs['pred_ctn']: (Tensor) [B, M, 1]
+            outputs['strides']: (List) [8, 16, 32, ...] stride of the model output
+            targets: (List) [dict{'boxes': [...], 
+                                 'labels': [...], 
+                                 'orig_size': ...}, ...]
+        """
+        if self.cfg['matcher'] == "fcos_matcher":
+            return self.fcos_loss(outputs, targets)
+        elif self.cfg['matcher'] == "simota":
+            return self.ota_loss(outputs, targets)
+        else:
+            raise NotImplementedError
+            
+
+# build criterion
+def build_criterion(cfg, num_classes=80):
+    criterion = Criterion(cfg=cfg, num_classes=num_classes)
+    return criterion
+
+
+if __name__ == "__main__":
+    pass
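A toy, self-contained example (not from the diff) of how the criterion above builds one-hot focal-loss targets from the assigned class indices; anchors assigned to the background class (index == num_classes) keep an all-zero target row.

```python
import torch

num_classes = 3
gt_classes = torch.tensor([0, 2, num_classes, 1])    # the third anchor is background
pred_cls = torch.zeros(4, num_classes)               # stand-in for classification logits

foreground_idxs = (gt_classes >= 0) & (gt_classes != num_classes)
gt_classes_target = torch.zeros_like(pred_cls)
gt_classes_target[foreground_idxs, gt_classes[foreground_idxs]] = 1
print(gt_classes_target)
# tensor([[1., 0., 0.],
#         [0., 0., 1.],
#         [0., 0., 0.],
#         [0., 1., 0.]])
```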

+ 122 - 0
odlab/models/detectors/fcos/fcos.py

@@ -0,0 +1,122 @@
+import torch
+import torch.nn as nn
+
+# --------------- Model components ---------------
+from ...backbone import build_backbone
+from ...neck import build_neck
+from ...head import build_head
+
+# --------------- External components ---------------
+from utils.misc import multiclass_nms
+
+
+# ------------------------ Fully Convolutional One-Stage Detector ------------------------
+class FCOS(nn.Module):
+    def __init__(self, 
+                 cfg,
+                 num_classes :int   = 80, 
+                 conf_thresh :float = 0.05,
+                 nms_thresh  :float = 0.6,
+                 topk        :int   = 1000,
+                 ca_nms      :bool  = False):
+        super(FCOS, self).__init__()
+        # ---------------------- Basic Parameters ----------------------
+        self.cfg = cfg
+        self.topk = topk
+        self.num_classes = num_classes
+        self.conf_thresh = conf_thresh
+        self.nms_thresh = nms_thresh
+        self.ca_nms = ca_nms
+
+        # ---------------------- Network Parameters ----------------------
+        ## Backbone
+        self.backbone, feat_dims = build_backbone(cfg)
+
+        ## Neck
+        self.fpn = build_neck(cfg, feat_dims, cfg['head_dim'])
+        
+        ## Heads
+        self.head = build_head(cfg, cfg['head_dim'], cfg['head_dim'], num_classes)
+
+    def post_process(self, cls_preds, ctn_preds, box_preds):
+        """
+        Input:
+            cls_preds: List(Tensor) [[B, H x W, C], ...]
+            ctn_preds: List(Tensor) [[B, H x W, 1], ...]
+            box_preds: List(Tensor) [[B, H x W, 4], ...]
+        """
+        all_scores = []
+        all_labels = []
+        all_bboxes = []
+        
+        for cls_pred_i, ctn_pred_i, box_pred_i in zip(cls_preds, ctn_preds, box_preds):
+            cls_pred_i = cls_pred_i[0]
+            ctn_pred_i = ctn_pred_i[0]
+            box_pred_i = box_pred_i[0]
+            
+            # (H x W x C,)
+            scores_i = torch.sqrt(cls_pred_i.sigmoid() * ctn_pred_i.sigmoid()).flatten()
+
+            # Keep top k top scoring indices only.
+            num_topk = min(self.topk, box_pred_i.size(0))
+
+            # torch.sort is actually faster than .topk (at least on GPUs)
+            predicted_prob, topk_idxs = scores_i.sort(descending=True)
+            topk_scores = predicted_prob[:num_topk]
+            topk_idxs = topk_idxs[:num_topk]
+
+            # filter out the proposals with low confidence score
+            keep_idxs = topk_scores > self.conf_thresh
+            topk_idxs = topk_idxs[keep_idxs]
+
+            # final scores
+            scores = topk_scores[keep_idxs]
+            # final labels
+            labels = topk_idxs % self.num_classes
+            # final bboxes
+            anchor_idxs = torch.div(topk_idxs, self.num_classes, rounding_mode='floor')
+            bboxes = box_pred_i[anchor_idxs]
+
+            all_scores.append(scores)
+            all_labels.append(labels)
+            all_bboxes.append(bboxes)
+
+        scores = torch.cat(all_scores)
+        labels = torch.cat(all_labels)
+        bboxes = torch.cat(all_bboxes)
+
+        # to cpu & numpy
+        scores = scores.cpu().numpy()
+        labels = labels.cpu().numpy()
+        bboxes = bboxes.cpu().numpy()
+
+        # nms
+        scores, labels, bboxes = multiclass_nms(
+            scores, labels, bboxes, self.nms_thresh, self.num_classes, self.ca_nms)
+
+        return bboxes, scores, labels
+
+    def forward(self, src, src_mask=None, targets=None):
+        # ---------------- Backbone ----------------
+        pyramid_feats = self.backbone(src)
+
+        # ---------------- Neck ----------------
+        pyramid_feats = self.fpn(pyramid_feats)
+
+        # ---------------- Heads ----------------
+        outputs = self.head(pyramid_feats, src_mask)
+
+        if not self.training:
+            # ---------------- PostProcess ----------------
+            cls_pred = outputs["pred_cls"]
+            ctn_pred = outputs["pred_ctn"]
+            box_pred = outputs["pred_box"]
+            bboxes, scores, labels = self.post_process(cls_pred, ctn_pred, box_pred)
+            # normalize bbox
+            bboxes[..., 0::2] /= src.shape[-1]
+            bboxes[..., 1::2] /= src.shape[-2]
+            bboxes = bboxes.clip(0., 1.)
+
+            return bboxes, scores, labels
+
+        return outputs 
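A toy example (not part of the diff) of the index decoding used in `post_process` above: flattening the `[M, C]` score map gives indices in `[0, M*C)`, so modulo `C` recovers the class label and integer division by `C` recovers the anchor index.

```python
import torch

num_anchors, num_classes = 4, 3
scores = torch.rand(num_anchors, num_classes).flatten()   # [M * C]
topk_scores, topk_idxs = scores.sort(descending=True)
topk_idxs = topk_idxs[:2]                                  # keep the top-2 candidates
labels = topk_idxs % num_classes                           # class id of each candidate
anchor_idxs = torch.div(topk_idxs, num_classes, rounding_mode='floor')
print(labels, anchor_idxs)
```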

+ 382 - 0
odlab/models/detectors/fcos/matcher.py

@@ -0,0 +1,382 @@
+# ---------------------------------------------------------------------
+# Copyright (c) Megvii Inc. All rights reserved.
+# ---------------------------------------------------------------------
+
+
+import math
+import torch
+import torch.nn.functional as F
+
+from utils.box_ops import *
+
+
+@torch.no_grad()
+def get_ious_and_iou_loss(inputs,
+                          targets,
+                          weight=None,
+                          box_mode="xyxy",
+                          loss_type="iou",
+                          reduction="none"):
+    """
+    Compute iou loss of type ['iou', 'giou', 'linear_iou']
+
+    Args:
+        inputs (tensor): pred values
+        targets (tensor): target values
+        weight (tensor): loss weight
+        box_mode (str): 'xyxy' or 'ltrb'; both modes are supported.
+        loss_type (str): 'giou' or 'iou' or 'linear_iou'
+        reduction (str): reduction manner
+
+    Returns:
+        loss (tensor): computed iou loss.
+    """
+    if box_mode == "ltrb":
+        inputs = torch.cat((-inputs[..., :2], inputs[..., 2:]), dim=-1)
+        targets = torch.cat((-targets[..., :2], targets[..., 2:]), dim=-1)
+    elif box_mode != "xyxy":
+        raise NotImplementedError
+
+    eps = torch.finfo(torch.float32).eps
+
+    inputs_area = (inputs[..., 2] - inputs[..., 0]).clamp_(min=0) \
+        * (inputs[..., 3] - inputs[..., 1]).clamp_(min=0)
+    targets_area = (targets[..., 2] - targets[..., 0]).clamp_(min=0) \
+        * (targets[..., 3] - targets[..., 1]).clamp_(min=0)
+
+    w_intersect = (torch.min(inputs[..., 2], targets[..., 2])
+                   - torch.max(inputs[..., 0], targets[..., 0])).clamp_(min=0)
+    h_intersect = (torch.min(inputs[..., 3], targets[..., 3])
+                   - torch.max(inputs[..., 1], targets[..., 1])).clamp_(min=0)
+
+    area_intersect = w_intersect * h_intersect
+    area_union = targets_area + inputs_area - area_intersect
+    ious = area_intersect / area_union.clamp(min=eps)
+
+    if loss_type == "iou":
+        loss = -ious.clamp(min=eps).log()
+    elif loss_type == "linear_iou":
+        loss = 1 - ious
+    elif loss_type == "giou":
+        g_w_intersect = torch.max(inputs[..., 2], targets[..., 2]) \
+            - torch.min(inputs[..., 0], targets[..., 0])
+        g_h_intersect = torch.max(inputs[..., 3], targets[..., 3]) \
+            - torch.min(inputs[..., 1], targets[..., 1])
+        ac_union = g_w_intersect * g_h_intersect
+        gious = ious - (ac_union - area_union) / ac_union.clamp(min=eps)
+        loss = 1 - gious
+    else:
+        raise NotImplementedError
+    if weight is not None:
+        loss = loss * weight.view(loss.size())
+        if reduction == "mean":
+            loss = loss.sum() / max(weight.sum().item(), eps)
+    else:
+        if reduction == "mean":
+            loss = loss.mean()
+    if reduction == "sum":
+        loss = loss.sum()
+
+    return ious, loss
+
+
+class FcosMatcher(object):
+    """
+        This code referenced to https://github.com/Megvii-BaseDetection/cvpods
+    """
+    def __init__(self, 
+                 num_classes,
+                 center_sampling_radius,
+                 object_sizes_of_interest,
+                 box_weights=[1, 1, 1, 1]):
+        self.num_classes = num_classes
+        self.center_sampling_radius = center_sampling_radius
+        self.object_sizes_of_interest = object_sizes_of_interest
+        self.box_weights = box_weights
+
+
+    def get_deltas(self, anchors, boxes):
+        """
+        Get box regression transformation deltas (dl, dt, dr, db) that can be used
+        to transform the `anchors` into the `boxes`. That is, the relation
+        ``boxes == self.apply_deltas(deltas, anchors)`` is true.
+
+        Args:
+            anchors (Tensor): anchors, e.g., feature map coordinates
+            boxes (Tensor): target of the transformation, e.g., ground-truth
+                boxes.
+        """
+        assert isinstance(anchors, torch.Tensor), type(anchors)
+        assert isinstance(boxes, torch.Tensor), type(boxes)
+        deltas = torch.cat((anchors - boxes[..., :2], boxes[..., 2:] - anchors),
+                           dim=-1) * anchors.new_tensor(self.box_weights)
+        return deltas
+
+
+    @torch.no_grad()
+    def __call__(self, fpn_strides, anchors, targets):
+        """
+            fpn_strides: (List) List[8, 16, 32, ...] stride of network output.
+            anchors: (List of Tensor) List[F, M, 2], F = num_fpn_levels
+            targets: (Dict) dict{'boxes': [...], 
+                                 'labels': [...], 
+                                 'orig_size': ...}
+        """
+        gt_classes = []
+        gt_anchors_deltas = []
+        gt_centerness = []
+        device = anchors[0].device
+
+        # List[F, M, 2] -> [M, 2]
+        anchors_over_all_feature_maps = torch.cat(anchors, dim=0).to(device)
+
+        for targets_per_image in targets:
+            # generate object_sizes_of_interest: List[[M, 2]]
+            object_sizes_of_interest = [anchors_i.new_tensor(scale_range).unsqueeze(0).expand(anchors_i.size(0), -1) 
+                                        for anchors_i, scale_range in zip(anchors, self.object_sizes_of_interest)]
+            # List[F, M, 2] -> [M, 2], M = M1 + M2 + ... + MF
+            object_sizes_of_interest = torch.cat(object_sizes_of_interest, dim=0)
+            # [N, 4]
+            tgt_box = targets_per_image['boxes'].to(device)
+            # [N, C]
+            tgt_cls = targets_per_image['labels'].to(device)
+            # [N, M, 4], M = M1 + M2 + ... + MF
+            deltas = self.get_deltas(anchors_over_all_feature_maps, tgt_box.unsqueeze(1))
+
+            has_gt = (len(tgt_cls) > 0)
+            if has_gt:
+                if self.center_sampling_radius > 0:
+                    # bbox centers: [N, 2]
+                    centers = (tgt_box[..., :2] + tgt_box[..., 2:]) * 0.5
+
+                    is_in_boxes = []
+                    for stride, anchors_i in zip(fpn_strides, anchors):
+                        radius = stride * self.center_sampling_radius
+                        # [N, 4]
+                        center_boxes = torch.cat((
+                            torch.max(centers - radius, tgt_box[:, :2]),
+                            torch.min(centers + radius, tgt_box[:, 2:]),
+                        ), dim=-1)
+                        # [N, Mi, 4]
+                        center_deltas = self.get_deltas(anchors_i, center_boxes.unsqueeze(1))
+                        # [N, Mi]
+                        is_in_boxes.append(center_deltas.min(dim=-1).values > 0)
+                    # [N, M], M = M1 + M2 + ... + MF
+                    is_in_boxes = torch.cat(is_in_boxes, dim=1)
+                else:
+                    # no center sampling, it will use all the locations within a ground-truth box
+                    # [N, M], M = M1 + M2 + ... + MF
+                    is_in_boxes = deltas.min(dim=-1).values > 0
+                # [N, M], M = M1 + M2 + ... + MF
+                max_deltas = deltas.max(dim=-1).values
+                # limit the regression range for each location
+                is_cared_in_the_level = \
+                    (max_deltas >= object_sizes_of_interest[None, :, 0]) & \
+                    (max_deltas <= object_sizes_of_interest[None, :, 1])
+
+                # [N,]
+                tgt_box_area = (tgt_box[:, 2] - tgt_box[:, 0]) * (tgt_box[:, 3] - tgt_box[:, 1])
+                # [N,] -> [N, 1] -> [N, M]
+                gt_positions_area = tgt_box_area.unsqueeze(1).repeat(
+                    1, anchors_over_all_feature_maps.size(0))
+                gt_positions_area[~is_in_boxes] = math.inf
+                gt_positions_area[~is_cared_in_the_level] = math.inf
+
+                # if there are still more than one objects for a position,
+                # we choose the one with minimal area
+                # [M,], each element is the index of ground-truth
+                positions_min_area, gt_matched_idxs = gt_positions_area.min(dim=0)
+
+                # ground truth box regression
+                # [M, 4]
+                gt_anchors_reg_deltas_i = self.get_deltas(
+                    anchors_over_all_feature_maps, tgt_box[gt_matched_idxs])
+
+                # [M,]
+                tgt_cls_i = tgt_cls[gt_matched_idxs]
+                # anchors with area inf are treated as background.
+                tgt_cls_i[positions_min_area == math.inf] = self.num_classes
+
+                # ground truth centerness
+                left_right = gt_anchors_reg_deltas_i[:, [0, 2]]
+                top_bottom = gt_anchors_reg_deltas_i[:, [1, 3]]
+                # [M,]
+                gt_centerness_i = torch.sqrt(
+                    (left_right.min(dim=-1).values / left_right.max(dim=-1).values).clamp_(min=0)
+                    * (top_bottom.min(dim=-1).values / top_bottom.max(dim=-1).values).clamp_(min=0)
+                )
+
+                gt_classes.append(tgt_cls_i)
+                gt_anchors_deltas.append(gt_anchors_reg_deltas_i)
+                gt_centerness.append(gt_centerness_i)
+
+                if self.center_sampling_radius > 0:
+                    del centers, center_boxes, center_deltas
+                del deltas, max_deltas
+
+            else:
+                tgt_cls_i = torch.zeros(anchors_over_all_feature_maps.shape[0], device=device) + self.num_classes
+                gt_anchors_reg_deltas_i = torch.zeros([anchors_over_all_feature_maps.shape[0], 4], device=device)
+                gt_centerness_i = torch.zeros(anchors_over_all_feature_maps.shape[0], device=device)
+
+                gt_classes.append(tgt_cls_i.long())
+                gt_anchors_deltas.append(gt_anchors_reg_deltas_i.float())
+                gt_centerness.append(gt_centerness_i.float())
+
+
+        # [B, M], [B, M, 4], [B, M]
+        return torch.stack(gt_classes), torch.stack(gt_anchors_deltas), torch.stack(gt_centerness)
+
+
+class SimOtaMatcher(object):
+    def __init__(self, num_classes, soft_center_radius=3.0, topk_candidates=13):
+        self.num_classes = num_classes
+        self.soft_center_radius = soft_center_radius
+        self.topk_candidates = topk_candidates
+
+    @torch.no_grad()
+    def __call__(self, 
+                 fpn_strides, 
+                 anchors, 
+                 pred_cls, 
+                 pred_box,
+                 pred_iou,
+                 gt_labels,
+                 gt_bboxes):
+        # [M,]
+        strides = torch.cat([torch.ones_like(anchor_i[:, 0]) * stride_i
+                                for stride_i, anchor_i in zip(fpn_strides, anchors)], dim=-1)
+        # List[F, M, 2] -> [M, 2]
+        num_gt = len(gt_labels)
+        anchors = torch.cat(anchors, dim=0)
+
+        # check gt
+        if num_gt == 0 or gt_bboxes.max().item() == 0.:
+            return {
+                'assigned_labels': gt_labels.new_full(pred_cls[..., 0].shape,
+                                                      self.num_classes,
+                                                      dtype=torch.long),
+                'assigned_bboxes': gt_bboxes.new_full(pred_box.shape, 0),
+                'assign_metrics': gt_bboxes.new_full(pred_cls[..., 0].shape, 0)
+            }
+        
+        # get inside points: [N, M]
+        is_in_gt = self.find_inside_points(gt_bboxes, anchors)
+        valid_mask = is_in_gt.sum(dim=0) > 0  # [M,]
+
+        # ----------------------------------- soft center prior -----------------------------------
+        gt_center = (gt_bboxes[..., :2] + gt_bboxes[..., 2:]) / 2.0
+        distance = (anchors.unsqueeze(0) - gt_center.unsqueeze(1)
+                    ).pow(2).sum(-1).sqrt() / strides.unsqueeze(0)  # [N, M]
+        distance = distance * valid_mask.unsqueeze(0)
+        soft_center_prior = torch.pow(10, distance - self.soft_center_radius)
+
+        # ----------------------------------- regression cost -----------------------------------
+        pair_wise_ious, _ = box_iou(gt_bboxes, pred_box)  # [N, M]
+        pair_wise_ious_loss = -torch.log(pair_wise_ious + 1e-8) * 3.0
+
+        # ----------------------------------- classification cost -----------------------------------
+        ## select the predicted scores corresponded to the gt_labels
+        pred_scores = torch.sqrt(pred_cls.sigmoid() * pred_iou.sigmoid())
+        pred_scores = pred_scores.permute(1, 0)  # [M, C] -> [C, M]
+        pairwise_pred_scores = pred_scores[gt_labels.long(), :].float()   # [N, M]
+        ## scale factor
+        scale_factor = (pair_wise_ious - pairwise_pred_scores).abs().pow(2.0)
+        ## cls cost
+        pair_wise_cls_loss = F.binary_cross_entropy(
+            pairwise_pred_scores, pair_wise_ious,
+            reduction="none") * scale_factor # [N, M]
+            
+        del pairwise_pred_scores
+
+        ## foreground cost matrix
+        cost_matrix = pair_wise_cls_loss + pair_wise_ious_loss + soft_center_prior
+        max_pad_value = torch.ones_like(cost_matrix) * 1e9
+        cost_matrix = torch.where(valid_mask[None].repeat(num_gt, 1),   # [N, M]
+                                  cost_matrix, max_pad_value)
+
+        # ----------------------------------- dynamic label assignment -----------------------------------
+        matched_pred_ious, matched_gt_inds, fg_mask_inboxes = self.dynamic_k_matching(
+            cost_matrix, pair_wise_ious, num_gt)
+        del pair_wise_cls_loss, cost_matrix, pair_wise_ious, pair_wise_ious_loss
+
+        # ----------------------------------- process assigned labels -----------------------------------
+        assigned_labels = gt_labels.new_full(pred_cls[..., 0].shape,
+                                             self.num_classes)  # [M,]
+        assigned_labels[fg_mask_inboxes] = gt_labels[matched_gt_inds].squeeze(-1)
+        assigned_labels = assigned_labels.long()  # [M,]
+
+        assigned_bboxes = gt_bboxes.new_full(pred_box.shape, 0)        # [M, 4]
+        assigned_bboxes[fg_mask_inboxes] = gt_bboxes[matched_gt_inds]  # [M, 4]
+
+        assign_metrics = gt_bboxes.new_full(pred_cls[..., 0].shape, 0) # [M,]
+        assign_metrics[fg_mask_inboxes] = matched_pred_ious            # [M,]
+
+        assigned_dict = dict(
+            assigned_labels=assigned_labels,
+            assigned_bboxes=assigned_bboxes,
+            assign_metrics=assign_metrics
+            )
+        
+        return assigned_dict
+
+    def find_inside_points(self, gt_bboxes, anchors):
+        """
+            gt_bboxes: Tensor -> [N, 4]
+            anchors:   Tensor -> [M, 2]
+        """
+        num_anchors = anchors.shape[0]
+        num_gt = gt_bboxes.shape[0]
+
+        anchors_expand = anchors.unsqueeze(0).repeat(num_gt, 1, 1)           # [N, M, 2]
+        gt_bboxes_expand = gt_bboxes.unsqueeze(1).repeat(1, num_anchors, 1)  # [N, M, 4]
+
+        # offset
+        lt = anchors_expand - gt_bboxes_expand[..., :2]
+        rb = gt_bboxes_expand[..., 2:] - anchors_expand
+        bbox_deltas = torch.cat([lt, rb], dim=-1)
+
+        is_in_gts = bbox_deltas.min(dim=-1).values > 0
+
+        return is_in_gts
+    
+    def dynamic_k_matching(self, cost_matrix, pairwise_ious, num_gt):
+        """Use IoU and matching cost to calculate the dynamic top-k positive
+        targets.
+
+        Args:
+            cost_matrix (Tensor): Cost matrix.
+            pairwise_ious (Tensor): Pairwise iou matrix.
+            num_gt (int): Number of gt.
+            valid_mask (Tensor): Mask for valid bboxes.
+        Returns:
+            tuple: matched ious and gt indexes.
+        """
+        matching_matrix = torch.zeros_like(cost_matrix, dtype=torch.uint8)
+        # select candidate topk ious for dynamic-k calculation
+        candidate_topk = min(self.topk_candidates, pairwise_ious.size(1))
+        topk_ious, _ = torch.topk(pairwise_ious, candidate_topk, dim=1)
+        # calculate dynamic k for each gt
+        dynamic_ks = torch.clamp(topk_ious.sum(1).int(), min=1)
+
+        # sorting the batch cost matrix is faster than topk
+        _, sorted_indices = torch.sort(cost_matrix, dim=1)
+        for gt_idx in range(num_gt):
+            topk_ids = sorted_indices[gt_idx, :dynamic_ks[gt_idx]]
+            matching_matrix[gt_idx, :][topk_ids] = 1
+
+        del topk_ious, dynamic_ks, topk_ids
+
+        prior_match_gt_mask = matching_matrix.sum(0) > 1
+        if prior_match_gt_mask.sum() > 0:
+            cost_min, cost_argmin = torch.min(
+                cost_matrix[:, prior_match_gt_mask], dim=0)
+            matching_matrix[:, prior_match_gt_mask] *= 0
+            matching_matrix[cost_argmin, prior_match_gt_mask] = 1
+
+        # get foreground mask inside box and center prior
+        fg_mask_inboxes = matching_matrix.sum(0) > 0
+        matched_pred_ious = (matching_matrix *
+                             pairwise_ious).sum(0)[fg_mask_inboxes]
+        matched_gt_inds = matching_matrix[:, fg_mask_inboxes].argmax(0)
+
+        return matched_pred_ious, matched_gt_inds, fg_mask_inboxes
+        
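A small sketch with toy numbers (not from the diff) of the dynamic-k rule in `dynamic_k_matching` above: each ground truth receives `k = clamp(int(sum of its top candidate IoUs), min=1)` positive anchors, chosen as its `k` lowest-cost candidates.

```python
import torch

pairwise_ious = torch.tensor([[0.6, 0.5, 0.1, 0.0],    # gt 0 vs. 4 anchors
                              [0.2, 0.1, 0.0, 0.0]])   # gt 1 vs. 4 anchors
candidate_topk = 3
topk_ious, _ = torch.topk(pairwise_ious, candidate_topk, dim=1)
dynamic_ks = torch.clamp(topk_ious.sum(1).int(), min=1)
print(dynamic_ks)   # tensor([1, 1], dtype=torch.int32) -> one positive anchor per gt
```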

+ 55 - 0
odlab/models/detectors/retinanet/README.md

@@ -0,0 +1,55 @@
+# RetinaNet
+
+Our `RetinaNet-R50-1x` baseline on COCO-val:
+```Shell
+
+```
+
+- ImageNet-1K_V1 pretrained
+
+| Model             |  scale     |  FPS  | AP<sup>val<br>0.5:0.95 | AP<sup>val<br>0.5 | Weight | Logs  |
+| ------------------| ---------- | ----- | ---------------------- |  ---------------  | ------ | ----- |
+| RetinaNet_R18_1x  |  800,1333  |       |          30.5          |        48.1       | [ckpt](https://github.com/yjh0410/ODLab/releases/download/detection_weights/retinanet_r18_1x_coco.pth) | [log](https://github.com/yjh0410/ODLab/releases/download/detection_weights/RetinaNet-R18-1x.txt) |
+| RetinaNet_R50_1x  |  800,1333  |       |                        |                   |  |  |
+
+
+## Train RetinaNet
+### Single GPU
+For example, to train **RetinaNet_R18_1x** on COCO:
+```Shell
+python main.py --cuda -d coco --root path/to/coco -m retinanet_r18_1x --batch_size 16 --eval_epoch 2
+```
+
+### Multi GPU
+For example, to train **RetinaNet_R18_1x** on COCO with 8 GPUs:
+```Shell
+python -m torch.distributed.run --nproc_per_node=8 main.py --cuda -dist -d coco --root path/to/coco -m retinanet_r18_1x --batch_size 16 --eval_epoch 2
+```
+
+## Test RetinaNet
+For example, to test **RetinaNet_R18_1x** on COCO-val:
+```Shell
+python test.py --cuda -d coco --root path/to/coco -m retinanet_r18_1x --weight path/to/retinanet_r18_1x.pth -vt 0.4 --show 
+```
+
+## Evaluate RetinaNet
+For example, to evaluate **RetinaNet_R18_1x** on COCO-val:
+```Shell
+python main.py --cuda -d coco --root path/to/coco -m retinanet_r18_1x --resume path/to/retinanet_r18_1x.pth --eval_first
+```
+
+## Demo
+### Detect with Image
+```Shell
+python demo.py --mode image --path_to_img path/to/image_dirs/ --cuda -m retinanet_r18_1x --weight path/to/weight -vt 0.4 --show
+```
+
+### Detect with Video
+```Shell
+python demo.py --mode video --path_to_vid path/to/video --cuda -m retinanet_r18_1x --weight path/to/weight -vt 0.4 --show --gif
+```
+
+### Detect with Camera
+```Shell
+python demo.py --mode camera --cuda -m retinanet_r18_1x --weight path/to/weight -vt 0.4 --show --gif
+```

+ 24 - 0
odlab/models/detectors/retinanet/build.py

@@ -0,0 +1,24 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+
+from .criterion import build_criterion
+from .retinanet import RetinaNet
+
+
+# build RetinaNet
+def build_retinanet(cfg, num_classes=80, is_val=False):
+    # -------------- Build RetinaNet --------------
+    model = RetinaNet(cfg         = cfg,
+                      num_classes = num_classes,
+                      conf_thresh = cfg['train_conf_thresh'] if is_val else cfg['test_conf_thresh'],
+                      nms_thresh  = cfg['train_nms_thresh']  if is_val else cfg['test_nms_thresh'],
+                      topk        = cfg['train_topk']        if is_val else cfg['test_topk'],
+                      ca_nms      = False if is_val else cfg['nms_class_agnostic'])
+            
+    # -------------- Build Criterion --------------
+    criterion = None
+    if is_val:
+        # build criterion for training
+        criterion = build_criterion(cfg, num_classes)
+
+    return model, criterion

+ 136 - 0
odlab/models/detectors/retinanet/criterion.py

@@ -0,0 +1,136 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from utils.box_ops import box_cxcywh_to_xyxy, generalized_box_iou
+from utils.misc import sigmoid_focal_loss
+from utils.distributed_utils import get_world_size, is_dist_avail_and_initialized
+
+from .matcher import RetinaNetMatcher
+
+
+class Criterion(nn.Module):
+    def __init__(self, cfg, num_classes=80):
+        super().__init__()
+        # ------------- Basic parameters -------------
+        self.cfg = cfg
+        self.num_classes = num_classes
+        # ------------- Focal loss -------------
+        self.alpha = cfg['focal_loss_alpha']
+        self.gamma = cfg['focal_loss_gamma']
+        # ------------- Loss weight -------------
+        self.weight_dict = {'loss_cls': cfg['loss_cls_weight'],
+                            'loss_reg': cfg['loss_reg_weight']}
+        # ------------- Matcher -------------
+        self.matcher_cfg = cfg['matcher_hpy']
+        self.matcher = RetinaNetMatcher(num_classes,
+                                        iou_threshold=self.matcher_cfg['iou_thresh'],
+                                        iou_labels=self.matcher_cfg['iou_labels'],
+                                        allow_low_quality_matches=self.matcher_cfg['allow_low_quality_matches']
+                                        )
+
+    def loss_labels(self, pred_cls, tgt_cls, num_boxes):
+        """
+            pred_cls: (Tensor) [N, C]
+            tgt_cls:  (Tensor) [N, C]
+        """
+        # cls loss: [V, C]
+        loss_cls = sigmoid_focal_loss(pred_cls, tgt_cls, self.alpha, self.gamma)
+
+        return loss_cls.sum() / num_boxes
+
+    def loss_bboxes(self, pred_reg=None, pred_box=None, tgt_box=None, anchors=None, num_boxes=1, use_giou=False):
+        """
+            pred_reg: (Tensor) [Nq, 4]
+            tgt_box:  (Tensor) [Nq, 4]
+            anchors:  (Tensor) [Nq, 4]
+        """
+        # GIoU loss
+        if use_giou:
+            pred_giou = generalized_box_iou(pred_box, tgt_box)  # [N, M]
+            loss_reg = 1. - torch.diag(pred_giou)
+        
+        # L1 loss
+        else:
+            # xyxy -> cxcy&bwbh
+            tgt_cxcy = (tgt_box[..., :2] + tgt_box[..., 2:]) * 0.5
+            tgt_bwbh = tgt_box[..., 2:] - tgt_box[..., :2]
+            # encode gt box
+            tgt_offsets = (tgt_cxcy - anchors[..., :2]) / anchors[..., 2:]
+            tgt_sizes = torch.log(tgt_bwbh / anchors[..., 2:])
+            tgt_box_encode = torch.cat([tgt_offsets, tgt_sizes], dim=-1)
+            # compute l1 loss
+            loss_reg = F.l1_loss(pred_reg, tgt_box_encode, reduction='none')
+
+        return loss_reg.sum() / num_boxes
+
+    def forward(self, outputs, targets):
+        """
+            outputs['pred_cls']: (Tensor) [B, M, C]
+            outputs['pred_reg']: (Tensor) [B, M, 4]
+            outputs['strides']: (List) [8, 16, 32, ...] stride of the model output
+            targets: (List) [dict{'boxes': [...], 
+                                 'labels': [...], 
+                                 'orig_size': ...}, ...]
+            anchors: (Tensor) [M, 4]
+        """
+        # -------------------- Pre-process --------------------
+        cls_preds = torch.cat(outputs['pred_cls'], dim=1).view(-1, self.num_classes)
+        reg_preds = torch.cat(outputs['pred_reg'], dim=1).view(-1, 4)
+        box_preds = torch.cat(outputs['pred_box'], dim=1).view(-1, 4)
+        masks = ~torch.cat(outputs['mask'], dim=1).view(-1)
+        B = len(targets)
+       
+        # process anchor boxes
+        anchor_boxes = torch.cat(outputs['anchors'])
+        anchor_boxes = anchor_boxes[None].repeat(B, 1, 1)
+        anchor_boxes_xyxy = box_cxcywh_to_xyxy(anchor_boxes)
+
+        # -------------------- Label Assignment --------------------
+        tgt_classes, tgt_boxes = self.matcher(anchor_boxes_xyxy, targets)
+        tgt_classes = tgt_classes.flatten()
+        tgt_boxes = tgt_boxes.view(-1, 4)
+        del anchor_boxes_xyxy
+
+        foreground_idxs = (tgt_classes >= 0) & (tgt_classes != self.num_classes)
+        valid_idxs = (tgt_classes >= 0) & masks
+        num_foreground = foreground_idxs.sum()
+        if is_dist_avail_and_initialized():
+            torch.distributed.all_reduce(num_foreground)
+        num_foreground = torch.clamp(num_foreground / get_world_size(), min=1).item()
+
+        # -------------------- Classification loss --------------------
+        gt_cls_target = torch.zeros_like(cls_preds)
+        gt_cls_target[foreground_idxs, tgt_classes[foreground_idxs]] = 1
+        loss_labels = self.loss_labels(
+            cls_preds[valid_idxs], gt_cls_target[valid_idxs], num_foreground)
+
+        # -------------------- Regression loss --------------------
+        if self.cfg['use_giou_loss']:
+            box_preds_pos = box_preds[foreground_idxs]
+            tgt_boxes_pos = tgt_boxes[foreground_idxs].to(reg_preds.device)
+            loss_bboxes = self.loss_bboxes(
+                pred_box=box_preds_pos, tgt_box=tgt_boxes_pos, num_boxes=num_foreground, use_giou=self.cfg['use_giou_loss'])
+        else:
+            reg_preds_pos = reg_preds[foreground_idxs]
+            tgt_boxes_pos = tgt_boxes[foreground_idxs].to(reg_preds.device)
+            anchors_pos = anchor_boxes.view(-1, 4)[foreground_idxs]
+            loss_bboxes = self.loss_bboxes(
+                pred_reg=reg_preds_pos, tgt_box=tgt_boxes_pos, anchors=anchors_pos, num_boxes=num_foreground, use_giou=self.cfg['use_giou_loss'])
+
+        loss_dict = dict(
+                loss_cls = loss_labels,
+                loss_reg = loss_bboxes,
+        )
+
+        return loss_dict
+
+    
+# build criterion
+def build_criterion(cfg, num_classes=80):
+    criterion = Criterion(cfg=cfg, num_classes=num_classes)
+    return criterion
+
+
+if __name__ == "__main__":
+    pass
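A self-contained toy example (not from the diff) of the anchor-relative box encoding used in the L1 branch of `loss_bboxes` above: the xyxy target is converted to a center offset normalized by the anchor size plus a log size ratio before the L1 loss is applied.

```python
import torch

anchors = torch.tensor([[50., 50., 20., 20.]])           # (cx, cy, w, h)
tgt_box = torch.tensor([[45., 45., 75., 65.]])           # (x1, y1, x2, y2)

tgt_cxcy = (tgt_box[..., :2] + tgt_box[..., 2:]) * 0.5   # box center: (60, 55)
tgt_bwbh = tgt_box[..., 2:] - tgt_box[..., :2]           # box size:   (30, 20)
tgt_offsets = (tgt_cxcy - anchors[..., :2]) / anchors[..., 2:]
tgt_sizes = torch.log(tgt_bwbh / anchors[..., 2:])
print(torch.cat([tgt_offsets, tgt_sizes], dim=-1))
# tensor([[0.5000, 0.2500, 0.4055, 0.0000]])
```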

+ 181 - 0
odlab/models/detectors/retinanet/matcher.py

@@ -0,0 +1,181 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Modified by BaseDetection, Inc. and its affiliates.
+import torch
+from utils.box_ops import box_iou
+
+
+class RetinaNetMatcher(object):
+    """
+    This class assigns to each predicted "element" (e.g., a box) a ground-truth
+    element. Each predicted element will have exactly zero or one matches; each
+    ground-truth element may be matched to zero or more predicted elements.
+
+    The matching is determined by the MxN match_quality_matrix, that characterizes
+    how well each (ground-truth, prediction)-pair match each other. For example,
+    if the elements are boxes, this matrix may contain box intersection-over-union
+    overlap values.
+
+    The matcher returns (a) a vector of length N containing the index of the
+    ground-truth element m in [0, M) that matches to prediction n in [0, N).
+    (b) a vector of length N containing the labels for each prediction.
+    """
+
+    def __init__(self,
+                 num_classes, 
+                 iou_threshold, 
+                 iou_labels, 
+                 allow_low_quality_matches=False):
+        """
+        Args:
+            iou_threshold (list): a list of IoU thresholds used to stratify predictions
+                into levels.
+            iou_labels (list): a list of values to label predictions belonging to
+                each level. A label can be one of {-1, 0, 1} signifying
+                {ignore, negative class, positive class}, respectively.
+            allow_low_quality_matches (bool): if True, produce additional matches
+                for predictions with maximum match quality lower than high_threshold.
+                See set_low_quality_matches_ for more details.
+
+            For example,
+                iou_threshold = [0.3, 0.5]
+                iou_labels = [0, -1, 1]
+                All predictions with iou < 0.3 will be marked with 0 and
+                thus will be considered as false positives while training.
+                All predictions with 0.3 <= iou < 0.5 will be marked with -1 and
+                thus will be ignored.
+                All predictions with 0.5 <= iou will be marked with 1 and
+                thus will be considered as true positives.
+        """
+        self.num_classes = num_classes
+        # Add -inf and +inf to the first and last positions of iou_threshold
+        iou_threshold = iou_threshold[:]
+        assert iou_threshold[0] > 0
+        iou_threshold.insert(0, -float("inf"))
+        iou_threshold.append(float("inf"))
+        assert all(low <= high for (low, high) in zip(iou_threshold[:-1], iou_threshold[1:]))
+        assert all(label in [-1, 0, 1] for label in iou_labels)
+        assert len(iou_labels) == len(iou_threshold) - 1
+        self.iou_threshold = iou_threshold
+        self.iou_labels = iou_labels
+        self.allow_low_quality_matches = allow_low_quality_matches
+
+    @torch.no_grad()
+    def __call__(self, anchors, targets):
+        """
+            anchors: (Tensor) [B, M, 4] (x1, y1, x2, y2)
+            targets: (Dict) dict{'boxes': [...], 
+                                 'labels': [...], 
+                                 'orig_size': ...}
+        """
+        # list[Tensor(R, 4)], one for each image
+        gt_classes = []
+        gt_boxes = []
+        device = anchors.device
+
+        for anchors_per_image, targets_per_image in zip(anchors, targets):
+            # [N,]
+            tgt_labels = targets_per_image['labels'].to(device)
+            # [N, 4]
+            tgt_boxes = targets_per_image['boxes'].to(device)
+            # [N, M], N is the number of targets, M is the number of anchors
+            match_quality_matrix, _ = box_iou(tgt_boxes, anchors_per_image)
+            gt_matched_idxs, anchor_labels = self.matching(match_quality_matrix)
+            has_gt = len(tgt_labels) > 0
+            if has_gt:
+                # ground truth box regression
+                matched_gt_boxes = tgt_boxes[gt_matched_idxs]
+
+                gt_classes_i = tgt_labels[gt_matched_idxs]
+                # Anchors with label 0 are treated as background.
+                gt_classes_i[anchor_labels == 0] = self.num_classes
+                # Anchors with label -1 are ignored.
+                gt_classes_i[anchor_labels == -1] = -1
+            else:
+                gt_classes_i = torch.zeros_like(gt_matched_idxs) + self.num_classes
+                matched_gt_boxes = torch.zeros_like(anchors_per_image)
+
+            gt_classes.append(gt_classes_i)
+            gt_boxes.append(matched_gt_boxes)
+
+        return torch.stack(gt_classes), torch.stack(gt_boxes)
+
+    def matching(self, match_quality_matrix):
+        """
+        Args:
+            match_quality_matrix (Tensor[float]): an N x M tensor, containing the
+                pairwise quality between N ground-truth elements and M predicted
+                elements. All elements must be >= 0 (due to the use of `torch.nonzero`
+                for selecting indices in :meth:`set_low_quality_matches_`).
+
+        Returns:
+            matches (Tensor[int64]): a vector of length M, where matches[i] is a matched
+                ground-truth index in [0, N)
+            match_labels (Tensor[int8]): a vector of length M, where pred_labels[i] indicates
+                whether a prediction is a true or false positive or ignored
+        """
+        assert match_quality_matrix.dim() == 2
+        if match_quality_matrix.numel() == 0:
+            default_matches = match_quality_matrix.new_full(
+                (match_quality_matrix.size(1),), 0, dtype=torch.int64
+            )
+            # When no gt boxes exist, we define IoU = 0 and therefore set labels
+            # to `self.iou_labels[0]`, which usually defaults to the background class 0.
+            # To ignore these predictions instead, set iou_labels=[-1, 0, -1, 1] with
+            # the corresponding thresholds.
+            default_match_labels = match_quality_matrix.new_full(
+                (match_quality_matrix.size(1),), self.iou_labels[0], dtype=torch.int8
+            )
+            return default_matches, default_match_labels
+
+        assert torch.all(match_quality_matrix >= 0)
+
+        # match_quality_matrix is N (gt) x M (predicted)
+        # Max over gt elements (dim 0) to find best gt candidate for each prediction
+        matched_vals, matches = match_quality_matrix.max(dim=0)
+
+        match_labels = matches.new_full(matches.size(), 1, dtype=torch.int8)
+
+        for (l, low, high) in zip(self.iou_labels, self.iou_threshold[:-1], self.iou_threshold[1:]):
+            low_high = (matched_vals >= low) & (matched_vals < high)
+            match_labels[low_high] = l
+
+        if self.allow_low_quality_matches:
+            self.set_low_quality_matches_(match_labels, match_quality_matrix)
+
+        return matches, match_labels
+
+    def set_low_quality_matches_(self, match_labels, match_quality_matrix):
+        """
+        Produce additional matches for predictions that have only low-quality matches.
+        Specifically, for each ground-truth G find the set of predictions that have
+        maximum overlap with it (including ties); for each prediction in that set, if
+        it is unmatched, then match it to the ground-truth G.
+
+        This function implements the RPN assignment case (i) in Sec. 3.1.2 of the
+        Faster R-CNN paper: https://arxiv.org/pdf/1506.01497v3.pdf.
+        """
+        # For each gt, find the prediction with which it has highest quality
+        highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1)
+        # Find the highest quality match available, even if it is low, including ties.
+        # Note that the match qualities must be positive due to the use of
+        # `torch.nonzero`.
+        gt_pred_pairs_of_highest_quality = torch.nonzero(
+            match_quality_matrix == highest_quality_foreach_gt[:, None],
+            as_tuple=False
+        )
+        # Example gt_pred_pairs_of_highest_quality:
+        #   tensor([[    0, 39796],
+        #           [    1, 32055],
+        #           [    1, 32070],
+        #           [    2, 39190],
+        #           [    2, 40255],
+        #           [    3, 40390],
+        #           [    3, 41455],
+        #           [    4, 45470],
+        #           [    5, 45325],
+        #           [    5, 46390]])
+        # Each row is a (gt index, prediction index)
+        # Note how gt items 1, 2, 3, and 5 each have two ties
+
+        pred_inds_to_update = gt_pred_pairs_of_highest_quality[:, 1]
+        match_labels[pred_inds_to_update] = 1
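
A minimal sketch of the IoU-bucketing rule documented above, using the example thresholds `[0.3, 0.5]` and labels `[0, -1, 1]`; the IoU values are made up for illustration and are not taken from the repository:

```python
import torch

# Hypothetical best-gt IoU for 5 anchors.
matched_vals = torch.tensor([0.10, 0.35, 0.49, 0.55, 0.80])

iou_threshold = [-float("inf"), 0.3, 0.5, float("inf")]
iou_labels    = [0, -1, 1]   # background, ignore, foreground

match_labels = torch.full_like(matched_vals, 1, dtype=torch.int8)
for l, low, high in zip(iou_labels, iou_threshold[:-1], iou_threshold[1:]):
    match_labels[(matched_vals >= low) & (matched_vals < high)] = l

print(match_labels)  # tensor([ 0, -1, -1,  1,  1], dtype=torch.int8)
```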

+ 123 - 0
odlab/models/detectors/retinanet/retinanet.py

@@ -0,0 +1,123 @@
+import numpy as np
+import math
+import torch
+import torch.nn as nn
+
+# --------------- Model components ---------------
+from ...backbone import build_backbone
+from ...neck import build_neck
+from ...head import build_head
+
+# --------------- External components ---------------
+from utils.misc import multiclass_nms
+
+
+# ------------------------ RetinaNet ------------------------
+class RetinaNet(nn.Module):
+    def __init__(self, 
+                 cfg,
+                 num_classes :int   = 80, 
+                 conf_thresh :float = 0.05,
+                 nms_thresh  :float = 0.6,
+                 topk        :int   = 1000,
+                 trainable   :bool  = False,
+                 ca_nms      :bool  = False):
+        super(RetinaNet, self).__init__()
+        # ---------------------- Basic Parameters ----------------------
+        self.cfg = cfg
+        self.trainable = trainable
+        self.topk = topk
+        self.num_classes = num_classes
+        self.conf_thresh = conf_thresh
+        self.nms_thresh = nms_thresh
+        self.ca_nms = ca_nms
+
+        # ---------------------- Network Parameters ----------------------
+        ## Backbone
+        self.backbone, feat_dims = build_backbone(cfg, trainable and cfg['pretrained'])
+
+        ## Neck
+        self.fpn = build_neck(cfg, feat_dims, cfg['head_dim'])
+        
+        ## Heads
+        self.head = build_head(cfg, cfg['head_dim'], cfg['head_dim'], num_classes)
+
+    def post_process(self, cls_preds, box_preds):
+        """
+        Input:
+            cls_preds: List[Tensor] [[B, H x W x KA, C], ...]
+            box_preds: List[Tensor] [[B, H x W x KA, 4], ...]
+        """
+        all_scores = []
+        all_labels = []
+        all_bboxes = []
+        
+        for cls_pred_i, box_pred_i in zip(cls_preds, box_preds):
+            cls_pred_i = cls_pred_i[0]
+            box_pred_i = box_pred_i[0]
+            
+            # (H x W x KA x C,)
+            scores_i = cls_pred_i.sigmoid().flatten()
+
+            # Keep top k top scoring indices only.
+            num_topk = min(self.topk, box_pred_i.size(0))
+
+            # torch.sort is actually faster than .topk (at least on GPUs)
+            predicted_prob, topk_idxs = scores_i.sort(descending=True)
+            topk_scores = predicted_prob[:num_topk]
+            topk_idxs = topk_idxs[:num_topk]
+
+            # filter out the proposals with low confidence score
+            keep_idxs = topk_scores > self.conf_thresh
+            topk_idxs = topk_idxs[keep_idxs]
+
+            # final scores
+            scores = topk_scores[keep_idxs]
+            # final labels
+            labels = topk_idxs % self.num_classes
+            # final bboxes
+            anchor_idxs = torch.div(topk_idxs, self.num_classes, rounding_mode='floor')
+            bboxes = box_pred_i[anchor_idxs]
+
+            all_scores.append(scores)
+            all_labels.append(labels)
+            all_bboxes.append(bboxes)
+
+        scores = torch.cat(all_scores)
+        labels = torch.cat(all_labels)
+        bboxes = torch.cat(all_bboxes)
+
+        # to cpu & numpy
+        scores = scores.cpu().numpy()
+        labels = labels.cpu().numpy()
+        bboxes = bboxes.cpu().numpy()
+
+        # nms
+        scores, labels, bboxes = multiclass_nms(
+            scores, labels, bboxes, self.nms_thresh, self.num_classes, self.ca_nms)
+
+        return bboxes, scores, labels
+
+    def forward(self, src, src_mask=None, targets=None):
+        # ---------------- Backbone ----------------
+        pyramid_feats = self.backbone(src)
+
+        # ---------------- Neck ----------------
+        pyramid_feats = self.fpn(pyramid_feats)
+
+        # ---------------- Heads ----------------
+        outputs = self.head(pyramid_feats, src_mask)
+
+        if not self.training:
+            # ---------------- PostProcess ----------------
+            cls_pred = outputs["pred_cls"]
+            box_pred = outputs["pred_box"]
+            bboxes, scores, labels = self.post_process(cls_pred, box_pred)
+            # normalize bbox
+            bboxes[..., 0::2] /= src.shape[-1]
+            bboxes[..., 1::2] /= src.shape[-2]
+            bboxes = bboxes.clip(0., 1.)
+
+            return bboxes, scores, labels
+
+        return outputs 
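
The post-processing above relies on a flattening trick: per-anchor class scores of shape `[M, C]` are flattened to a single vector, sorted once, and the class / anchor indices are recovered with modulo and integer division. A small sketch with made-up shapes (not the model's real outputs):

```python
import torch

num_anchors, num_classes, topk, conf_thresh = 6, 3, 4, 0.3

cls_pred = torch.randn(num_anchors, num_classes)      # hypothetical logits
scores_flat = cls_pred.sigmoid().flatten()            # [M * C]

predicted_prob, topk_idxs = scores_flat.sort(descending=True)
topk_scores = predicted_prob[:topk]
topk_idxs   = topk_idxs[:topk]

keep = topk_scores > conf_thresh
topk_idxs = topk_idxs[keep]

labels      = topk_idxs % num_classes                                   # class index
anchor_idxs = torch.div(topk_idxs, num_classes, rounding_mode='floor')  # anchor index
print(labels, anchor_idxs)
```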

+ 67 - 0
odlab/models/detectors/yolof/README.md

@@ -0,0 +1,67 @@
+# YOLOF: You Only Look One-level Feature
+
+Our `YOLOF-R50-1x` baseline on COCO-val:
+```Shell
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.380
+ Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.577
+ Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.405
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.199
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.425
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.523
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.315
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.513
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.555
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.333
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.628
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.736
+```
+
+- ImageNet-1K_V1 pretrained
+
+| Model            |  scale     |  FPS  | AP<sup>val<br>0.5:0.95 | AP<sup>val<br>0.5 | Weight | Logs  |
+| ---------------- | ---------- | ----- | ---------------------- |  ---------------  | ------ | ----- |
+| YOLOF_R18_C5_1x  |  800,1333  |       |          32.8          |       51.2        | [ckpt](https://github.com/yjh0410/ODLab/releases/download/detection_weights/yolof_r18_c5_1x_coco.pth) | [log](https://github.com/yjh0410/ODLab/releases/download/detection_weights/YOLOF-R18-C5-1x.txt) |
+| YOLOF_R50_C5_1x  |  800,1333  |       |          38.0          |       57.7        | [ckpt](https://github.com/yjh0410/ODLab/releases/download/detection_weights/yolof_r50_c5_1x_coco.pth) | [log](https://github.com/yjh0410/ODLab/releases/download/detection_weights/YOLOF-R50-C5-1x.txt) |
+| YOLOF_R50_DC5_1x |  800,1333  |       |          39.5          |       58.5        | [ckpt](https://github.com/yjh0410/ODLab/releases/download/detection_weights/yolof_r50_dc5_1x_coco.pth) | [log](https://github.com/yjh0410/ODLab/releases/download/detection_weights/YOLOF-R50-DC5-1x.txt) |
+
+
+## Train YOLOF
+### Single GPU
+For example, to train **YOLOF_R18_C5_1x** on COCO with a single GPU:
+```Shell
+python main.py --cuda -d coco --root path/to/coco -m yolof_r18_c5_1x --batch_size 16 --eval_epoch 2
+```
+
+### Multi GPU
+For example, to train **YOLOF_R18_C5_1x** on COCO with 8 GPUs:
+```Shell
+python -m torch.distributed.run --nproc_per_node=8 train.py --cuda -dist -d coco --root path/to/coco -m yolof_r18_c5_1x --batch_size 16 --eval_epoch 2 
+```
+
+## Test YOLOF
+For example, to test **YOLOF_R18_C5_1x** on COCO-val:
+```Shell
+python test.py --cuda -d coco --root path/to/coco -m yolof_r18_c5_1x --weight path/to/yolof_r18_c5_1x.pth -vt 0.4 --show 
+```
+
+## Evaluate YOLOF
+For example, to evaluate **YOLOF_R18_C5_1x** on COCO-val:
+```Shell
+python main.py --cuda -d coco --root path/to/coco -m yolof_r18_c5_1x --resume path/to/yolof_r18_c5_1x.pth --eval_first
+```
+
+## Demo
+### Detect with Image
+```Shell
+python demo.py --mode image --path_to_img path/to/image_dirs/ --cuda -m yolof_r18_c5_1x --weight path/to/weight -vt 0.4 --show
+```
+
+### Detect with Video
+```Shell
+python demo.py --mode video --path_to_vid path/to/video --cuda -m yolof_r18_c5_1x --weight path/to/weight -vt 0.4 --show --gif
+```
+
+### Detect with Camera
+```Shell
+python demo.py --mode camera --cuda -m yolof_r18_c5_1x --weight path/to/weight -vt 0.4 --show --gif
+```

+ 24 - 0
odlab/models/detectors/yolof/build.py

@@ -0,0 +1,24 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+
+from .criterion import build_criterion
+from .yolof import YOLOF
+
+
+# build YOLOF
+def build_yolof(cfg, num_classes=80, is_val=False):
+    # -------------- Build YOLOF --------------
+    model = YOLOF(cfg         = cfg,
+                  num_classes = num_classes,
+                  conf_thresh = cfg['train_conf_thresh'] if is_val else cfg['test_conf_thresh'],
+                  nms_thresh  = cfg['train_nms_thresh']  if is_val else cfg['test_nms_thresh'],
+                  topk        = cfg['train_topk']        if is_val else cfg['test_topk'],
+                  )
+            
+    # -------------- Build Criterion --------------
+    criterion = None
+    if is_val:
+        # build criterion for training
+        criterion = build_criterion(cfg, num_classes)
+
+    return model, criterion

+ 151 - 0
odlab/models/detectors/yolof/criterion.py

@@ -0,0 +1,151 @@
+# ---------------------------------------------------------------------
+# Copyright (c) Megvii Inc. All rights reserved.
+# ---------------------------------------------------------------------
+
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from utils.box_ops import *
+from utils.misc import sigmoid_focal_loss
+from utils.distributed_utils import get_world_size, is_dist_avail_and_initialized
+
+from .matcher import UniformMatcher
+
+
+class Criterion(nn.Module):
+    """
+        This code is adapted from https://github.com/megvii-model/YOLOF/blob/main/playground/detection/coco/yolof/yolof_base/yolof.py
+    """
+    def __init__(self, cfg, num_classes=80):
+        super().__init__()
+        # ------------- Basic parameters -------------
+        self.cfg = cfg
+        self.num_classes = num_classes
+        # ------------- Focal loss -------------
+        self.alpha = cfg['focal_loss_alpha']
+        self.gamma = cfg['focal_loss_gamma']
+        # ------------- Loss weight -------------
+        self.weight_dict = {'loss_cls': cfg['loss_cls_weight'],
+                            'loss_reg': cfg['loss_reg_weight']}
+        # ------------- Matcher -------------
+        self.matcher_cfg = cfg['matcher_hpy']
+        self.matcher = UniformMatcher(self.matcher_cfg['topk_candidates'])
+
+    def loss_labels(self, pred_cls, tgt_cls, num_boxes):
+        """
+            pred_cls: (Tensor) [N, C]
+            tgt_cls:  (Tensor) [N, C]
+        """
+        # cls loss: [V, C]
+        loss_cls = sigmoid_focal_loss(pred_cls, tgt_cls, self.alpha, self.gamma)
+
+        return loss_cls.sum() / num_boxes
+
+    def loss_bboxes(self, pred_box, tgt_box, num_boxes):
+        """
+            pred_box: (Tensor) [N, 4]
+            tgt_box:  (Tensor) [N, 4]
+        """
+        # giou
+        pred_giou = generalized_box_iou(pred_box, tgt_box)  # [N, M]
+        # giou loss
+        loss_reg = 1. - torch.diag(pred_giou)
+
+        return loss_reg.sum() / num_boxes
+
+    def forward(self, outputs, targets):
+        """
+            outputs['pred_cls']: (Tensor) [B, M, C]
+            outputs['pred_box']: (Tensor) [B, M, 4]
+            targets: (List) [dict{'boxes': [...], 
+                                 'labels': [...], 
+                                 'orig_size': ...}, ...]
+        """
+        # -------------------- Pre-process --------------------
+        pred_box = outputs['pred_box']
+        pred_cls = outputs['pred_cls'].reshape(-1, self.num_classes)
+        anchor_boxes = outputs['anchors']
+        masks = ~outputs['mask']
+        device = pred_box.device
+        B = len(targets)
+
+        # -------------------- Label assignment --------------------
+        indices = self.matcher(pred_box, anchor_boxes, targets)
+
+        # [M, 4] -> [1, M, 4] -> [B, M, 4]
+        anchor_boxes = box_cxcywh_to_xyxy(anchor_boxes)
+        anchor_boxes = anchor_boxes[None].repeat(B, 1, 1)
+
+        ious = []
+        pos_ious = []
+        for i in range(B):
+            src_idx, tgt_idx = indices[i]
+            # iou between predbox and tgt box
+            iou, _ = box_iou(pred_box[i, ...], (targets[i]['boxes']).clone())
+            if iou.numel() == 0:
+                max_iou = iou.new_full((iou.size(0),), 0)
+            else:
+                max_iou = iou.max(dim=1)[0]
+            # iou between anchorbox and tgt box
+            a_iou, _ = box_iou(anchor_boxes[i], (targets[i]['boxes']).clone())
+            if a_iou.numel() == 0:
+                pos_iou = a_iou.new_full((0,), 0)
+            else:
+                pos_iou = a_iou[src_idx, tgt_idx]
+            ious.append(max_iou)
+            pos_ious.append(pos_iou)
+
+        ious = torch.cat(ious)
+        ignore_idx = ious > self.matcher_cfg['ignore_thresh']
+        pos_ious = torch.cat(pos_ious)
+        pos_ignore_idx = pos_ious < self.matcher_cfg['iou_thresh']
+
+        src_idx = torch.cat(
+            [src + idx * anchor_boxes[0].shape[0] for idx, (src, _) in
+             enumerate(indices)])
+        # [BM,]
+        gt_cls = torch.full(pred_cls.shape[:1],
+                                self.num_classes,
+                                dtype=torch.int64,
+                                device=device)
+        gt_cls[ignore_idx] = -1
+        tgt_cls_o = torch.cat([t['labels'][J] for t, (_, J) in zip(targets, indices)])
+        tgt_cls_o[pos_ignore_idx] = -1
+
+        gt_cls[src_idx] = tgt_cls_o.to(device)
+
+        foreground_idxs = (gt_cls >= 0) & (gt_cls != self.num_classes)
+        num_foreground = foreground_idxs.sum()
+
+        if is_dist_avail_and_initialized():
+            torch.distributed.all_reduce(num_foreground)
+        num_foreground = torch.clamp(num_foreground / get_world_size(), min=1).item()
+
+        # -------------------- Classification loss --------------------
+        gt_cls_target = torch.zeros_like(pred_cls)
+        gt_cls_target[foreground_idxs, gt_cls[foreground_idxs]] = 1
+        valid_idxs = (gt_cls >= 0) & masks
+        loss_labels = self.loss_labels(pred_cls[valid_idxs], gt_cls_target[valid_idxs], num_foreground)
+
+        # -------------------- Regression loss --------------------
+        tgt_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0).to(device)
+        tgt_boxes = tgt_boxes[~pos_ignore_idx]
+        matched_pred_box = pred_box.reshape(-1, 4)[src_idx[~pos_ignore_idx.cpu()]]
+        loss_bboxes = self.loss_bboxes(matched_pred_box, tgt_boxes, num_foreground)
+
+        loss_dict = dict(
+                loss_cls = loss_labels,
+                loss_reg = loss_bboxes,
+        )
+
+        return loss_dict
+
+
+def build_criterion(cfg, num_classes=80):
+    criterion = Criterion(cfg=cfg, num_classes=num_classes)
+    return criterion
+
+    
+if __name__ == "__main__":
+    pass
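
The classification term above is a sigmoid focal loss normalized by the number of foreground anchors. A self-contained reference implementation of that loss, written from scratch here because `utils.misc.sigmoid_focal_loss` lies outside this diff and may differ in details:

```python
import torch
import torch.nn.functional as F

def sigmoid_focal_loss(logits, targets, alpha=0.25, gamma=2.0):
    """Element-wise focal loss for one-hot binary targets."""
    p = torch.sigmoid(logits)
    ce = F.binary_cross_entropy_with_logits(logits, targets, reduction="none")
    p_t = p * targets + (1.0 - p) * (1.0 - targets)
    loss = ce * (1.0 - p_t) ** gamma
    if alpha >= 0:
        alpha_t = alpha * targets + (1.0 - alpha) * (1.0 - targets)
        loss = alpha_t * loss
    return loss

# Toy batch: 8 anchors, 80 classes, 3 anchors assigned to class 5.
pred_cls = torch.randn(8, 80)
gt_cls   = torch.zeros(8, 80)
gt_cls[:3, 5] = 1.0
loss_cls = sigmoid_focal_loss(pred_cls, gt_cls).sum() / 3
print(loss_cls)
```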

+ 114 - 0
odlab/models/detectors/yolof/matcher.py

@@ -0,0 +1,114 @@
+# ---------------------------------------------------------------------
+# Copyright (c) Megvii Inc. All rights reserved.
+# ---------------------------------------------------------------------
+
+
+import numpy as np
+import torch
+from torch import nn
+from utils.box_ops import *
+
+
+class UniformMatcher(nn.Module):
+    """
+    This code is adapted from https://github.com/megvii-model/YOLOF/blob/main/playground/detection/coco/yolof/yolof_base/uniform_matcher.py
+    Uniform Matching between the anchors and gt boxes, which can achieve
+    balance in positive anchors.
+
+    Args:
+        match_times(int): Number of positive anchors for each gt box.
+    """
+
+    def __init__(self, match_times: int = 4):
+        super().__init__()
+        self.match_times = match_times
+
+    @torch.no_grad()
+    def forward(self, pred_boxes, anchor_boxes, targets):
+        """
+            pred_boxes:   (Tensor) -> [B, num_queries, 4]
+            anchor_boxes: (Tensor) -> [num_queries, 4]
+            targets:      (List[Dict]) -> [{'boxes': [...], 'labels': [...]}, ...]
+        """
+
+        bs, num_queries = pred_boxes.shape[:2]
+
+        # We flatten to compute the cost matrices in a batch
+        # [B, num_queries, 4] -> [M, 4]
+        out_bbox = pred_boxes.flatten(0, 1)
+        # [num_queries, 4] -> [1, num_queries, 4] -> [B, num_queries, 4] -> [M, 4]
+        anchor_boxes = anchor_boxes[None].repeat(bs, 1, 1)
+        anchor_boxes = anchor_boxes.flatten(0, 1)
+
+        # Also concat the target boxes
+        tgt_bbox = torch.cat([v['boxes'] for v in targets])
+
+        # Compute the L1 cost between boxes
+        # Note that we use anchors and predict boxes both
+        cost_bbox = torch.cdist(box_xyxy_to_cxcywh(out_bbox), 
+                                box_xyxy_to_cxcywh(tgt_bbox), 
+                                p=1)
+        cost_bbox_anchors = torch.cdist(anchor_boxes, 
+                                        box_xyxy_to_cxcywh(tgt_bbox), 
+                                        p=1)
+
+        # Final cost matrix: [B, M, N], M=num_queries, N=num_tgt
+        C = cost_bbox
+        C = C.view(bs, num_queries, -1).cpu()
+        C1 = cost_bbox_anchors
+        C1 = C1.view(bs, num_queries, -1).cpu()
+
+        sizes = [len(v['boxes']) for v in targets]  # the number of object instances in each image
+        all_indices_list = [[] for _ in range(bs)]
+        # positive indices when matching predicted boxes and gt boxes
+        # len(indices) = batch size
+        # len(tuple)   = match_times (top-k)
+        indices = [
+            tuple(
+                torch.topk(
+                    c[i],
+                    k=self.match_times,
+                    dim=0,
+                    largest=False)[1].numpy().tolist()
+            )
+            for i, c in enumerate(C.split(sizes, -1))
+        ]
+        # positive indices when matching anchor boxes and gt boxes
+        indices1 = [
+            tuple(
+                torch.topk(
+                    c[i],
+                    k=self.match_times,
+                    dim=0,
+                    largest=False)[1].numpy().tolist())
+            for i, c in enumerate(C1.split(sizes, -1))]
+
+        # concat the indices according to image ids
+        # img_id = batch_id
+        for img_id, (idx, idx1) in enumerate(zip(indices, indices1)):
+            img_idx_i = [
+                np.array(idx_ + idx1_)
+                for (idx_, idx1_) in zip(idx, idx1)
+            ] # 'i' is the index of queries
+            img_idx_j = [
+                np.array(list(range(len(idx_))) + list(range(len(idx1_))))
+                for (idx_, idx1_) in zip(idx, idx1)
+            ] # 'j' is the index of tgt
+            all_indices_list[img_id] = [*zip(img_idx_i, img_idx_j)]
+
+        # re-organize the positive indices
+        all_indices = []
+        for img_id in range(bs):
+            all_idx_i = []
+            all_idx_j = []
+            for idx_list in all_indices_list[img_id]:
+                idx_i, idx_j = idx_list
+                all_idx_i.append(idx_i)
+                all_idx_j.append(idx_j)
+            all_idx_i = np.hstack(all_idx_i)
+            all_idx_j = np.hstack(all_idx_j)
+            all_indices.append((all_idx_i, all_idx_j))
+
+
+        return [(torch.as_tensor(i, dtype=torch.int64), 
+                 torch.as_tensor(j, dtype=torch.int64)) for i, j in all_indices]
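
The matcher above boils down to "give every gt box a fixed number of cheapest anchors by L1 cost", computed twice: against the predicted boxes and against the anchor boxes. A condensed, single-image sketch of that idea with random boxes; the real class additionally handles batching and the anchor-box branch:

```python
import torch

match_times = 2                     # positive anchors per gt box
pred_boxes = torch.rand(10, 4)      # hypothetical predictions (cxcywh)
tgt_boxes  = torch.rand(3, 4)       # hypothetical gt boxes (cxcywh)

# L1 cost between every prediction and every gt box: [10, 3]
cost = torch.cdist(pred_boxes, tgt_boxes, p=1)

# For each gt (column), pick the `match_times` cheapest predictions.
topk_idxs = torch.topk(cost, k=match_times, dim=0, largest=False)[1]   # [2, 3]

# Flatten into (anchor index, gt index) pairs.
anchor_idx = topk_idxs.t().reshape(-1)
gt_idx     = torch.arange(tgt_boxes.size(0)).repeat_interleave(match_times)
print(list(zip(anchor_idx.tolist(), gt_idx.tolist())))
```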

+ 106 - 0
odlab/models/detectors/yolof/yolof.py

@@ -0,0 +1,106 @@
+import torch
+import torch.nn as nn
+
+# --------------- Model components ---------------
+from ...backbone import build_backbone
+from ...neck import build_neck
+from ...head import build_head
+
+# --------------- External components ---------------
+from utils.misc import multiclass_nms
+
+
+# ------------------------ You Only Look One-level Feature ------------------------
+class YOLOF(nn.Module):
+    def __init__(self, 
+                 cfg,
+                 num_classes :int   = 80, 
+                 conf_thresh :float = 0.05,
+                 nms_thresh  :float = 0.6,
+                 topk        :int   = 1000,
+                 ca_nms      :bool  = False):
+        super(YOLOF, self).__init__()
+        # ---------------------- Basic Parameters ----------------------
+        self.cfg = cfg
+        self.topk = topk
+        self.num_classes = num_classes
+        self.conf_thresh = conf_thresh
+        self.nms_thresh = nms_thresh
+        self.ca_nms = ca_nms
+
+        # ---------------------- Network Parameters ----------------------
+        ## Backbone
+        self.backbone, feat_dims = build_backbone(cfg)
+
+        ## Neck
+        self.neck = build_neck(cfg, feat_dims[-1], cfg['head_dim'])
+        
+        ## Heads
+        self.head = build_head(cfg, cfg['head_dim'], cfg['head_dim'], num_classes)
+
+    def post_process(self, cls_pred, box_pred):
+        """
+        Input:
+            cls_pred: (Tensor) [B, H x W x KA, C]
+            box_pred: (Tensor) [B, H x W x KA, 4]
+        """
+        cls_pred = cls_pred[0]
+        box_pred = box_pred[0]
+        
+        # (H x W x KA x C,)
+        scores_i = cls_pred.sigmoid().flatten()
+
+        # Keep top k top scoring indices only.
+        num_topk = min(self.topk, box_pred.size(0))
+
+        # torch.sort is actually faster than .topk (at least on GPUs)
+        predicted_prob, topk_idxs = scores_i.sort(descending=True)
+        topk_scores = predicted_prob[:num_topk]
+        topk_idxs = topk_idxs[:num_topk]
+
+        # filter out the proposals with low confidence score
+        keep_idxs = topk_scores > self.conf_thresh
+        topk_idxs = topk_idxs[keep_idxs]
+
+        # final scores
+        scores = topk_scores[keep_idxs]
+        # final labels
+        labels = topk_idxs % self.num_classes
+        # final bboxes
+        anchor_idxs = torch.div(topk_idxs, self.num_classes, rounding_mode='floor')
+        bboxes = box_pred[anchor_idxs]
+
+        # to cpu & numpy
+        scores = scores.cpu().numpy()
+        labels = labels.cpu().numpy()
+        bboxes = bboxes.cpu().numpy()
+
+        # nms
+        scores, labels, bboxes = multiclass_nms(
+            scores, labels, bboxes, self.nms_thresh, self.num_classes, self.ca_nms)
+
+        return bboxes, scores, labels
+
+    def forward(self, src, src_mask=None, targets=None):
+        # ---------------- Backbone ----------------
+        pyramid_feats = self.backbone(src)
+
+        # ---------------- Neck ----------------
+        feat = self.neck(pyramid_feats[-1])
+
+        # ---------------- Heads ----------------
+        outputs = self.head(feat, src_mask)
+
+        if not self.training:
+            # ---------------- PostProcess ----------------
+            cls_pred = outputs["pred_cls"]
+            box_pred = outputs["pred_box"]
+            bboxes, scores, labels = self.post_process(cls_pred, box_pred)
+            # normalize bbox
+            bboxes[..., 0::2] /= src.shape[-1]
+            bboxes[..., 1::2] /= src.shape[-2]
+            bboxes = bboxes.clip(0., 1.)
+
+            return bboxes, scores, labels
+
+        return outputs 

+ 42 - 0
odlab/models/head/__init__.py

@@ -0,0 +1,42 @@
+from .retinanet_head import RetinaNetHead
+from .yolof_head     import YOLOFHead
+from .fcos_head      import FCOSHead
+
+
+# build head
+def build_head(cfg, in_dim, out_dim, num_classes):
+    print('==============================')
+    print('Head: {}'.format(cfg['head']))
+    
+    if cfg['head'] == 'retinanet_head':
+        model = RetinaNetHead(cfg          = cfg,
+                              in_dim       = in_dim,
+                              out_dim      = out_dim,
+                              num_classes  = num_classes,
+                              num_cls_head = cfg['num_cls_head'],
+                              num_reg_head = cfg['num_reg_head'],
+                              act_type     = cfg['head_act'],
+                              norm_type    = cfg['head_norm']
+                              )
+    elif cfg['head'] == 'fcos_head':
+        model = FCOSHead(cfg          = cfg,
+                         in_dim       = in_dim,
+                         out_dim      = out_dim,
+                         num_classes  = num_classes,
+                         num_cls_head = cfg['num_cls_head'],
+                         num_reg_head = cfg['num_reg_head'],
+                         act_type     = cfg['head_act'],
+                         norm_type    = cfg['head_norm']
+                         )
+    elif cfg['head'] == 'yolof_head':
+        model = YOLOFHead(cfg          = cfg,
+                          in_dim       = in_dim,
+                          out_dim      = out_dim,
+                          num_classes  = num_classes,
+                          num_cls_head = cfg['num_cls_head'],
+                          num_reg_head = cfg['num_reg_head'],
+                          act_type     = cfg['head_act'],
+                          norm_type    = cfg['head_norm']
+                          )
+    else:
+        raise NotImplementedError("Unknown head: <{}>".format(cfg['head']))
+
+    return model

+ 185 - 0
odlab/models/head/fcos_head.py

@@ -0,0 +1,185 @@
+import torch
+import torch.nn as nn
+
+from ..basic.conv import ConvModule
+
+
+class Scale(nn.Module):
+    """
+    Multiply the output regression range by a learnable constant value
+    """
+    def __init__(self, init_value=1.0):
+        """
+        init_value : initial value for the scalar
+        """
+        super().__init__()
+        self.scale = nn.Parameter(
+            torch.tensor(init_value, dtype=torch.float32),
+            requires_grad=True
+        )
+
+    def forward(self, x):
+        """
+        input -> scale * input
+        """
+        return x * self.scale
+
+
+class FCOSHead(nn.Module):
+    def __init__(self, cfg, in_dim, out_dim, num_classes, num_cls_head=1, num_reg_head=1, act_type='relu', norm_type='BN'):
+        super().__init__()
+        self.fmp_size = None
+        # ------------------ Basic parameters -------------------
+        self.cfg = cfg
+        self.in_dim = in_dim
+        self.num_classes = num_classes
+        self.num_cls_head = num_cls_head
+        self.num_reg_head = num_reg_head
+        self.act_type = act_type
+        self.norm_type = norm_type
+        self.stride = cfg['out_stride']
+
+        # ------------------ Network parameters -------------------
+        ## cls head
+        cls_heads = []
+        self.cls_head_dim = out_dim
+        for i in range(self.num_cls_head):
+            if i == 0:
+                cls_heads.append(
+                    ConvModule(in_dim, self.cls_head_dim, k=3, p=1, s=1, 
+                               act_type=self.act_type,
+                               norm_type=self.norm_type)
+                               )
+            else:
+                cls_heads.append(
+                    ConvModule(self.cls_head_dim, self.cls_head_dim, k=3, p=1, s=1, 
+                               act_type=self.act_type,
+                               norm_type=self.norm_type)
+                               )
+        
+        ## reg head
+        reg_heads = []
+        self.reg_head_dim = out_dim
+        for i in range(self.num_reg_head):
+            if i == 0:
+                reg_heads.append(
+                    ConvModule(in_dim, self.reg_head_dim, k=3, p=1, s=1, 
+                               act_type=self.act_type,
+                               norm_type=self.norm_type)
+                               )
+            else:
+                reg_heads.append(
+                    ConvModule(self.reg_head_dim, self.reg_head_dim, k=3, p=1, s=1, 
+                               act_type=self.act_type,
+                               norm_type=self.norm_type)
+                               )
+        self.cls_heads = nn.Sequential(*cls_heads)
+        self.reg_heads = nn.Sequential(*reg_heads)
+
+        ## pred layers
+        self.cls_pred = nn.Conv2d(self.cls_head_dim, num_classes, kernel_size=3, padding=1)
+        self.reg_pred = nn.Conv2d(self.reg_head_dim, 4, kernel_size=3, padding=1)
+        self.ctn_pred = nn.Conv2d(self.reg_head_dim, 1, kernel_size=3, padding=1)
+        
+        ## scale layers
+        self.scales = nn.ModuleList(
+            Scale() for _ in range(len(self.stride))
+        )
+        
+        # init bias
+        self._init_layers()
+
+    def _init_layers(self):
+        for module in [self.cls_heads, self.reg_heads, self.cls_pred, self.reg_pred, self.ctn_pred]:
+            for layer in module.modules():
+                if isinstance(layer, nn.Conv2d):
+                    torch.nn.init.normal_(layer.weight, mean=0, std=0.01)
+                    torch.nn.init.constant_(layer.bias, 0)
+                if isinstance(layer, nn.GroupNorm):
+                    torch.nn.init.constant_(layer.weight, 1)
+                    torch.nn.init.constant_(layer.bias, 0)
+        # init the bias of cls pred
+        init_prob = 0.01
+        bias_value = -torch.log(torch.tensor((1. - init_prob) / init_prob))
+        torch.nn.init.constant_(self.cls_pred.bias, bias_value)
+        
+    def get_anchors(self, level, fmp_size):
+        """
+            fmp_size: (List) [H, W]
+        """
+        # generate grid cells
+        fmp_h, fmp_w = fmp_size
+        anchor_y, anchor_x = torch.meshgrid([torch.arange(fmp_h), torch.arange(fmp_w)])
+        # [H, W, 2] -> [HW, 2]
+        anchors = torch.stack([anchor_x, anchor_y], dim=-1).float().view(-1, 2) + 0.5
+        anchors *= self.stride[level]
+
+        return anchors
+        
+    def decode_boxes(self, pred_deltas, anchors):
+        """
+            pred_deltas: (List[Tensor]) [B, M, 4] or [M, 4] (l, t, r, b)
+            anchors:     (List[Tensor]) [1, M, 2] or [M, 2]
+        """
+        # x1 = x_anchor - l, x2 = x_anchor + r
+        # y1 = y_anchor - t, y2 = y_anchor + b
+        pred_x1y1 = anchors - pred_deltas[..., :2]
+        pred_x2y2 = anchors + pred_deltas[..., 2:]
+        pred_box = torch.cat([pred_x1y1, pred_x2y2], dim=-1)
+
+        return pred_box
+    
+    def forward(self, pyramid_feats, mask=None):
+        all_masks = []
+        all_anchors = []
+        all_cls_preds = []
+        all_reg_preds = []
+        all_box_preds = []
+        all_ctn_preds = []
+        for level, feat in enumerate(pyramid_feats):
+            # ------------------- Decoupled head -------------------
+            cls_feat = self.cls_heads(feat)
+            reg_feat = self.reg_heads(feat)
+
+            # ------------------- Generate anchor box -------------------
+            B, _, H, W = cls_feat.size()
+            fmp_size = [H, W]
+            anchors = self.get_anchors(level, fmp_size)   # [M, 4]
+            anchors = anchors.to(cls_feat.device)
+
+            # ------------------- Predict -------------------
+            cls_pred = self.cls_pred(cls_feat)
+            reg_pred = self.reg_pred(reg_feat)
+            ctn_pred = self.ctn_pred(reg_feat)
+
+            # ------------------- Process preds -------------------
+            ## [B, C, H, W] -> [B, H, W, C] -> [B, M, C]
+            cls_pred = cls_pred.permute(0, 2, 3, 1).contiguous().view(B, -1, self.num_classes)
+            ctn_pred = ctn_pred.permute(0, 2, 3, 1).contiguous().view(B, -1, 1)
+            reg_pred = reg_pred.permute(0, 2, 3, 1).contiguous().view(B, -1, 4)
+            reg_pred = nn.functional.relu(self.scales[level](reg_pred)) * self.stride[level]
+            ## Decode bbox
+            box_pred = self.decode_boxes(reg_pred, anchors)
+            ## Adjust mask
+            if mask is not None:
+                # [B, H, W]
+                mask_i = torch.nn.functional.interpolate(mask[None].float(), size=[H, W]).bool()[0]
+                # [B, H, W] -> [B, M]
+                mask_i = mask_i.flatten(1)     
+                all_masks.append(mask_i)
+                
+            all_anchors.append(anchors)
+            all_cls_preds.append(cls_pred)
+            all_reg_preds.append(reg_pred)
+            all_box_preds.append(box_pred)
+            all_ctn_preds.append(ctn_pred)
+
+        outputs = {"pred_cls": all_cls_preds,  # List [B, M, C]
+                   "pred_reg": all_reg_preds,  # List [B, M, 4]
+                   "pred_box": all_box_preds,  # List [B, M, 4]
+                   "pred_ctn": all_ctn_preds,  # List [B, M, 1]
+                   "anchors": all_anchors,     # List [B, M, 2]
+                   "strides": self.stride,
+                   "mask": all_masks}          # List [B, M,]
+
+        return outputs 
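
`decode_boxes` above converts the regressed (l, t, r, b) distances back into corner coordinates around each anchor point. A tiny numeric check with made-up values:

```python
import torch

anchors     = torch.tensor([[100.0, 100.0]])            # one anchor point (x, y)
pred_deltas = torch.tensor([[10.0, 20.0, 30.0, 40.0]])  # predicted (l, t, r, b)

pred_x1y1 = anchors - pred_deltas[..., :2]   # (100-10, 100-20)
pred_x2y2 = anchors + pred_deltas[..., 2:]   # (100+30, 100+40)
pred_box  = torch.cat([pred_x1y1, pred_x2y2], dim=-1)
print(pred_box)  # tensor([[ 90.,  80., 130., 140.]])
```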

+ 203 - 0
odlab/models/head/retinanet_head.py

@@ -0,0 +1,203 @@
+import math
+import torch
+import torch.nn as nn
+
+from ..basic.conv import ConvModule
+
+
+class RetinaNetHead(nn.Module):
+    def __init__(self, cfg, in_dim, out_dim, num_classes, num_cls_head=1, num_reg_head=1, act_type='relu', norm_type='BN'):
+        super().__init__()
+        self.fmp_size = None
+        self.DEFAULT_SCALE_CLAMP = math.log(1000.0 / 16)
+        # ------------------ Basic parameters -------------------
+        self.cfg = cfg
+        self.in_dim = in_dim
+        self.num_classes = num_classes
+        self.num_cls_head=num_cls_head
+        self.num_reg_head=num_reg_head
+        self.act_type=act_type
+        self.norm_type=norm_type
+        self.stride = cfg['out_stride']
+        # ------------------ Anchor parameters -------------------
+        self.anchor_size = self.get_anchor_sizes(cfg)  # [S, KA, 2]
+        self.num_anchors = self.anchor_size.shape[1]
+
+        # ------------------ Network parameters -------------------
+        ## cls head
+        cls_heads = []
+        self.cls_head_dim = out_dim
+        for i in range(self.num_cls_head):
+            if i == 0:
+                cls_heads.append(
+                    ConvModule(in_dim, self.cls_head_dim, k=3, p=1, s=1, 
+                               act_type=self.act_type,
+                               norm_type=self.norm_type)
+                               )
+            else:
+                cls_heads.append(
+                    ConvModule(self.cls_head_dim, self.cls_head_dim, k=3, p=1, s=1, 
+                               act_type=self.act_type,
+                               norm_type=self.norm_type)
+                               )
+        ## reg head
+        reg_heads = []
+        self.reg_head_dim = out_dim
+        for i in range(self.num_reg_head):
+            if i == 0:
+                reg_heads.append(
+                    ConvModule(in_dim, self.reg_head_dim, k=3, p=1, s=1, 
+                               act_type=self.act_type,
+                               norm_type=self.norm_type)
+                               )
+            else:
+                reg_heads.append(
+                    ConvModule(self.reg_head_dim, self.reg_head_dim, k=3, p=1, s=1, 
+                               act_type=self.act_type,
+                               norm_type=self.norm_type)
+                               )
+        self.cls_heads = nn.Sequential(*cls_heads)
+        self.reg_heads = nn.Sequential(*reg_heads)
+
+        ## pred layers
+        self.cls_pred = nn.Conv2d(self.cls_head_dim, num_classes * self.num_anchors, kernel_size=3, padding=1)
+        self.reg_pred = nn.Conv2d(self.reg_head_dim, 4 * self.num_anchors, kernel_size=3, padding=1)
+
+        # init bias
+        self._init_layers()
+
+    def _init_layers(self):
+        for module in [self.cls_heads, self.reg_heads, self.cls_pred, self.reg_pred]:
+            for layer in module.modules():
+                if isinstance(layer, nn.Conv2d):
+                    torch.nn.init.normal_(layer.weight, mean=0, std=0.01)
+                    torch.nn.init.constant_(layer.bias, 0)
+                if isinstance(layer, nn.GroupNorm):
+                    torch.nn.init.constant_(layer.weight, 1)
+                    torch.nn.init.constant_(layer.bias, 0)
+        # init the bias of cls pred
+        init_prob = 0.01
+        bias_value = -torch.log(torch.tensor((1. - init_prob) / init_prob))
+        torch.nn.init.constant_(self.cls_pred.bias, bias_value)
+        
+    def get_anchor_sizes(self, cfg):
+        basic_anchor_size =   cfg['anchor_config']['basic_size']
+        anchor_aspect_ratio = cfg['anchor_config']['aspect_ratio']
+        anchor_area_scale =   cfg['anchor_config']['area_scale']
+
+        num_scales = len(basic_anchor_size)
+        num_anchors = len(anchor_aspect_ratio) * len(anchor_area_scale)
+        anchor_sizes = []
+        for size in basic_anchor_size:
+            for ar in anchor_aspect_ratio:
+                for s in anchor_area_scale:
+                    ah, aw = size
+                    area = ah * aw * s
+                    anchor_sizes.append(
+                        [torch.sqrt(torch.tensor(ar * area)),
+                         torch.sqrt(torch.tensor(area / ar))]
+                         )
+        # [S * KA, 2] -> [S, KA, 2]
+        anchor_sizes = torch.as_tensor(anchor_sizes).view(num_scales, num_anchors, 2)
+
+        return anchor_sizes
+
+    def get_anchors(self, level, fmp_size):
+        """
+            fmp_size: (List) [H, W]
+        """
+        # generate grid cells
+        fmp_h, fmp_w = fmp_size
+        # [KA, 2]
+        anchor_size = self.anchor_size[level]
+
+        anchor_y, anchor_x = torch.meshgrid([torch.arange(fmp_h), torch.arange(fmp_w)])
+        # [H, W, 2] -> [HW, 2]
+        anchor_xy = torch.stack([anchor_x, anchor_y], dim=-1).float().view(-1, 2) + 0.5
+        # [HW, 2] -> [HW, 1, 2] -> [HW, KA, 2] 
+        anchor_xy = anchor_xy[:, None, :].repeat(1, self.num_anchors, 1)
+        anchor_xy *= self.stride[level]
+
+        # [KA, 2] -> [1, KA, 2] -> [HW, KA, 2]
+        anchor_wh = anchor_size[None, :, :].repeat(fmp_h*fmp_w, 1, 1)
+
+        # [HW, KA, 4] -> [M, 4], M = HW x KA
+        anchor_boxes = torch.cat([anchor_xy, anchor_wh], dim=-1)
+        anchor_boxes = anchor_boxes.view(-1, 4)
+
+        return anchor_boxes
+        
+    def decode_boxes(self, anchor_boxes, pred_reg):
+        """
+            anchor_boxes: (List[Tensor]) [1, M, 4] or [M, 4]
+            pred_reg:     (List[Tensor]) [B, M, 4] or [M, 4]
+        """
+        # x = x_anchor + dx * w_anchor
+        # y = y_anchor + dy * h_anchor
+        pred_ctr_offset = pred_reg[..., :2] * anchor_boxes[..., 2:]
+        pred_ctr_xy = anchor_boxes[..., :2] + pred_ctr_offset
+
+        # w = w_anchor * exp(tw)
+        # h = h_anchor * exp(th)
+        pred_dwdh = pred_reg[..., 2:]
+        pred_dwdh = torch.clamp(pred_dwdh, max=self.DEFAULT_SCALE_CLAMP)
+        pred_wh = anchor_boxes[..., 2:] * pred_dwdh.exp()
+
+        # convert [x, y, w, h] -> [x1, y1, x2, y2]
+        pred_x1y1 = pred_ctr_xy - 0.5 * pred_wh
+        pred_x2y2 = pred_ctr_xy + 0.5 * pred_wh
+        pred_box = torch.cat([pred_x1y1, pred_x2y2], dim=-1)
+
+        return pred_box
+
+    def forward(self, pyramid_feats, mask=None):
+        all_masks = []
+        all_anchors = []
+        all_cls_preds = []
+        all_reg_preds = []
+        all_box_preds = []
+        for level, feat in enumerate(pyramid_feats):
+            # ------------------- Decoupled head -------------------
+            cls_feat = self.cls_heads(feat)
+            reg_feat = self.reg_heads(feat)
+
+            # ------------------- Generate anchor box -------------------
+            B, _, H, W = cls_feat.size()
+            fmp_size = [H, W]
+            anchor_boxes = self.get_anchors(level, fmp_size)   # [M, 4]
+            anchor_boxes = anchor_boxes.to(cls_feat.device)
+
+            # ------------------- Predict -------------------
+            cls_pred = self.cls_pred(cls_feat)
+            reg_pred = self.reg_pred(reg_feat)
+
+            # ------------------- Process preds -------------------
+            ## [B, C, H, W] -> [B, H, W, C] -> [B, M, C]
+            cls_pred = cls_pred.permute(0, 2, 3, 1).contiguous().view(B, -1, self.num_classes)
+            reg_pred = reg_pred.permute(0, 2, 3, 1).contiguous().view(B, -1, 4)
+            ## Decode bbox
+            box_pred = self.decode_boxes(anchor_boxes, reg_pred)
+            ## Adjust mask
+            if mask is not None:
+                # [B, H, W]
+                mask_i = torch.nn.functional.interpolate(mask[None].float(), size=[H, W]).bool()[0]
+                # [B, H, W] -> [B, M]
+                mask_i = mask_i.flatten(1)     
+                # [B, HW] -> [B, HW, KA] -> [B, M], M= HW x KA
+                mask_i = mask_i[..., None].repeat(1, 1, self.num_anchors).flatten(1)
+                
+                all_masks.append(mask_i)
+                
+            all_anchors.append(anchor_boxes)
+            all_cls_preds.append(cls_pred)
+            all_reg_preds.append(reg_pred)
+            all_box_preds.append(box_pred)
+
+        outputs = {"pred_cls": all_cls_preds,  # List [B, M, C]
+                   "pred_reg": all_reg_preds,  # List [B, M, 4]
+                   "pred_box": all_box_preds,  # List [B, M, 4]
+                   "anchors": all_anchors,     # List [B, M, 2]
+                   "strides": self.stride,
+                   "mask": all_masks}          # List [B, M,]
+
+        return outputs 
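
RetinaNet's `decode_boxes` applies the classic Faster R-CNN box transform: center offsets are scaled by the anchor size and the log-space size deltas are exponentiated, with the same clamp as above. A small numeric sketch with a made-up anchor and regression output:

```python
import math
import torch

SCALE_CLAMP = math.log(1000.0 / 16)

anchor   = torch.tensor([[50.0, 60.0, 20.0, 40.0]])   # (cx, cy, w, h)
pred_reg = torch.tensor([[0.1, -0.2, 0.5, 0.0]])      # (dx, dy, tw, th)

ctr_xy = anchor[..., :2] + pred_reg[..., :2] * anchor[..., 2:]
wh     = anchor[..., 2:] * pred_reg[..., 2:].clamp(max=SCALE_CLAMP).exp()

pred_box = torch.cat([ctr_xy - 0.5 * wh, ctr_xy + 0.5 * wh], dim=-1)
print(pred_box)  # (x1, y1, x2, y2) around the shifted, rescaled anchor
```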

+ 185 - 0
odlab/models/head/yolof_head.py

@@ -0,0 +1,185 @@
+import math
+import torch
+import torch.nn as nn
+
+from ..basic.conv import ConvModule
+
+
+class YOLOFHead(nn.Module):
+    def __init__(self, cfg, in_dim, out_dim, num_classes, num_cls_head=1, num_reg_head=1, act_type='relu', norm_type='BN'):
+        super().__init__()
+        self.fmp_size = None
+        self.ctr_clamp = cfg['center_clamp']
+        self.DEFAULT_EXP_CLAMP = math.log(1e8)
+        self.DEFAULT_SCALE_CLAMP = math.log(1000.0 / 16)
+        # ------------------ Basic parameters -------------------
+        self.cfg = cfg
+        self.in_dim = in_dim
+        self.num_classes = num_classes
+        self.num_cls_head=num_cls_head
+        self.num_reg_head=num_reg_head
+        self.act_type=act_type
+        self.norm_type=norm_type
+        self.stride = cfg['out_stride']
+        # Anchor config
+        self.anchor_size = torch.as_tensor(cfg['anchor_size'])
+        self.num_anchors = len(cfg['anchor_size'])
+
+        # ------------------ Network parameters -------------------
+        ## cls head
+        cls_heads = []
+        self.cls_head_dim = out_dim
+        for i in range(self.num_cls_head):
+            if i == 0:
+                cls_heads.append(
+                    ConvModule(in_dim, self.cls_head_dim, k=3, p=1, s=1, 
+                               act_type=self.act_type,
+                               norm_type=self.norm_type)
+                               )
+            else:
+                cls_heads.append(
+                    ConvModule(self.cls_head_dim, self.cls_head_dim, k=3, p=1, s=1, 
+                               act_type=self.act_type,
+                               norm_type=self.norm_type)
+                               )
+        ## reg head
+        reg_heads = []
+        self.reg_head_dim = out_dim
+        for i in range(self.num_reg_head):
+            if i == 0:
+                reg_heads.append(
+                    ConvModule(in_dim, self.reg_head_dim, k=3, p=1, s=1, 
+                               act_type=self.act_type,
+                               norm_type=self.norm_type)
+                               )
+            else:
+                reg_heads.append(
+                    ConvModule(self.reg_head_dim, self.reg_head_dim, k=3, p=1, s=1, 
+                               act_type=self.act_type,
+                               norm_type=self.norm_type)
+                               )
+        self.cls_heads = nn.Sequential(*cls_heads)
+        self.reg_heads = nn.Sequential(*reg_heads)
+
+        # pred layer
+        self.obj_pred = nn.Conv2d(self.reg_head_dim, 1 * self.num_anchors, kernel_size=3, padding=1)
+        self.cls_pred = nn.Conv2d(self.cls_head_dim, num_classes * self.num_anchors, kernel_size=3, padding=1)
+        self.reg_pred = nn.Conv2d(self.reg_head_dim, 4 * self.num_anchors, kernel_size=3, padding=1)
+
+        # init bias
+        self._init_pred_layers()
+
+    def _init_pred_layers(self):  
+        # init cls pred
+        nn.init.normal_(self.cls_pred.weight, mean=0, std=0.01)
+        init_prob = 0.01
+        bias_value = -torch.log(torch.tensor((1. - init_prob) / init_prob))
+        nn.init.constant_(self.cls_pred.bias, bias_value)
+        # init reg pred
+        nn.init.normal_(self.reg_pred.weight, mean=0, std=0.01)
+        nn.init.constant_(self.reg_pred.bias, 0.0)
+        # init obj pred
+        nn.init.normal_(self.obj_pred.weight, mean=0, std=0.01)
+        nn.init.constant_(self.obj_pred.bias, 0.0)
+
+    def get_anchors(self, fmp_size):
+        """fmp_size: list -> [H, W] \n
+           stride: int -> output stride
+        """
+        # check anchor boxes
+        if self.fmp_size is not None and self.fmp_size == fmp_size:
+            return self.anchor_boxes
+        else:
+            # generate grid cells
+            fmp_h, fmp_w = fmp_size
+            anchor_y, anchor_x = torch.meshgrid([torch.arange(fmp_h), torch.arange(fmp_w)])
+            # [H, W, 2] -> [HW, 2]
+            anchor_xy = torch.stack([anchor_x, anchor_y], dim=-1).float().view(-1, 2) + 0.5
+            # [HW, 2] -> [HW, 1, 2] -> [HW, KA, 2] 
+            anchor_xy = anchor_xy[:, None, :].repeat(1, self.num_anchors, 1)
+            anchor_xy *= self.stride
+
+            # [KA, 2] -> [1, KA, 2] -> [HW, KA, 2]
+            anchor_wh = self.anchor_size[None, :, :].repeat(fmp_h*fmp_w, 1, 1)
+
+            # [HW, KA, 4] -> [M, 4]
+            anchor_boxes = torch.cat([anchor_xy, anchor_wh], dim=-1)
+            anchor_boxes = anchor_boxes.view(-1, 4)
+
+            self.anchor_boxes = anchor_boxes
+            self.fmp_size = fmp_size
+
+            return anchor_boxes
+        
+    def decode_boxes(self, anchor_boxes, pred_reg):
+        """
+            anchor_boxes: (List[tensor]) [1, M, 4]
+            pred_reg: (List[tensor]) [B, M, 4]
+        """
+        # x = x_anchor + dx * w_anchor
+        # y = y_anchor + dy * h_anchor
+        pred_ctr_offset = pred_reg[..., :2] * anchor_boxes[..., 2:]
+        pred_ctr_offset = torch.clamp(pred_ctr_offset, min=-self.ctr_clamp, max=self.ctr_clamp)
+        pred_ctr_xy = anchor_boxes[..., :2] + pred_ctr_offset
+
+        # w = w_anchor * exp(tw)
+        # h = h_anchor * exp(th)
+        pred_dwdh = pred_reg[..., 2:]
+        pred_dwdh = torch.clamp(pred_dwdh, max=self.DEFAULT_SCALE_CLAMP)
+        pred_wh = anchor_boxes[..., 2:] * pred_dwdh.exp()
+
+        # convert [x, y, w, h] -> [x1, y1, x2, y2]
+        pred_x1y1 = pred_ctr_xy - 0.5 * pred_wh
+        pred_x2y2 = pred_ctr_xy + 0.5 * pred_wh
+        pred_box = torch.cat([pred_x1y1, pred_x2y2], dim=-1)
+
+        return pred_box
+
+    def forward(self, x, mask=None):
+        # ------------------- Decoupled head -------------------
+        cls_feats = self.cls_heads(x)
+        reg_feats = self.reg_heads(x)
+
+        # ------------------- Generate anchor box -------------------
+        fmp_size = cls_feats.shape[2:]
+        anchor_boxes = self.get_anchors(fmp_size)   # [M, 4]
+        anchor_boxes = anchor_boxes.to(cls_feats.device)
+
+        # ------------------- Predict -------------------
+        obj_pred = self.obj_pred(reg_feats)
+        cls_pred = self.cls_pred(cls_feats)
+        reg_pred = self.reg_pred(reg_feats)
+
+        # ------------------- Process preds -------------------
+        ## implicit objectness
+        B, _, H, W = obj_pred.size()
+        obj_pred = obj_pred.view(B, -1, 1, H, W)
+        cls_pred = cls_pred.view(B, -1, self.num_classes, H, W)
+        normalized_cls_pred = cls_pred + obj_pred - torch.log(
+                1. + 
+                torch.clamp(cls_pred, max=self.DEFAULT_EXP_CLAMP).exp() + 
+                torch.clamp(obj_pred, max=self.DEFAULT_EXP_CLAMP).exp())
+        # [B, KA, C, H, W] -> [B, H, W, KA, C] -> [B, M, C], M = HxWxKA
+        normalized_cls_pred = normalized_cls_pred.permute(0, 3, 4, 1, 2).contiguous()
+        normalized_cls_pred = normalized_cls_pred.view(B, -1, self.num_classes)
+        # [B, KA*4, H, W] -> [B, KA, 4, H, W] -> [B, H, W, KA, 4] -> [B, M, 4]
+        reg_pred = reg_pred.view(B, -1, 4, H, W).permute(0, 3, 4, 1, 2).contiguous()
+        reg_pred = reg_pred.view(B, -1, 4)
+        ## Decode bbox
+        box_pred = self.decode_boxes(anchor_boxes[None], reg_pred)  # [B, M, 4]
+        ## adjust mask
+        if mask is not None:
+            # [B, H, W]
+            mask = torch.nn.functional.interpolate(mask[None].float(), size=fmp_size).bool()[0]
+            # [B, H, W] -> [B, HW]
+            mask = mask.flatten(1)
+            # [B, HW] -> [B, HW, KA] -> [BM,], M= HW x KA
+            mask = mask[..., None].repeat(1, 1, self.num_anchors).flatten()
+
+        outputs = {"pred_cls": normalized_cls_pred,
+                   "pred_reg": reg_pred,
+                   "pred_box": box_pred,
+                   "anchors": anchor_boxes,
+                   "mask": mask}
+
+        return outputs 
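
The "implicit objectness" fusion above is a numerically stable way of multiplying two sigmoids: sigmoid(cls + obj - log(1 + exp(cls) + exp(obj))) equals sigmoid(cls) * sigmoid(obj). A quick check with random logits (the exp-clamping is omitted for brevity):

```python
import torch

cls_logit = torch.randn(4, 80)    # per-class logits
obj_logit = torch.randn(4, 1)     # shared objectness logit

normalized = cls_logit + obj_logit - torch.log(1. + cls_logit.exp() + obj_logit.exp())

lhs = normalized.sigmoid()
rhs = cls_logit.sigmoid() * obj_logit.sigmoid()
print(torch.allclose(lhs, rhs, atol=1e-6))  # True
```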

+ 57 - 0
odlab/models/neck/__init__.py

@@ -0,0 +1,57 @@
+from .dilated_encoder import DilatedEncoder
+from .hybrid_encoder import HybridEncoder
+from .fpn import BasicFPN
+from .spp import SPPF
+
+
+# build neck
+def build_neck(cfg, in_dim, out_dim):
+    print('==============================')
+    print('Neck: {}'.format(cfg['neck']))
+
+    # ----------------------- Neck module -----------------------
+    if cfg['neck'] == 'dilated_encoder':
+        model = DilatedEncoder(in_dim       = in_dim,
+                               out_dim      = out_dim,
+                               expand_ratio = cfg['neck_expand_ratio'],
+                               dilations    = cfg['neck_dilations'],
+                               act_type     = cfg['neck_act'],
+                               norm_type    = cfg['neck_norm']
+                               )
+    elif cfg['neck'] == 'spp_block':
+        model = SPPF(in_dim       = in_dim,
+                     out_dim      = out_dim,
+                     expand_ratio = cfg['neck_expand_ratio'],
+                     pooling_size = cfg["spp_pooling_size"],
+                     act_type     = cfg['neck_act'],
+                     norm_type    = cfg['neck_norm']
+                     )
+        
+    # ----------------------- FPN Neck -----------------------
+    elif cfg['neck'] == 'basic_fpn':
+        model = BasicFPN(in_dims = in_dim,
+                         out_dim = out_dim,
+                         p6_feat = cfg['fpn_p6_feat'],
+                         p7_feat = cfg['fpn_p7_feat'],
+                         from_c5 = cfg['fpn_p6_from_c5'], 
+                         )
+    elif cfg['neck'] == 'hybrid_encoder':
+        return HybridEncoder(in_dims     = in_dim,
+                             out_dim     = out_dim,
+                             num_blocks  = cfg['fpn_num_blocks'],
+                             expansion   = cfg['fpn_expansion'],
+                             act_type    = cfg['fpn_act'],
+                             norm_type   = cfg['fpn_norm'],
+                             depthwise   = cfg['fpn_depthwise'],
+                             num_heads   = cfg['en_num_heads'],
+                             num_layers  = cfg['en_num_layers'],
+                             ffn_dim     = cfg['en_ffn_dim'],
+                             dropout     = cfg['en_dropout'],
+                             pe_temperature = cfg['pe_temperature'],
+                             en_act_type    = cfg['en_act'],
+                             en_pre_norm    = cfg['en_pre_norm'],
+                             )
+    else:
+        raise NotImplementedError("Unknown neck: <{}>".format(cfg['neck']))
+        
+    return model
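
`build_neck` dispatches purely on string keys in the config dict. A hypothetical minimal config for the `dilated_encoder` branch, using only key names that appear in the builder above; the concrete values are illustrative, not the repository's defaults:

```python
# Hypothetical YOLOF-style neck config; values are for illustration only.
neck_cfg = {
    'neck': 'dilated_encoder',
    'neck_expand_ratio': 0.25,
    'neck_dilations': [2, 4, 6, 8],
    'neck_act': 'relu',
    'neck_norm': 'BN',
}
# model = build_neck(neck_cfg, in_dim=2048, out_dim=512)
```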

+ 72 - 0
odlab/models/neck/dilated_encoder.py

@@ -0,0 +1,72 @@
+import torch.nn as nn
+from utils import weight_init
+
+from ..basic.conv import ConvModule
+
+
+# BottleNeck
+class Bottleneck(nn.Module):
+    def __init__(self, in_dim, dilation, expand_ratio, act_type='relu', norm_type='BN'):
+        super(Bottleneck, self).__init__()
+        # ------------------ Basic parameters -------------------
+        self.in_dim = in_dim
+        self.dilation = dilation
+        self.expand_ratio = expand_ratio
+        inter_dim = round(in_dim * expand_ratio)
+        # ------------------ Network parameters -------------------
+        self.branch = nn.Sequential(
+            ConvModule(in_dim, inter_dim, k=1, act_type=act_type, norm_type=norm_type),
+            ConvModule(inter_dim, inter_dim, k=3, p=dilation, d=dilation, act_type=act_type, norm_type=norm_type),
+            ConvModule(inter_dim, in_dim, k=1, act_type=act_type, norm_type=norm_type)
+        )
+
+    def forward(self, x):
+        return x + self.branch(x)
+
+# Dilated Encoder
+class DilatedEncoder(nn.Module):
+    def __init__(self, in_dim, out_dim, expand_ratio, dilations=[2, 4, 6, 8], act_type='relu', norm_type='BN'):
+        super(DilatedEncoder, self).__init__()
+        # ------------------ Basic parameters -------------------
+        self.in_dim = in_dim
+        self.out_dim = out_dim
+        self.expand_ratio = expand_ratio
+        self.dilations = dilations
+        # ------------------ Network parameters -------------------
+        ## proj layer
+        self.projector = nn.Sequential(
+            ConvModule(in_dim, out_dim, k=1, act_type=None, norm_type=norm_type),
+            ConvModule(out_dim, out_dim, k=3, p=1, act_type=None, norm_type=norm_type)
+        )
+        ## encoder layers
+        self.encoders = nn.Sequential(
+            *[Bottleneck(out_dim, d, expand_ratio, act_type, norm_type) for d in dilations])
+
+        self._init_weight()
+
+
+    def _init_weight(self):
+        for m in self.projector:
+            if isinstance(m, nn.Conv2d):
+                weight_init.c2_xavier_fill(m)
+            if isinstance(m, (nn.GroupNorm, nn.BatchNorm2d, nn.SyncBatchNorm)):
+                nn.init.constant_(m.weight, 1)
+                nn.init.constant_(m.bias, 0)
+
+        for m in self.encoders.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.normal_(m.weight, mean=0, std=0.01)
+                if hasattr(m, 'bias') and m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+
+            if isinstance(m, (nn.GroupNorm, nn.BatchNorm2d, nn.SyncBatchNorm)):
+                nn.init.constant_(m.weight, 1)
+                nn.init.constant_(m.bias, 0)
+
+
+    def forward(self, x):
+        x = self.projector(x)
+        x = self.encoders(x)
+
+        return x
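A minimal shape check for the DilatedEncoder above (illustrative sketch; it assumes the odlab package is importable):

    import torch
    from models.neck.dilated_encoder import DilatedEncoder

    encoder = DilatedEncoder(in_dim=2048, out_dim=512, expand_ratio=0.25)
    c5 = torch.randn(2, 2048, 16, 16)    # e.g. a C5 map (stride 32) of a 512x512 input
    print(encoder(c5).shape)             # torch.Size([2, 512, 16, 16]): channels projected, resolution kept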

+ 80 - 0
odlab/models/neck/fpn.py

@@ -0,0 +1,80 @@
+import torch.nn as nn
+import torch.nn.functional as F
+
+from utils import weight_init
+
+
+# ------------------ Basic Feature Pyramid Network ------------------
+class BasicFPN(nn.Module):
+    def __init__(self, 
+                 in_dims=[512, 1024, 2048],
+                 out_dim=256,
+                 p6_feat=False,
+                 p7_feat=False,
+                 from_c5=False,
+                 ):
+        super().__init__()
+        # ------------------ Basic parameters -------------------
+        self.p6_feat = p6_feat
+        self.p7_feat = p7_feat
+        self.from_c5 = from_c5
+
+        # ------------------ Network parameters -------------------
+        ## latter layers
+        self.input_projs = nn.ModuleList()
+        self.smooth_layers = nn.ModuleList()
+        for in_dim in in_dims[::-1]:
+            self.input_projs.append(nn.Conv2d(in_dim, out_dim, kernel_size=1))
+            self.smooth_layers.append(nn.Conv2d(out_dim, out_dim, kernel_size=3, padding=1))
+
+        ## P6/P7 layers
+        if p6_feat:
+            if from_c5:
+                self.p6_conv = nn.Conv2d(in_dims[-1], out_dim, kernel_size=3, stride=2, padding=1)
+            else: # from p5
+                self.p6_conv = nn.Conv2d(out_dim, out_dim, kernel_size=3, stride=2, padding=1)
+        if p7_feat:
+            self.p7_conv = nn.Sequential(
+                nn.ReLU(inplace=True),
+                nn.Conv2d(out_dim, out_dim, kernel_size=3, stride=2, padding=1)
+            )
+
+        self._init_weight()
+
+    def _init_weight(self):
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                weight_init.c2_xavier_fill(m)
+
+    def forward(self, feats):
+        """
+            feats: (List of Tensor) [C3, C4, C5], C_i ∈ R^(B x C_i x H_i x W_i)
+        """
+        outputs = []
+        # [C3, C4, C5] -> [C5, C4, C3]
+        feats = feats[::-1]
+        top_level_feat = feats[0]
+        prev_feat = self.input_projs[0](top_level_feat)
+        outputs.append(self.smooth_layers[0](prev_feat))
+
+        for feat, input_proj, smooth_layer in zip(feats[1:], self.input_projs[1:], self.smooth_layers[1:]):
+            feat = input_proj(feat)
+            top_down_feat = F.interpolate(prev_feat, size=feat.shape[2:], mode='nearest')
+            prev_feat = feat + top_down_feat
+            outputs.insert(0, smooth_layer(prev_feat))
+
+        if self.p6_feat:
+            if self.from_c5:
+                p6_feat = self.p6_conv(feats[0])
+            else:
+                p6_feat = self.p6_conv(outputs[-1])
+            # [P3, P4, P5] -> [P3, P4, P5, P6]
+            outputs.append(p6_feat)
+
+            if self.p7_feat:
+                p7_feat = self.p7_conv(p6_feat)
+                # [P3, P4, P5, P6] -> [P3, P4, P5, P6, P7]
+                outputs.append(p7_feat)
+
+        # [P3, P4, P5] or [P3, P4, P5, P6, P7]
+        return outputs
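A quick shape check for BasicFPN with P6/P7 enabled (illustrative sketch only):

    import torch
    from models.neck.fpn import BasicFPN

    fpn = BasicFPN(in_dims=[512, 1024, 2048], out_dim=256, p6_feat=True, p7_feat=True)
    c3 = torch.randn(1, 512, 64, 64)     # stride 8
    c4 = torch.randn(1, 1024, 32, 32)    # stride 16
    c5 = torch.randn(1, 2048, 16, 16)    # stride 32
    for p in fpn([c3, c4, c5]):
        print(p.shape)                   # 256 channels at 64/32/16, plus P6 (8x8) and P7 (4x4)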

+ 142 - 0
odlab/models/neck/hybrid_encoder.py

@@ -0,0 +1,142 @@
+from typing import List
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ..basic.conv import BasicConv, RepCSPLayer
+from ..basic.transformer import TransformerEncoder
+
+
+# -------------- Feature Pyramid Network + Transformer Encoder --------------
+class HybridEncoder(nn.Module):
+    def __init__(self, 
+                 in_dims        :List  = [256, 512, 1024],
+                 out_dim        :int   = 256,
+                 num_blocks     :int   = 3,
+                 expansion      :float = 1.0,
+                 act_type       :str   = 'silu',
+                 norm_type      :str   = 'GN',
+                 depthwise      :bool  = False,
+                 # Transformer's parameters
+                 num_heads      :int   = 8,
+                 num_layers     :int   = 1,
+                 ffn_dim        :int   = 1024,
+                 dropout        :float = 0.1,
+                 pe_temperature :float = 10000.,
+                 en_act_type    :str   = 'gelu',
+                 en_pre_norm    :bool  = False,
+                 ) -> None:
+        super(HybridEncoder, self).__init__()
+        # ---------------- Basic parameters ----------------
+        self.in_dims = in_dims
+        self.out_dim = out_dim
+        self.out_dims = [self.out_dim] * len(in_dims)
+        self.num_heads = num_heads
+        self.num_layers = num_layers
+        self.ffn_dim = ffn_dim
+        c3, c4, c5 = in_dims
+
+        # ---------------- Input projs ----------------
+        self.input_proj_1 = BasicConv(c5, self.out_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
+        self.input_proj_2 = BasicConv(c4, self.out_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
+        self.input_proj_3 = BasicConv(c3, self.out_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
+
+        # ---------------- Transformer Encoder ----------------
+        self.transformer_encoder = TransformerEncoder(d_model        = self.out_dim,
+                                                      num_heads      = num_heads,
+                                                      num_layers     = num_layers,
+                                                      ffn_dim        = ffn_dim,
+                                                      pe_temperature = pe_temperature,
+                                                      dropout        = dropout,
+                                                      act_type       = en_act_type,
+                                                      pre_norm       = en_pre_norm,
+                                                      )
+
+        # ---------------- Top down FPN ----------------
+        ## P5 -> P4
+        self.reduce_layer_1 = BasicConv(self.out_dim, self.out_dim,
+                                        kernel_size=1, padding=0, stride=1,
+                                        act_type=act_type, norm_type=norm_type)
+        self.top_down_layer_1 = RepCSPLayer(in_dim      = self.out_dim * 2,
+                                            out_dim     = self.out_dim,
+                                            num_blocks  = num_blocks,
+                                            expansion   = expansion,
+                                            act_type    = act_type,
+                                            norm_type   = norm_type,
+                                            )
+        ## P4 -> P3
+        self.reduce_layer_2 = BasicConv(self.out_dim, self.out_dim,
+                                        kernel_size=1, padding=0, stride=1,
+                                        act_type=act_type, norm_type=norm_type)
+        self.top_down_layer_2 = RepCSPLayer(in_dim      = self.out_dim * 2,
+                                            out_dim     = self.out_dim,
+                                            num_blocks  = num_blocks,
+                                            expansion   = expansion,
+                                            act_type    = act_type,
+                                            norm_type   = norm_type,
+                                            )
+        
+        # ---------------- Bottom up PAN----------------
+        ## P3 -> P4
+        self.downsample_layer_1 = BasicConv(self.out_dim, self.out_dim,
+                                            kernel_size=3, padding=1, stride=2,
+                                            act_type=act_type, norm_type=norm_type, depthwise=depthwise)
+        self.bottom_up_layer_1 = RepCSPLayer(in_dim      = self.out_dim * 2,
+                                             out_dim     = self.out_dim,
+                                             num_blocks  = num_blocks,
+                                             expansion   = expansion,
+                                             act_type    = act_type,
+                                             norm_type   = norm_type,
+                                             )
+        ## P4 -> P5
+        self.downsample_layer_2 = BasicConv(self.out_dim, self.out_dim,
+                                            kernel_size=3, padding=1, stride=2,
+                                            act_type=act_type, norm_type=norm_type, depthwise=depthwise)
+        self.bottom_up_layer_2 = RepCSPLayer(in_dim      = self.out_dim * 2,
+                                             out_dim     = self.out_dim,
+                                             num_blocks  = num_blocks,
+                                             expansion   = expansion,
+                                             act_type    = act_type,
+                                             norm_type   = norm_type,
+                                             )
+
+        self.init_weights()
+  
+    def init_weights(self):
+        """Initialize the parameters."""
+        for m in self.modules():
+            if isinstance(m, torch.nn.Conv2d):
+                # In order to be consistent with the source code,
+                # reset the Conv2d initialization parameters
+                m.reset_parameters()
+
+    def forward(self, features):
+        c3, c4, c5 = features
+
+        # -------- Input projs --------
+        p5 = self.input_proj_1(c5)
+        p4 = self.input_proj_2(c4)
+        p3 = self.input_proj_3(c3)
+
+        # -------- Transformer encoder --------
+        p5 = self.transformer_encoder(p5)
+
+        # -------- Top down FPN --------
+        p5_in = self.reduce_layer_1(p5)
+        p5_up = F.interpolate(p5_in, size=p4.shape[2:])
+        p4 = self.top_down_layer_1(torch.cat([p4, p5_up], dim=1))
+
+        p4_in = self.reduce_layer_2(p4)
+        p4_up = F.interpolate(p4_in, size=p3.shape[2:])
+        p3 = self.top_down_layer_2(torch.cat([p3, p4_up], dim=1))
+
+        # -------- Bottom up PAN --------
+        p3_ds = self.downsample_layer_1(p3)
+        p4 = self.bottom_up_layer_1(torch.cat([p4_in, p3_ds], dim=1))
+
+        p4_ds = self.downsample_layer_2(p4)
+        p5 = self.bottom_up_layer_2(torch.cat([p5_in, p4_ds], dim=1))
+
+        out_feats = [p3, p4, p5]
+        
+        return out_feats
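A minimal shape check for the HybridEncoder above (illustrative sketch only):

    import torch
    from models.neck.hybrid_encoder import HybridEncoder

    encoder = HybridEncoder(in_dims=[256, 512, 1024], out_dim=256)
    feats = [torch.randn(1, 256, 80, 80),    # C3 (stride 8)
             torch.randn(1, 512, 40, 40),    # C4 (stride 16)
             torch.randn(1, 1024, 20, 20)]   # C5 (stride 32)
    p3, p4, p5 = encoder(feats)              # each output has 256 channels at the input resolutions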

+ 25 - 0
odlab/models/neck/spp.py

@@ -0,0 +1,25 @@
+import torch
+import torch.nn as nn
+
+from ..basic.conv import ConvModule
+
+
+# Spatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher
+class SPPF(nn.Module):
+    """
+        This code is adapted from https://github.com/ultralytics/yolov5
+    """
+    def __init__(self, in_dim, out_dim, expand_ratio=0.5, pooling_size=5, act_type="relu", norm_type="BN"):
+        super().__init__()
+        inter_dim = int(in_dim * expand_ratio)
+        self.out_dim = out_dim
+        self.cv1 = ConvModule(in_dim, inter_dim, k=1, act_type=act_type, norm_type=norm_type)
+        self.cv2 = ConvModule(inter_dim * 4, out_dim, k=1, act_type=act_type, norm_type=norm_type)
+        self.m = nn.MaxPool2d(kernel_size=pooling_size, stride=1, padding=pooling_size // 2)
+
+    def forward(self, x):
+        x = self.cv1(x)
+        y1 = self.m(x)
+        y2 = self.m(y1)
+
+        return self.cv2(torch.cat((x, y1, y2, self.m(y2)), 1))
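A minimal usage sketch of the SPPF block above (illustrative only):

    import torch
    from models.neck.spp import SPPF

    sppf = SPPF(in_dim=512, out_dim=512, expand_ratio=0.5, pooling_size=5)
    x = torch.randn(1, 512, 16, 16)
    print(sppf(x).shape)   # torch.Size([1, 512, 16, 16]); stride-1 pooling preserves the resolution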

+ 179 - 0
odlab/test.py

@@ -0,0 +1,179 @@
+import argparse
+import cv2
+import os
+import time
+import numpy as np
+from copy import deepcopy
+import torch
+
+# load transform
+from datasets import build_dataset, build_transform
+
+# load some utils
+from utils.misc import load_weight, compute_flops
+
+from config import build_config
+from models.detectors import build_model
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Object Detection Lab')
+    # Basic
+    parser.add_argument('--cuda', action='store_true', default=False, 
+                        help='use cuda.')
+    parser.add_argument('--show', action='store_true', default=False,
+                        help='show the visualization results.')
+    parser.add_argument('--save', action='store_true', default=False,
+                        help='save the visualization results.')
+    parser.add_argument('--save_folder', default='det_results/', type=str,
+                        help='Dir to save results')
+    parser.add_argument('-vt', '--visual_threshold', default=0.3, type=float,
+                        help='Final confidence threshold')
+    parser.add_argument('-ws', '--window_scale', default=1.0, type=float,
+                        help='resize window of cv2 for visualization.')
+    parser.add_argument('--resave', action='store_true', default=False, 
+                        help='resave checkpoints without optimizer state dict.')
+    # Model
+    parser.add_argument('-m', '--model', default='yolof_r18_c5_1x', type=str,
+                        help='build detector')
+    parser.add_argument('--weight', default=None,
+                        type=str, help='Trained state_dict file path to open')
+    parser.add_argument('--fuse_conv_bn', action='store_true', default=False,
+                        help='fuse Conv & BN')
+    # Dataset
+    parser.add_argument('--root', default='/Users/liuhaoran/Desktop/python_work/object-detection/dataset/COCO/',
+                        help='data root')
+    parser.add_argument('-d', '--dataset', default='coco',
+                        help='coco, voc.')
+
+    return parser.parse_args()
+
+def plot_bbox_labels(img, bbox, label=None, cls_color=None, text_scale=0.4):
+    x1, y1, x2, y2 = bbox
+    x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
+    # plot bbox
+    cv2.rectangle(img, (x1, y1), (x2, y2), cls_color, 2)
+
+    if label is not None:
+        # measure the label text only when there is one (cv2.getTextSize cannot take None)
+        t_size = cv2.getTextSize(label, 0, fontScale=1, thickness=2)[0]
+        # plot title bbox
+        cv2.rectangle(img, (x1, y1-t_size[1]), (int(x1 + t_size[0] * text_scale), y1), cls_color, -1)
+        # put the text on the title bbox
+        cv2.putText(img, label, (int(x1), int(y1 - 5)), 0, text_scale, (0, 0, 0), 1, lineType=cv2.LINE_AA)
+
+    return img
+
+def visualize(img, 
+              bboxes, 
+              scores, 
+              labels, 
+              vis_thresh, 
+              class_colors, 
+              class_names):
+    ts = 0.4
+    for i, bbox in enumerate(bboxes):
+        if scores[i] > vis_thresh:
+            cls_id = int(labels[i])
+            cls_color = class_colors[cls_id]
+                
+            mess = '%s: %.2f' % (class_names[cls_id], scores[i])
+            img = plot_bbox_labels(img, bbox, mess, cls_color, text_scale=ts)
+
+    return img
+        
+@torch.no_grad()
+def run(args, model, device, dataset, transform, class_colors, class_names):
+    num_images = len(dataset)
+    save_path = os.path.join(args.save_folder, args.dataset, args.model)
+    os.makedirs(save_path, exist_ok=True)
+
+    for index, (image, _) in enumerate(dataset):
+        print('Testing image {:d}/{:d}....'.format(index+1, num_images))
+        orig_h, orig_w = image.height, image.width
+
+        # PreProcess
+        x, _ = transform(image)
+        x = x.unsqueeze(0).to(device)
+
+        # Inference
+        t0 = time.time()
+        bboxes, scores, labels = model(x)
+        print("Infer. time: {}".format(time.time() - t0, "s"))
+        
+        # Rescale bboxes
+        bboxes[..., 0::2] *= orig_w
+        bboxes[..., 1::2] *= orig_h
+
+        # vis detection
+        image = np.array(image).astype(np.uint8)
+        image = image[..., (2, 1, 0)].copy()
+        img_processed = visualize(
+            image, bboxes, scores, labels, args.visual_threshold, class_colors, class_names)
+        if args.show:
+            h, w = img_processed.shape[:2]
+            sw, sh = int(w*args.window_scale), int(h*args.window_scale)
+            cv2.namedWindow('detection', 0)
+            cv2.resizeWindow('detection', sw, sh)
+            cv2.imshow('detection', img_processed)
+            cv2.waitKey(0)
+
+        if args.save:
+            # save result
+            cv2.imwrite(os.path.join(save_path, str(index).zfill(6) +'.jpg'), img_processed)
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    # cuda
+    if args.cuda:
+        print('use cuda')
+        device = torch.device("cuda")
+    else:
+        device = torch.device("cpu")
+
+    # Dataset & Model Config
+    cfg = build_config(args)
+
+    # Transform
+    transform = build_transform(cfg, is_train=False)
+
+    # Dataset
+    dataset, dataset_info = build_dataset(args, is_train=False)
+
+    np.random.seed(0)
+    class_colors = [(np.random.randint(255),
+                     np.random.randint(255),
+                     np.random.randint(255))
+                     for _ in range(dataset_info['num_classes'])]
+
+    # Model
+    model = build_model(args, cfg, dataset_info['num_classes'], is_val=False)
+    model = load_weight(model, args.weight, args.fuse_conv_bn)
+    model.to(device).eval()
+
+    # Compute FLOPs and Params
+    model_copy = deepcopy(model)
+    model_copy.trainable = False
+    model_copy.eval()
+    compute_flops(
+        model=model_copy,
+        min_size=cfg['test_min_size'],
+        max_size=cfg['test_max_size'],
+        device=device)
+    del model_copy
+
+    # Resave model weight
+    if args.resave:
+        print('Resave: {}'.format(args.model.upper()))
+        checkpoint = torch.load(args.weight, map_location='cpu')
+        output_dir = 'weights/{}/{}/'.format(args.dataset, args.model)
+        os.makedirs(output_dir, exist_ok=True)
+        checkpoint_path = os.path.join(output_dir, "{}_pure.pth".format(args.model))
+        torch.save({'model': model.state_dict(),
+                    'mAP': checkpoint.pop("mAP"),
+                    'epoch': checkpoint.pop("epoch")}, 
+                    checkpoint_path)
+        
+    print("================= DETECT =================")
+    # run
+    run(args, model, device, dataset, transform, class_colors, dataset_info['class_labels'])

+ 52 - 0
odlab/train.sh

@@ -0,0 +1,52 @@
+# Args setting
+MODEL=$1
+DATASET=$2
+DATA_ROOT=$3
+WORLD_SIZE=$4
+MASTER_PORT=$5
+if [[ $MODEL == *"yolof"* ]]; then
+    # Epoch setting
+    BATCH_SIZE=64
+    EVAL_EPOCH=2
+elif [[ $MODEL == *"fcos"* ]]; then
+    # Epoch setting
+    BATCH_SIZE=16
+    EVAL_EPOCH=2
+elif [[ $MODEL == *"retinanet"* ]]; then
+    # Epoch setting
+    BATCH_SIZE=16
+    EVAL_EPOCH=2
+elif [[ $MODEL == *"plain_detr"* ]]; then
+    # Epoch setting
+    BATCH_SIZE=16
+    EVAL_EPOCH=2
+elif [[ $MODEL == *"rtdetr"* ]]; then
+    # Epoch setting
+    BATCH_SIZE=16
+    EVAL_EPOCH=1
+fi
+
+# -------------------------- Train Pipeline --------------------------
+if [ $WORLD_SIZE == 1 ]; then
+    python main.py \
+        --cuda \
+        --dataset ${DATASET}  \
+        --root ${DATA_ROOT} \
+        --model ${MODEL} \
+        --batch_size ${BATCH_SIZE} \
+        --eval_epoch ${EVAL_EPOCH}
+elif [[ $WORLD_SIZE -gt 1 && $WORLD_SIZE -le 8 ]]; then
+    python -m torch.distributed.run --nproc_per_node=$WORLD_SIZE --master_port ${MASTER_PORT}  \
+        main.py \
+        --cuda \
+        --distributed \
+        --dataset ${DATASET}  \
+        --root ${DATA_ROOT} \
+        --model ${MODEL} \
+        --batch_size ${BATCH_SIZE} \
+        --eval_epoch ${EVAL_EPOCH}
+else
+    echo "The WORLD_SIZE is set to a value greater than 8, indicating the use of multi-machine \
+          multi-card training mode, which is currently unsupported."
+    exit 1
+fi

+ 1 - 0
odlab/utils/__init__.py

@@ -0,0 +1 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

+ 204 - 0
odlab/utils/box_ops.py

@@ -0,0 +1,204 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+Utilities for bounding box manipulation and GIoU.
+"""
+import torch
+from torchvision.ops.boxes import box_area
+
+
+def get_ious(bboxes1,
+             bboxes2,
+             box_mode="xyxy",
+             iou_type="iou"):
+    """
+    Compute iou loss of type ['iou', 'giou', 'linear_iou']
+
+    Args:
+        inputs (tensor): pred values
+        targets (tensor): target values
+        weight (tensor): loss weight
+        box_mode (str): 'xyxy' or 'ltrb', 'ltrb' is currently supported.
+        loss_type (str): 'giou' or 'iou' or 'linear_iou'
+        reduction (str): reduction manner
+
+    Returns:
+        loss (tensor): computed iou loss.
+    """
+    if box_mode == "ltrb":
+        bboxes1 = torch.cat((-bboxes1[..., :2], bboxes1[..., 2:]), dim=-1)
+        bboxes2 = torch.cat((-bboxes2[..., :2], bboxes2[..., 2:]), dim=-1)
+    elif box_mode != "xyxy":
+        raise NotImplementedError
+
+    eps = torch.finfo(torch.float32).eps
+
+    bboxes1_area = (bboxes1[..., 2] - bboxes1[..., 0]).clamp_(min=0) \
+        * (bboxes1[..., 3] - bboxes1[..., 1]).clamp_(min=0)
+    bboxes2_area = (bboxes2[..., 2] - bboxes2[..., 0]).clamp_(min=0) \
+        * (bboxes2[..., 3] - bboxes2[..., 1]).clamp_(min=0)
+
+    w_intersect = (torch.min(bboxes1[..., 2], bboxes2[..., 2])
+                   - torch.max(bboxes1[..., 0], bboxes2[..., 0])).clamp_(min=0)
+    h_intersect = (torch.min(bboxes1[..., 3], bboxes2[..., 3])
+                   - torch.max(bboxes1[..., 1], bboxes2[..., 1])).clamp_(min=0)
+
+    area_intersect = w_intersect * h_intersect
+    area_union = bboxes2_area + bboxes1_area - area_intersect
+    ious = area_intersect / area_union.clamp(min=eps)
+
+    if iou_type == "iou":
+        return ious
+    elif iou_type == "giou":
+        g_w_intersect = torch.max(bboxes1[..., 2], bboxes2[..., 2]) \
+            - torch.min(bboxes1[..., 0], bboxes2[..., 0])
+        g_h_intersect = torch.max(bboxes1[..., 3], bboxes2[..., 3]) \
+            - torch.min(bboxes1[..., 1], bboxes2[..., 1])
+        ac_union = g_w_intersect * g_h_intersect
+        gious = ious - (ac_union - area_union) / ac_union.clamp(min=eps)
+        return gious
+    else:
+        raise NotImplementedError
+
+
+def delta2bbox(proposals,
+               deltas,
+               max_shape=None,
+               wh_ratio_clip=16 / 1000,
+               clip_border=True,
+               add_ctr_clamp=False,
+               ctr_clamp=32):
+
+    dxy = deltas[..., :2]
+    dwh = deltas[..., 2:]
+
+    # Compute width/height of each roi
+    pxy = proposals[..., :2]
+    pwh = proposals[..., 2:]
+
+    dxy_wh = pwh * dxy
+    wh_ratio_clip = torch.tensor(wh_ratio_clip).to(deltas.device)
+    max_ratio = torch.abs(torch.log(wh_ratio_clip))
+    if add_ctr_clamp:
+        dxy_wh = torch.clamp(dxy_wh, max=ctr_clamp, min=-ctr_clamp)
+        dwh = torch.clamp(dwh, max=max_ratio)
+    else:
+        dwh = dwh.clamp(min=-max_ratio, max=max_ratio)
+
+    gxy = pxy + dxy_wh
+    gwh = pwh * dwh.exp()
+    x1y1 = gxy - (gwh * 0.5)
+    x2y2 = gxy + (gwh * 0.5)
+    bboxes = torch.cat([x1y1, x2y2], dim=-1)
+    if clip_border and max_shape is not None:
+        bboxes[..., 0::2].clamp_(min=0).clamp_(max=max_shape[1])
+        bboxes[..., 1::2].clamp_(min=0).clamp_(max=max_shape[0])
+        
+    return bboxes
+
+
+def bbox2delta(proposals, gt, means=(0., 0., 0., 0.), stds=(1., 1., 1., 1.)):
+    # hack for matcher
+    if proposals.size() != gt.size():
+        proposals = proposals[:, None]
+        gt = gt[None]
+
+    proposals = proposals.float()
+    gt = gt.float()
+    px, py, pw, ph = proposals.unbind(-1)
+    gx, gy, gw, gh = gt.unbind(-1)
+
+    dx = (gx - px) / (pw + 0.1)
+    dy = (gy - py) / (ph + 0.1)
+    dw = torch.log(gw / (pw + 0.1))
+    dh = torch.log(gh / (ph + 0.1))
+    deltas = torch.stack([dx, dy, dw, dh], dim=-1)
+
+    means = deltas.new_tensor(means).unsqueeze(0)
+    stds = deltas.new_tensor(stds).unsqueeze(0)
+    deltas = deltas.sub_(means).div_(stds)
+
+    return deltas
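# A round-trip sketch for the two helpers above (illustrative only): boxes are in
# (cx, cy, w, h) form, and the +0.1 smoothing in the denominators makes the
# round trip only approximate.
import torch
anchor = torch.tensor([[5., 5., 4., 4.]])
gt     = torch.tensor([[6., 6., 4., 4.]])
deltas = bbox2delta(anchor, gt)
approx = delta2bbox(anchor, deltas)   # xyxy box close to [4., 4., 8., 8.]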
+
+
+def box_cxcywh_to_xyxy(x):
+    x_c, y_c, w, h = x.unbind(-1)
+    b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
+         (x_c + 0.5 * w), (y_c + 0.5 * h)]
+    return torch.stack(b, dim=-1)
+
+
+def box_xyxy_to_cxcywh(x):
+    x0, y0, x1, y1 = x.unbind(-1)
+    b = [(x0 + x1) / 2, (y0 + y1) / 2,
+         (x1 - x0), (y1 - y0)]
+    return torch.stack(b, dim=-1)
+
+
+# modified from torchvision to also return the union
+def box_iou(boxes1, boxes2):
+    area1 = box_area(boxes1)
+    area2 = box_area(boxes2)
+
+    lt = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
+    rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]
+
+    wh = (rb - lt).clamp(min=0)  # [N,M,2]
+    inter = wh[:, :, 0] * wh[:, :, 1]  # [N,M]
+
+    union = area1[:, None] + area2 - inter
+    union[union == 0.0] = 1.0
+
+    iou = inter / union
+    
+    return iou, union
+
+
+def generalized_box_iou(boxes1, boxes2):
+    """
+    Generalized IoU from https://giou.stanford.edu/
+
+    The boxes should be in [x0, y0, x1, y1] format
+
+    Returns a [N, M] pairwise matrix, where N = len(boxes1)
+    and M = len(boxes2)
+    """
+    # degenerate boxes gives inf / nan results
+    # so do an early check
+    assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
+    assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
+    iou, union = box_iou(boxes1, boxes2)
+
+    lt = torch.min(boxes1[:, None, :2], boxes2[:, :2])
+    rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
+
+    wh = (rb - lt).clamp(min=0)  # [N,M,2]
+    area = wh[:, :, 0] * wh[:, :, 1]
+
+    return iou - (area - union) / area
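# A tiny numeric check for generalized_box_iou (illustrative only):
import torch
b1 = torch.tensor([[0., 0., 10., 10.]])
b2 = torch.tensor([[0., 0., 10., 10.],
                   [5., 5., 15., 15.]])
print(generalized_box_iou(b1, b2))
# -> tensor([[ 1.0000, -0.0794]]): identical boxes give 1.0; for the shifted pair the
#    IoU is 25/175 and the enclosing-box penalty (225-175)/225 drives GIoU below zero.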
+
+
+def masks_to_boxes(masks):
+    """Compute the bounding boxes around the provided masks
+
+    The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions.
+
+    Returns a [N, 4] tensors, with the boxes in xyxy format
+    """
+    if masks.numel() == 0:
+        return torch.zeros((0, 4), device=masks.device)
+
+    h, w = masks.shape[-2:]
+
+    y = torch.arange(0, h, dtype=torch.float)
+    x = torch.arange(0, w, dtype=torch.float)
+    y, x = torch.meshgrid(y, x)
+
+    x_mask = (masks * x.unsqueeze(0))
+    x_max = x_mask.flatten(1).max(-1)[0]
+    x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]
+
+    y_mask = (masks * y.unsqueeze(0))
+    y_max = y_mask.flatten(1).max(-1)[0]
+    y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]
+
+    return torch.stack([x_min, y_min, x_max, y_max], 1)

+ 122 - 0
odlab/utils/distributed_utils.py

@@ -0,0 +1,122 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import os
+import subprocess
+
+import torch
+import torch.distributed as dist
+
+
+def reduce_dict(input_dict, average=True):
+    """
+    Args:
+        input_dict (dict): all the values will be reduced
+        average (bool): whether to do average or sum
+    Reduce the values in the dictionary from all processes so that all processes
+    have the averaged results. Returns a dict with the same fields as
+    input_dict, after reduction.
+    """
+    world_size = get_world_size()
+    if world_size < 2:
+        return input_dict
+    with torch.no_grad():
+        names = []
+        values = []
+        # sort the keys so that they are consistent across processes
+        for k in sorted(input_dict.keys()):
+            names.append(k)
+            values.append(input_dict[k])
+        values = torch.stack(values, dim=0)
+        dist.all_reduce(values)
+        if average:
+            values /= world_size
+        reduced_dict = {k: v for k, v in zip(names, values)}
+    return reduced_dict
+
+
+def get_sha():
+    cwd = os.path.dirname(os.path.abspath(__file__))
+
+    def _run(command):
+        return subprocess.check_output(command, cwd=cwd).decode('ascii').strip()
+    sha = 'N/A'
+    diff = "clean"
+    branch = 'N/A'
+    try:
+        sha = _run(['git', 'rev-parse', 'HEAD'])
+        subprocess.check_output(['git', 'diff'], cwd=cwd)
+        diff = _run(['git', 'diff-index', 'HEAD'])
+        diff = "has uncommited changes" if diff else "clean"
+        branch = _run(['git', 'rev-parse', '--abbrev-ref', 'HEAD'])
+    except Exception:
+        pass
+    message = f"sha: {sha}, status: {diff}, branch: {branch}"
+    return message
+
+
+def setup_for_distributed(is_master):
+    """
+    This function disables printing when not in master process
+    """
+    import builtins as __builtin__
+    builtin_print = __builtin__.print
+
+    def print(*args, **kwargs):
+        force = kwargs.pop('force', False)
+        if is_master or force:
+            builtin_print(*args, **kwargs)
+
+    __builtin__.print = print
+
+
+def is_dist_avail_and_initialized():
+    if not dist.is_available():
+        return False
+    if not dist.is_initialized():
+        return False
+    return True
+
+
+def get_world_size():
+    if not is_dist_avail_and_initialized():
+        return 1
+    return dist.get_world_size()
+
+
+def get_rank():
+    if not is_dist_avail_and_initialized():
+        return 0
+    return dist.get_rank()
+
+
+def is_main_process():
+    return get_rank() == 0
+
+
+def save_on_master(*args, **kwargs):
+    if is_main_process():
+        torch.save(*args, **kwargs)
+
+
+def init_distributed_mode(args):
+    if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
+        args.rank = int(os.environ["RANK"])
+        args.world_size = int(os.environ['WORLD_SIZE'])
+        args.gpu = int(os.environ['LOCAL_RANK'])
+    elif 'SLURM_PROCID' in os.environ:
+        args.rank = int(os.environ['SLURM_PROCID'])
+        args.gpu = args.rank % torch.cuda.device_count()
+    else:
+        print('Not using distributed mode')
+        args.distributed = False
+        return
+
+    args.distributed = True
+
+    torch.cuda.set_device(args.gpu)
+    args.dist_backend = 'nccl'
+    print('| distributed init (rank {}): {}'.format(
+        args.rank, args.dist_url), flush=True)
+    torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
+                                         world_size=args.world_size, rank=args.rank)
+    torch.distributed.barrier()
+    setup_for_distributed(args.rank == 0)

+ 98 - 0
odlab/utils/dn_compoments.py

@@ -0,0 +1,98 @@
+import torch
+from .box_ops import box_cxcywh_to_xyxy, box_xyxy_to_cxcywh
+
+
+def inverse_sigmoid(x, eps=1e-5):
+    x = x.clamp(min=0., max=1.)
+    return torch.log(x.clamp(min=eps) / (1 - x).clamp(min=eps))
+
+def get_contrastive_denoising_training_group(targets,
+                                             num_classes,
+                                             num_queries,
+                                             class_embed,
+                                             num_denoising=100,
+                                             label_noise_ratio=0.5,
+                                             box_noise_scale=1.0,):
+    if num_denoising <= 0:
+        return None, None, None, None
+
+    num_gts = [len(t['labels']) for t in targets]
+    device = targets[0]['labels'].device
+    
+    max_gt_num = max(num_gts)
+    if max_gt_num == 0:
+        return None, None, None, None
+
+    num_group = num_denoising // max_gt_num
+    num_group = 1 if num_group == 0 else num_group
+    # pad gt to max_num of a batch
+    bs = len(num_gts)
+
+    input_query_class = torch.full([bs, max_gt_num], num_classes, dtype=torch.int32, device=device)
+    input_query_bbox = torch.zeros([bs, max_gt_num, 4], device=device)
+    pad_gt_mask = torch.zeros([bs, max_gt_num], dtype=torch.bool, device=device)
+
+    for i in range(bs):
+        num_gt = num_gts[i]
+        if num_gt > 0:
+            input_query_class[i, :num_gt] = targets[i]['labels']
+            input_query_bbox[i, :num_gt] = targets[i]['boxes']
+            pad_gt_mask[i, :num_gt] = 1
+    # each group has positive and negative queries.
+    input_query_class = input_query_class.tile([1, 2 * num_group])
+    input_query_bbox = input_query_bbox.tile([1, 2 * num_group, 1])
+    pad_gt_mask = pad_gt_mask.tile([1, 2 * num_group])
+    # positive and negative mask
+    negative_gt_mask = torch.zeros([bs, max_gt_num * 2, 1], device=device)
+    negative_gt_mask[:, max_gt_num:] = 1
+    negative_gt_mask = negative_gt_mask.tile([1, num_group, 1])
+    positive_gt_mask = 1 - negative_gt_mask
+    # contrastive denoising training positive index
+    positive_gt_mask = positive_gt_mask.squeeze(-1) * pad_gt_mask
+    dn_positive_idx = torch.nonzero(positive_gt_mask)[:, 1]
+    dn_positive_idx = torch.split(dn_positive_idx, [n * num_group for n in num_gts])
+    # total denoising queries
+    num_denoising = int(max_gt_num * 2 * num_group)
+
+    if label_noise_ratio > 0:
+        mask = torch.rand_like(input_query_class, dtype=torch.float) < (label_noise_ratio * 0.5)
+        # randomly put a new one here
+        new_label = torch.randint_like(mask, 0, num_classes, dtype=input_query_class.dtype)
+        input_query_class = torch.where(mask & pad_gt_mask, new_label, input_query_class)
+
+    if box_noise_scale > 0:
+        known_bbox = box_cxcywh_to_xyxy(input_query_bbox)
+        diff = torch.tile(input_query_bbox[..., 2:] * 0.5, [1, 1, 2]) * box_noise_scale
+        rand_sign = torch.randint_like(input_query_bbox, 0, 2) * 2.0 - 1.0
+        rand_part = torch.rand_like(input_query_bbox)
+        rand_part = (rand_part + 1.0) * negative_gt_mask + rand_part * (1 - negative_gt_mask)
+        rand_part *= rand_sign
+        known_bbox += rand_part * diff
+        known_bbox.clip_(min=0.0, max=1.0)
+        input_query_bbox = box_xyxy_to_cxcywh(known_bbox)
+        input_query_bbox = inverse_sigmoid(input_query_bbox)
+    input_query_class = class_embed(input_query_class)
+
+    tgt_size = num_denoising + num_queries
+    # attn_mask = torch.ones([tgt_size, tgt_size], device=device) < 0
+    attn_mask = torch.full([tgt_size, tgt_size], False, dtype=torch.bool, device=device)
+    # match query cannot see the reconstruction
+    attn_mask[num_denoising:, :num_denoising] = True
+    
+    # reconstruct cannot see each other
+    for i in range(num_group):
+        if i == 0:
+            attn_mask[max_gt_num * 2 * i: max_gt_num * 2 * (i + 1), max_gt_num * 2 * (i + 1): num_denoising] = True
+        if i == num_group - 1:
+            attn_mask[max_gt_num * 2 * i: max_gt_num * 2 * (i + 1), :max_gt_num * i * 2] = True
+        else:
+            attn_mask[max_gt_num * 2 * i: max_gt_num * 2 * (i + 1), max_gt_num * 2 * (i + 1): num_denoising] = True
+            attn_mask[max_gt_num * 2 * i: max_gt_num * 2 * (i + 1), :max_gt_num * 2 * i] = True
+        
+    dn_meta = {
+        "dn_positive_idx": dn_positive_idx,
+        "dn_num_group": num_group,
+        "dn_num_split": [num_denoising, num_queries]
+    }
+
+    return input_query_class, input_query_bbox, attn_mask, dn_meta

+ 60 - 0
odlab/utils/lr_scheduler.py

@@ -0,0 +1,60 @@
+import torch
+
+
+# ------------------------- WarmUp LR Scheduler -------------------------
+## Warmup LR Scheduler
+class LinearWarmUpScheduler(object):
+    def __init__(self, base_lr=0.01, wp_iter=500, warmup_factor=0.00066667):
+        self.base_lr = base_lr
+        self.wp_iter = wp_iter
+        self.warmup_factor = warmup_factor
+
+
+    def set_lr(self, optimizer, lr):
+        for param_group in optimizer.param_groups:
+            init_lr = param_group['initial_lr']
+            ratio = init_lr / self.base_lr
+            param_group['lr'] = lr * ratio
+
+
+    def __call__(self, iter, optimizer):
+        # warmup
+        alpha = iter / self.wp_iter
+        warmup_factor = self.warmup_factor * (1 - alpha) + alpha
+        tmp_lr = self.base_lr * warmup_factor
+        self.set_lr(optimizer, tmp_lr)
+        
+## Build WP LR Scheduler
+def build_wp_lr_scheduler(cfg, base_lr=0.01):
+    print('==============================')
+    print('WarmUpScheduler: {}'.format(cfg['warmup']))
+    print('--base_lr: {}'.format(base_lr))
+    print('--warmup_iters: {}'.format(cfg['warmup_iters']))
+    print('--warmup_factor: {}'.format(cfg['warmup_factor']))
+
+    if cfg['warmup'] == 'linear':
+        wp_lr_scheduler = LinearWarmUpScheduler(base_lr, cfg['warmup_iters'], cfg['warmup_factor'])
+    else:
+        raise NotImplementedError("Unknown warmup scheduler: {}".format(cfg['warmup']))
+
+    return wp_lr_scheduler
+
+                           
+# ------------------------- LR Scheduler -------------------------
+def build_lr_scheduler(cfg, optimizer, resume=None):
+    print('==============================')
+    print('LR Scheduler: {}'.format(cfg['lr_scheduler']))
+
+    if cfg['lr_scheduler'] == 'step':
+        assert 'lr_epoch' in cfg
+        print('--lr_epoch: {}'.format(cfg['lr_epoch']))
+        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer=optimizer, milestones=cfg['lr_epoch'])
+    elif cfg['lr_scheduler'] == 'cosine':
+        # not implemented yet; fail loudly instead of returning an undefined scheduler
+        raise NotImplementedError("Cosine LR scheduler is not supported yet.")
+    else:
+        raise NotImplementedError("Unknown LR scheduler: {}".format(cfg['lr_scheduler']))
+        
+    if resume is not None:
+        print('keep training: ', resume)
+        checkpoint = torch.load(resume)
+        # checkpoint state dict
+        checkpoint_state_dict = checkpoint.pop("lr_scheduler")
+        lr_scheduler.load_state_dict(checkpoint_state_dict)
+
+    return lr_scheduler
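A minimal sketch of wiring the two builders together; the cfg keys mirror those read above, and the values are illustrative assumptions:

    import torch

    cfg = {'warmup': 'linear', 'warmup_iters': 500, 'warmup_factor': 0.00066667,
           'lr_scheduler': 'step', 'lr_epoch': [8, 11]}
    model = torch.nn.Linear(8, 8)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    lr_scheduler = build_lr_scheduler(cfg, optimizer)           # also records 'initial_lr' per param group
    wp_lr_scheduler = build_wp_lr_scheduler(cfg, base_lr=0.01)
    for it in range(cfg['warmup_iters']):
        wp_lr_scheduler(it, optimizer)                          # linear LR ramp over the first iterations
    # after warmup: call lr_scheduler.step() once per epoch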

+ 572 - 0
odlab/utils/misc.py

@@ -0,0 +1,572 @@
+# ---------------------------------------------------------------------------
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# ---------------------------------------------------------------------------
+import time
+import math
+import datetime
+import numpy as np
+from typing import List
+from thop import profile
+from copy import deepcopy
+from collections import defaultdict, deque
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.distributed as dist
+from torch import Tensor
+from .distributed_utils import is_dist_avail_and_initialized
+
+
+# ---------------------------- Train tools ----------------------------
+class SmoothedValue(object):
+    """Track a series of values and provide access to smoothed values over a
+    window or the global series average.
+    """
+
+    def __init__(self, window_size=20, fmt=None):
+        if fmt is None:
+            fmt = "{median:.4f} ({global_avg:.4f})"
+        self.deque = deque(maxlen=window_size)
+        self.total = 0.0
+        self.count = 0
+        self.fmt = fmt
+
+    def update(self, value, n=1):
+        self.deque.append(value)
+        self.count += n
+        self.total += value * n
+
+    def synchronize_between_processes(self):
+        """
+        Warning: does not synchronize the deque!
+        """
+        if not is_dist_avail_and_initialized():
+            return
+        t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
+        dist.barrier()
+        dist.all_reduce(t)
+        t = t.tolist()
+        self.count = int(t[0])
+        self.total = t[1]
+
+    @property
+    def median(self):
+        d = torch.tensor(list(self.deque))
+        return d.median().item()
+
+    @property
+    def avg(self):
+        d = torch.tensor(list(self.deque), dtype=torch.float32)
+        return d.mean().item()
+
+    @property
+    def global_avg(self):
+        return self.total / self.count
+
+    @property
+    def max(self):
+        return max(self.deque)
+
+    @property
+    def value(self):
+        return self.deque[-1]
+
+    def __str__(self):
+        return self.fmt.format(
+            median=self.median,
+            avg=self.avg,
+            global_avg=self.global_avg,
+            max=self.max,
+            value=self.value)
+
+class MetricLogger(object):
+    def __init__(self, delimiter="\t"):
+        self.meters = defaultdict(SmoothedValue)
+        self.delimiter = delimiter
+
+    def update(self, **kwargs):
+        for k, v in kwargs.items():
+            if isinstance(v, torch.Tensor):
+                v = v.item()
+            assert isinstance(v, (float, int))
+            self.meters[k].update(v)
+
+    def __getattr__(self, attr):
+        if attr in self.meters:
+            return self.meters[attr]
+        if attr in self.__dict__:
+            return self.__dict__[attr]
+        raise AttributeError("'{}' object has no attribute '{}'".format(
+            type(self).__name__, attr))
+
+    def __str__(self):
+        loss_str = []
+        for name, meter in self.meters.items():
+            loss_str.append(
+                "{}: {}".format(name, str(meter))
+            )
+        return self.delimiter.join(loss_str)
+
+    def synchronize_between_processes(self):
+        for meter in self.meters.values():
+            meter.synchronize_between_processes()
+
+    def add_meter(self, name, meter):
+        self.meters[name] = meter
+
+    def log_every(self, iterable, print_freq, header=None):
+        i = 0
+        if not header:
+            header = ''
+        start_time = time.time()
+        end = time.time()
+        iter_time = SmoothedValue(fmt='{avg:.4f}')
+        data_time = SmoothedValue(fmt='{avg:.4f}')
+        space_fmt = ':' + str(len(str(len(iterable)))) + 'd'
+        if torch.cuda.is_available():
+            log_msg = self.delimiter.join([
+                header,
+                '[{0' + space_fmt + '}/{1}]',
+                'eta: {eta}',
+                '{meters}',
+                'time: {time}',
+                'data: {data}',
+                'max mem: {memory:.0f}'
+            ])
+        else:
+            log_msg = self.delimiter.join([
+                header,
+                '[{0' + space_fmt + '}/{1}]',
+                'eta: {eta}',
+                '{meters}',
+                'time: {time}',
+                'data: {data}'
+            ])
+        MB = 1024.0 * 1024.0
+        for obj in iterable:
+            data_time.update(time.time() - end)
+            yield obj
+            iter_time.update(time.time() - end)
+            if i % print_freq == 0 or i == len(iterable) - 1:
+                eta_seconds = iter_time.global_avg * (len(iterable) - i)
+                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
+                if torch.cuda.is_available():
+                    print(log_msg.format(
+                        i, len(iterable), eta=eta_string,
+                        meters=str(self),
+                        time=str(iter_time), data=str(data_time),
+                        memory=torch.cuda.max_memory_allocated() / MB))
+                else:
+                    print(log_msg.format(
+                        i, len(iterable), eta=eta_string,
+                        meters=str(self),
+                        time=str(iter_time), data=str(data_time)))
+            i += 1
+            end = time.time()
+        total_time = time.time() - start_time
+        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+        print('{} Total time: {} ({:.4f} s / it)'.format(
+            header, total_time_str, total_time / len(iterable)))
+
+class SinkhornDistance(torch.nn.Module):
+    def __init__(self, eps=1e-3, max_iter=100, reduction='none'):
+        super(SinkhornDistance, self).__init__()
+        self.eps = eps
+        self.max_iter = max_iter
+        self.reduction = reduction
+
+    def forward(self, mu, nu, C):
+        u = torch.ones_like(mu)
+        v = torch.ones_like(nu)
+
+        # Sinkhorn iterations
+        for i in range(self.max_iter):
+            v = self.eps * \
+                (torch.log(
+                    nu + 1e-8) - torch.logsumexp(self.M(C, u, v).transpose(-2, -1), dim=-1)) + v
+            u = self.eps * \
+                (torch.log(
+                    mu + 1e-8) - torch.logsumexp(self.M(C, u, v), dim=-1)) + u
+
+        U, V = u, v
+        # Transport plan pi = diag(a)*K*diag(b)
+        pi = torch.exp(
+            self.M(C, U, V)).detach()
+        # Sinkhorn distance
+        cost = torch.sum(
+            pi * C, dim=(-2, -1))
+        return cost, pi
+
+    def M(self, C, u, v):
+        '''
+        "Modified cost for logarithmic updates"
+        "$M_{ij} = (-c_{ij} + u_i + v_j) / epsilon$"
+        '''
+        return (-C + u.unsqueeze(-1) + v.unsqueeze(-2)) / self.eps
+    
+
+# ---------------------------- Dataloader tools ----------------------------
+def _max_by_axis(the_list):
+    # type: (List[List[int]]) -> List[int]
+    maxes = the_list[0]
+    for sublist in the_list[1:]:
+        for index, item in enumerate(sublist):
+            maxes[index] = max(maxes[index], item)
+    return maxes
+
+def batch_tensor_from_tensor_list(tensor_list: List[Tensor]):
+    # TODO make this more general
+    if tensor_list[0].ndim == 3:
+        # TODO make it support different-sized images
+        max_size = _max_by_axis([list(img.shape) for img in tensor_list])
+        # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list]))
+        batch_shape = [len(tensor_list)] + max_size
+        b, c, h, w = batch_shape
+        dtype = tensor_list[0].dtype
+        device = tensor_list[0].device
+        tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
+        mask = torch.ones((b, h, w), dtype=torch.bool, device=device)
+        for img, pad_img, m in zip(tensor_list, tensor, mask):
+            pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
+            m[: img.shape[1], :img.shape[2]] = False
+    else:
+        raise ValueError('not supported')
+    
+    return tensor, mask
+
+def collate_fn(batch):
+    batch = list(zip(*batch))
+    batch[0] = batch_tensor_from_tensor_list(batch[0])
+
+    return tuple(batch)
+
+
+# ---------------------------- For Model ----------------------------
+def match_name_keywords(n, name_keywords):
+    out = False
+    for b in name_keywords:
+        if b in n:
+            out = True
+            break
+    return out
+
+## fuse Conv & BN layer
+def fuse_conv_bn(module):
+    """Recursively fuse conv and bn in a module.
+    During inference, the functionality of batch norm layers is turned off
+    and only the per-channel running mean and variance are used, which makes it
+    possible to fuse them into the preceding conv layers to save computation and
+    simplify network structures.
+    Args:
+        module (nn.Module): Module to be fused.
+    Returns:
+        nn.Module: Fused module.
+    """
+    last_conv = None
+    last_conv_name = None
+    
+    def _fuse_conv_bn(conv, bn):
+        """Fuse conv and bn into one module.
+        Args:
+            conv (nn.Module): Conv to be fused.
+            bn (nn.Module): BN to be fused.
+        Returns:
+            nn.Module: Fused module.
+        """
+        conv_w = conv.weight
+        conv_b = conv.bias if conv.bias is not None else torch.zeros_like(
+            bn.running_mean)
+
+        factor = bn.weight / torch.sqrt(bn.running_var + bn.eps)
+        conv.weight = nn.Parameter(conv_w *
+                                factor.reshape([conv.out_channels, 1, 1, 1]))
+        conv.bias = nn.Parameter((conv_b - bn.running_mean) * factor + bn.bias)
+        return conv
+    for name, child in module.named_children():
+        if isinstance(child,
+                      (nn.modules.batchnorm._BatchNorm, nn.SyncBatchNorm)):
+            if last_conv is None:  # only fuse BN that is after Conv
+                continue
+            fused_conv = _fuse_conv_bn(last_conv, child)
+            module._modules[last_conv_name] = fused_conv
+            # To reduce changes, set BN as Identity instead of deleting it.
+            module._modules[name] = nn.Identity()
+            last_conv = None
+        elif isinstance(child, nn.Conv2d):
+            last_conv = child
+            last_conv_name = name
+        else:
+            fuse_conv_bn(child)
+    return module
+
+## compute FLOPs & Parameters
+def compute_flops(model, min_size, max_size, device):
+    if isinstance(min_size[0], List):
+        min_size, max_size = min_size[0]
+    else:
+        min_size = min_size[0]
+
+    x = torch.randn(1, 3, min_size, max_size).to(device)
+    print('==============================')
+    flops, params = profile(model, inputs=(x, ), verbose=False)
+    print('GFLOPs : {:.2f}'.format(flops / 1e9))
+    print('Params : {:.2f} M'.format(params / 1e6))
+
+## load trained weight
+def load_weight(model, path_to_ckpt, fuse_cbn=False):
+    # check ckpt file
+    if path_to_ckpt is None:
+        print('no weight file ...')
+    else:
+        checkpoint = torch.load(path_to_ckpt, map_location='cpu')
+        print('--------------------------------------')
+        print('Best model infor:')
+        print('Epoch: {}'.format(checkpoint.pop("epoch")))
+        print('mAP: {}'.format(checkpoint.pop("mAP")))
+        print('--------------------------------------')
+        checkpoint_state_dict = checkpoint.pop("model")
+        model.load_state_dict(checkpoint_state_dict)
+
+        print('Finished loading model!')
+
+    # fuse conv & bn
+    if fuse_cbn:
+        print('Fusing Conv & BN ...')
+        model = fuse_conv_bn(model)
+
+    return model
+
+## gradient clip
+def get_total_grad_norm(parameters, norm_type=2):
+    parameters = list(filter(lambda p: p.grad is not None, parameters))
+    norm_type = float(norm_type)
+    device = parameters[0].grad.device
+    total_norm = torch.norm(torch.stack([torch.norm(p.grad.detach(), norm_type).to(device) for p in parameters]),
+                            norm_type)
+    return total_norm
+
+## param Dict
+def get_param_dict(model, cfg, return_name=False):
+    # sanity check: a variable could not match backbone_names and linear_proj_names at the same time
+    cfg['lr_backbone'] = cfg['base_lr'] * cfg['backbone_lr_ratio']
+    for n, p in model.named_parameters():
+        if match_name_keywords(n, cfg['lr_backbone_names']) and match_name_keywords(n, cfg['lr_linear_proj_names']):
+            raise ValueError
+
+    param_dicts = [
+        {
+            "params": [
+                p if not return_name else n
+                for n, p in model.named_parameters()
+                if not match_name_keywords(n, cfg['lr_backbone_names'])
+                and not match_name_keywords(n, cfg['lr_linear_proj_names'])
+                and not match_name_keywords(n, cfg['wd_norm_names'])
+                and p.requires_grad
+            ],
+            "lr": cfg['base_lr'],
+            "weight_decay": cfg['weight_decay'],
+        },
+        {
+            "params": [
+                p if not return_name else n
+                for n, p in model.named_parameters()
+                if match_name_keywords(n, cfg['lr_backbone_names'])
+                and not match_name_keywords(n, cfg['lr_linear_proj_names'])
+                and not match_name_keywords(n, cfg['wd_norm_names'])
+                and p.requires_grad
+            ],
+            "lr": cfg['lr_backbone'],
+            "weight_decay": cfg['weight_decay'],
+        },
+        {
+            "params": [
+                p if not return_name else n
+                for n, p in model.named_parameters()
+                if not match_name_keywords(n, cfg['lr_backbone_names'])
+                and match_name_keywords(n, cfg['lr_linear_proj_names'])
+                and not match_name_keywords(n, cfg['wd_norm_names'])
+                and p.requires_grad
+            ],
+            "lr": cfg['base_lr'] * cfg['lr_linear_proj_mult'],
+            "weight_decay": cfg['weight_decay'],
+        },
+        {
+            "params": [
+                p if not return_name else n
+                for n, p in model.named_parameters()
+                if not match_name_keywords(n, cfg['lr_backbone_names'])
+                and not match_name_keywords(n, cfg['lr_linear_proj_names'])
+                and match_name_keywords(n, cfg['wd_norm_names'])
+                and p.requires_grad
+            ],
+            "lr": cfg['base_lr'],
+            "weight_decay": cfg['weight_decay'] * cfg['wd_norm_mult'],
+        },
+        {
+            "params": [
+                p if not return_name else n
+                for n, p in model.named_parameters()
+                if match_name_keywords(n, cfg['lr_backbone_names'])
+                and not match_name_keywords(n, cfg['lr_linear_proj_names'])
+                and match_name_keywords(n, cfg['wd_norm_names'])
+                and p.requires_grad
+            ],
+            "lr": cfg['lr_backbone'],
+            "weight_decay": cfg['weight_decay'] * cfg['wd_norm_mult'],
+        },
+        {
+            "params": [
+                p if not return_name else n
+                for n, p in model.named_parameters()
+                if not match_name_keywords(n, cfg['lr_backbone_names'])
+                and match_name_keywords(n, cfg['lr_linear_proj_names'])
+                and match_name_keywords(n, cfg['wd_norm_names'])
+                and p.requires_grad
+            ],
+            "lr": cfg['base_lr'] * cfg['lr_linear_proj_mult'],
+            "weight_decay": cfg['weight_decay'] * cfg['wd_norm_mult'],
+        },
+    ]
+
+    return param_dicts
+
+## Model EMA
+class ModelEMA(object):
+    def __init__(self, cfg, model, updates=0):
+        # Create EMA
+        self.ema = deepcopy(self.de_parallel(model)).eval()  # FP32 EMA
+        self.updates = updates  # number of EMA updates
+        self.decay = lambda x: cfg['ema_decay'] * (1 - math.exp(-x / cfg['ema_tau']))  # decay exponential ramp (to help early epochs)
+        for p in self.ema.parameters():
+            p.requires_grad_(False)
+
+    def is_parallel(self, model):
+        # Returns True if model is of type DP or DDP
+        return type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel)
+
+    def de_parallel(self, model):
+        # De-parallelize a model: returns single-GPU model if model is of type DP or DDP
+        return model.module if self.is_parallel(model) else model
+
+    def copy_attr(self, a, b, include=(), exclude=()):
+        # Copy attributes from b to a, options to only include [...] and to exclude [...]
+        for k, v in b.__dict__.items():
+            if (len(include) and k not in include) or k.startswith('_') or k in exclude:
+                continue
+            else:
+                setattr(a, k, v)
+
+    def update(self, model):
+        # Update EMA parameters
+        self.updates += 1
+        d = self.decay(self.updates)
+
+        msd = self.de_parallel(model).state_dict()  # model state_dict
+        for k, v in self.ema.state_dict().items():
+            if v.dtype.is_floating_point:  # true for FP16 and FP32
+                v *= d
+                v += (1 - d) * msd[k].detach()
+        # assert v.dtype == msd[k].dtype == torch.float32, f'{k}: EMA {v.dtype} and model {msd[k].dtype} must be FP32'
+
+    def update_attr(self, model, include=(), exclude=('process_group', 'reducer')):
+        # Update EMA attributes
+        self.copy_attr(self.ema, model, include, exclude)
+
+
+# ---------------------------- For Loss ----------------------------
+## focal loss
+def sigmoid_focal_loss(inputs, targets, alpha: float = 0.25, gamma: float = 2):
+    """
+    Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
+    Args:
+        inputs: A float tensor of arbitrary shape.
+                The predictions for each example.
+        targets: A float tensor with the same shape as inputs. Stores the binary
+                 classification label for each element in inputs
+                (0 for the negative class and 1 for the positive class).
+        alpha: (optional) Weighting factor in range (0,1) to balance
+                positive vs negative examples. Default = 0.25.
+        gamma: Exponent of the modulating factor (1 - p_t) to
+               balance easy vs hard examples.
+    Returns:
+        Loss tensor
+    """
+    prob = inputs.sigmoid()
+    ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
+    p_t = prob * targets + (1 - prob) * (1 - targets)
+    loss = ce_loss * ((1 - p_t) ** gamma)
+
+    if alpha >= 0:
+        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
+        loss = alpha_t * loss
+
+    return loss
+
+
+# ---------------------------- NMS ----------------------------
+def nms(bboxes, scores, nms_thresh):
+    """"Pure Python NMS."""
+    x1 = bboxes[:, 0]  #xmin
+    y1 = bboxes[:, 1]  #ymin
+    x2 = bboxes[:, 2]  #xmax
+    y2 = bboxes[:, 3]  #ymax
+
+    areas = (x2 - x1) * (y2 - y1)
+    order = scores.argsort()[::-1]
+
+    keep = []
+    while order.size > 0:
+        i = order[0]
+        keep.append(i)
+        # compute iou
+        xx1 = np.maximum(x1[i], x1[order[1:]])
+        yy1 = np.maximum(y1[i], y1[order[1:]])
+        xx2 = np.minimum(x2[i], x2[order[1:]])
+        yy2 = np.minimum(y2[i], y2[order[1:]])
+
+        w = np.maximum(1e-10, xx2 - xx1)
+        h = np.maximum(1e-10, yy2 - yy1)
+        inter = w * h
+
+        iou = inter / (areas[i] + areas[order[1:]] - inter + 1e-14)
+        # keep only the boxes whose IoU with the current box is below the threshold
+        inds = np.where(iou <= nms_thresh)[0]
+        order = order[inds + 1]
+
+    return keep
+
+def multiclass_nms_class_agnostic(scores, labels, bboxes, nms_thresh):
+    # nms
+    keep = nms(bboxes, scores, nms_thresh)
+
+    scores = scores[keep]
+    labels = labels[keep]
+    bboxes = bboxes[keep]
+
+    return scores, labels, bboxes
+
+def multiclass_nms_class_aware(scores, labels, bboxes, nms_thresh, num_classes):
+    # nms
+    keep = np.zeros(len(bboxes), dtype=np.int32)
+    for i in range(num_classes):
+        inds = np.where(labels == i)[0]
+        if len(inds) == 0:
+            continue
+        c_bboxes = bboxes[inds]
+        c_scores = scores[inds]
+        c_keep = nms(c_bboxes, c_scores, nms_thresh)
+        keep[inds[c_keep]] = 1
+
+    keep = np.where(keep > 0)
+    scores = scores[keep]
+    labels = labels[keep]
+    bboxes = bboxes[keep]
+
+    return scores, labels, bboxes
+
+def multiclass_nms(scores, labels, bboxes, nms_thresh, num_classes, class_agnostic=False):
+    if class_agnostic:
+        return multiclass_nms_class_agnostic(scores, labels, bboxes, nms_thresh)
+    else:
+        return multiclass_nms_class_aware(scores, labels, bboxes, nms_thresh, num_classes)
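For reference, a minimal usage sketch of the NMS helpers defined above; the boxes, scores and labels are made-up values, and num_classes=80 simply mirrors the COCO setting used elsewhere in the repo:

import numpy as np

# three dummy detections: the first two overlap heavily and share a class
bboxes = np.array([[ 10.,  10.,  50.,  50.],
                   [ 12.,  12.,  52.,  52.],
                   [100., 100., 150., 150.]], dtype=np.float32)
scores = np.array([0.9, 0.8, 0.7], dtype=np.float32)
labels = np.array([0, 0, 1], dtype=np.int64)

# class-aware NMS: boxes are only suppressed within the same class
scores_a, labels_a, bboxes_a = multiclass_nms(scores, labels, bboxes,
                                              nms_thresh=0.5, num_classes=80,
                                              class_agnostic=False)

# class-agnostic NMS: all boxes compete regardless of their label
scores_b, labels_b, bboxes_b = multiclass_nms(scores, labels, bboxes,
                                              nms_thresh=0.5, num_classes=80,
                                              class_agnostic=True)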

+ 102 - 0
odlab/utils/optimizer.py

@@ -0,0 +1,102 @@
+import torch
+from torch import optim
+
+
+def build_optimizer(optimizer_cfg, model, param_dicts=None, resume=None):
+    print('==============================')
+    print('Optimizer: {}'.format(optimizer_cfg['optimizer']))
+    print('--base_lr: {}'.format(optimizer_cfg['base_lr']))
+    print('--backbone_lr_ratio: {}'.format(optimizer_cfg['backbone_lr_ratio']))
+    print('--momentum: {}'.format(optimizer_cfg['momentum']))
+    print('--weight_decay: {}'.format(optimizer_cfg['weight_decay']))
+
+    if param_dicts is None:
+        param_dicts = [
+            {"params": [p for n, p in model.named_parameters() if "backbone" not in n and p.requires_grad]},
+            {
+                "params": [p for n, p in model.named_parameters() if "backbone" in n and p.requires_grad],
+                "lr": optimizer_cfg['base_lr'] * optimizer_cfg['backbone_lr_ratio'],
+            },
+        ]
+
+    if optimizer_cfg['optimizer'] == 'sgd':
+        optimizer = optim.SGD(
+            params=param_dicts, 
+            lr=optimizer_cfg['base_lr'],
+            momentum=optimizer_cfg['momentum'],
+            weight_decay=optimizer_cfg['weight_decay']
+            )
+                                
+    elif optimizer_cfg['optimizer'] == 'adamw':
+        optimizer = optim.AdamW(
+            params=param_dicts, 
+            lr=optimizer_cfg['base_lr'],
+            weight_decay=optimizer_cfg['weight_decay']
+            )
+                                
+    start_epoch = 0
+    if resume is not None:
+        print('keep training: ', resume)
+        checkpoint = torch.load(resume)
+        # checkpoint state dict
+        checkpoint_state_dict = checkpoint.pop("optimizer")
+        optimizer.load_state_dict(checkpoint_state_dict)
+        start_epoch = checkpoint.pop("epoch") + 1
+                                                        
+    return optimizer, start_epoch
+
+
+def build_detr_optimizer(optimizer_cfg, model, resume=None):
+    print('==============================')
+    print('Optimizer: {}'.format(optimizer_cfg['optimizer']))
+    print('--base_lr: {}'.format(optimizer_cfg['base_lr']))
+    print('--backbone_lr_ratio: {}'.format(optimizer_cfg['backbone_lr_ratio']))
+    print('--weight_decay: {}'.format(optimizer_cfg['weight_decay']))
+
+    # ------------- Divide model's parameters -------------
+    param_dicts = [], [], [], [], [], [], []
+    norm_names = ["norm"] + ["norm{}".format(i) for i in range(10000)]
+    for n, p in model.named_parameters():
+        # Non-Backbone's learnable parameters
+        if "backbone" not in n and p.requires_grad:
+            if "bias" == n.split(".")[-1]:
+                param_dicts[0].append(p)      # no weight decay for all layers' bias
+            else:
+                if n.split(".")[-2] in norm_names:
+                    param_dicts[1].append(p)  # no weight decay for all NormLayers' weight
+                elif "cpb_mlp1" in n.split(".") or "cpb_mlp2" in n.split("."):
+                    param_dicts[2].append(p)  # no weight decay for plain-detr cpb_mlp weight
+                else:
+                    param_dicts[3].append(p)  # weight decay for all Non-NormLayers' weight
+        # Backbone's learnable parameters
+        elif "backbone" in n and p.requires_grad:
+            if "bias" == n.split(".")[-1]:
+                param_dicts[4].append(p)      # no weight decay for all layers' bias
+            else:
+                if n.split(".")[-2] in norm_names:
+                    param_dicts[5].append(p)  # no weight decay for all NormLayers' weight
+                else:
+                    param_dicts[6].append(p)  # weight decay for all Non-NormLayers' weight
+
+    # Non-Backbone's learnable parameters
+    optimizer = torch.optim.AdamW(param_dicts[0], lr=optimizer_cfg['base_lr'], weight_decay=0.0)
+    optimizer.add_param_group({"params": param_dicts[1], "weight_decay": 0.0})
+    optimizer.add_param_group({"params": param_dicts[2], "weight_decay": 0.0})
+    optimizer.add_param_group({"params": param_dicts[3], "weight_decay": optimizer_cfg['weight_decay']})
+
+    # Backbone's learnable parameters
+    backbone_lr = optimizer_cfg['base_lr'] * optimizer_cfg['backbone_lr_ratio']
+    optimizer.add_param_group({"params": param_dicts[4], "lr": backbone_lr, "weight_decay": 0.0})
+    optimizer.add_param_group({"params": param_dicts[5], "lr": backbone_lr, "weight_decay": 0.0})
+    optimizer.add_param_group({"params": param_dicts[6], "lr": backbone_lr, "weight_decay": optimizer_cfg['weight_decay']})
+
+    start_epoch = 0
+    if resume is not None:
+        print('keep training: ', resume)
+        checkpoint = torch.load(resume)
+        # checkpoint state dict
+        checkpoint_state_dict = checkpoint.pop("optimizer")
+        optimizer.load_state_dict(checkpoint_state_dict)
+        start_epoch = checkpoint.pop("epoch") + 1
+                                                        
+    return optimizer, start_epoch
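As a quick sanity check, here is a minimal sketch of calling build_optimizer; TinyDet and the config values are hypothetical, but the dict uses exactly the keys the function reads ('optimizer', 'base_lr', 'backbone_lr_ratio', 'momentum', 'weight_decay'):

import torch.nn as nn

class TinyDet(nn.Module):
    def __init__(self):
        super().__init__()
        self.backbone = nn.Conv2d(3, 16, 3, padding=1)
        self.head = nn.Conv2d(16, 4, 1)
    def forward(self, x):
        return self.head(self.backbone(x))

optimizer_cfg = {
    'optimizer': 'adamw',
    'base_lr': 1e-4,
    'backbone_lr_ratio': 0.1,   # backbone parameters train at base_lr * 0.1
    'momentum': 0.9,            # only used by the 'sgd' branch
    'weight_decay': 5e-2,
}
optimizer, start_epoch = build_optimizer(optimizer_cfg, TinyDet())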

+ 107 - 0
odlab/utils/plot_utils.py

@@ -0,0 +1,107 @@
+"""
+Plotting utilities to visualize training logs.
+"""
+import torch
+import pandas as pd
+import numpy as np
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+from pathlib import Path, PurePath
+
+
+def plot_logs(logs, fields=('class_error', 'loss_bbox_unscaled', 'mAP'), ewm_col=0, log_name='log.txt'):
+    '''
+    Function to plot specific fields from training log(s). Plots both training and test results.
+
+    :: Inputs - logs = list containing Path objects, each pointing to individual dir with a log file
+              - fields = which results to plot from each log file - plots both training and test for each field.
+              - ewm_col = optional, which column to use as the exponential weighted smoothing of the plots
+              - log_name = optional, name of log file if different than default 'log.txt'.
+
+    :: Outputs - matplotlib plots of results in fields, color coded for each log file.
+               - solid lines are training results, dashed lines are test results.
+
+    '''
+    func_name = "plot_utils.py::plot_logs"
+
+    # verify logs is a list of Paths (list[Paths]) or single Pathlib object Path,
+    # convert single Path to list to avoid 'not iterable' error
+
+    if not isinstance(logs, list):
+        if isinstance(logs, PurePath):
+            logs = [logs]
+            print(f"{func_name} info: logs param expects a list argument, converted to list[Path].")
+        else:
+            raise ValueError(f"{func_name} - invalid argument for logs parameter.\n \
+            Expect list[Path] or single Path obj, received {type(logs)}")
+
+    # Quality checks - verify valid dir(s), that every item in list is Path object, and that log_name exists in each dir
+    for i, dir in enumerate(logs):
+        if not isinstance(dir, PurePath):
+            raise ValueError(f"{func_name} - non-Path object in logs argument of {type(dir)}: \n{dir}")
+        if not dir.exists():
+            raise ValueError(f"{func_name} - invalid directory in logs argument:\n{dir}")
+        # verify log_name exists
+        fn = Path(dir / log_name)
+        if not fn.exists():
+            print(f"-> missing {log_name}.  Have you gotten to Epoch 1 in training?")
+            print(f"--> full path of missing log file: {fn}")
+            return
+
+    # load log file(s) and plot
+    dfs = [pd.read_json(Path(p) / log_name, lines=True) for p in logs]
+
+    fig, axs = plt.subplots(ncols=len(fields), figsize=(16, 5))
+
+    for df, color in zip(dfs, sns.color_palette(n_colors=len(logs))):
+        for j, field in enumerate(fields):
+            if field == 'mAP':
+                coco_eval = pd.DataFrame(
+                    np.stack(df.test_coco_eval_bbox.dropna().values)[:, 1]
+                ).ewm(com=ewm_col).mean()
+                axs[j].plot(coco_eval, c=color)
+            else:
+                df.interpolate().ewm(com=ewm_col).mean().plot(
+                    y=[f'train_{field}', f'test_{field}'],
+                    ax=axs[j],
+                    color=[color] * 2,
+                    style=['-', '--']
+                )
+    for ax, field in zip(axs, fields):
+        ax.legend([Path(p).name for p in logs])
+        ax.set_title(field)
+
+
+def plot_precision_recall(files, naming_scheme='iter'):
+    if naming_scheme == 'exp_id':
+        # name becomes exp_id
+        names = [f.parts[-3] for f in files]
+    elif naming_scheme == 'iter':
+        names = [f.stem for f in files]
+    else:
+        raise ValueError(f'not supported {naming_scheme}')
+    fig, axs = plt.subplots(ncols=2, figsize=(16, 5))
+    for f, color, name in zip(files, sns.color_palette("Blues", n_colors=len(files)), names):
+        data = torch.load(f)
+        # precision is n_iou, n_points, n_cat, n_area, max_det
+        precision = data['precision']
+        recall = data['params'].recThrs
+        scores = data['scores']
+        # take precision for all classes, all areas and 100 detections
+        precision = precision[0, :, :, 0, -1].mean(1)
+        scores = scores[0, :, :, 0, -1].mean(1)
+        prec = precision.mean()
+        rec = data['recall'][0, :, 0, -1].mean()
+        print(f'{naming_scheme} {name}: mAP@50={prec * 100: 05.1f}, ' +
+              f'score={scores.mean():0.3f}, ' +
+              f'f1={2 * prec * rec / (prec + rec + 1e-8):0.3f}'
+              )
+        axs[0].plot(recall, precision, c=color)
+        axs[1].plot(recall, scores, c=color)
+
+    axs[0].set_title('Precision / Recall')
+    axs[0].legend(names)
+    axs[1].set_title('Scores / Recall')
+    axs[1].legend(names)
+    return fig, axs
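A minimal sketch of using plot_logs, assuming DETR-style run directories that each contain a JSON-lines 'log.txt' with train_/test_ prefixed fields; the paths below are hypothetical:

from pathlib import Path
import matplotlib.pyplot as plt

log_dirs = [Path('outputs/fcos_r50_1x'), Path('outputs/retinanet_r50_1x')]  # hypothetical run dirs
plot_logs(log_dirs, fields=('loss', 'mAP'), ewm_col=0, log_name='log.txt')
plt.show()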

+ 177 - 0
odlab/utils/vis_tools.py

@@ -0,0 +1,177 @@
+import cv2
+import os
+import torch
+import numpy as np
+import matplotlib.pyplot as plt
+
+
+# -------------------------- For Detection Task --------------------------
+## visualize the input data during the training stage
+def vis_data(images, targets, masks=None, class_labels=None, normalized_coord=False, box_format='xyxy'):
+    """
+        images: (tensor) [B, 3, H, W]
+        masks: (Tensor) [B, H, W]
+        targets: (list) a list of targets
+    """
+    batch_size = images.size(0)
+    np.random.seed(0)
+    class_colors = [(np.random.randint(255),
+                     np.random.randint(255),
+                     np.random.randint(255)) for _ in range(80)]
+    pixel_means = [0.485, 0.456, 0.406]
+    pixel_std   = [0.229, 0.224, 0.225]
+
+    for bi in range(batch_size):
+        target = targets[bi]
+        # to numpy
+        image = images[bi].permute(1, 2, 0).cpu().numpy()
+        not_mask = ~masks[bi]
+        img_h = not_mask.cumsum(0, dtype=torch.int32)[-1, 0]
+        img_w = not_mask.cumsum(1, dtype=torch.int32)[0, -1]
+        # denormalize
+        image = (image * pixel_std + pixel_means) * 255
+        image = image[:, :, (2, 1, 0)].astype(np.uint8)
+        image = image.copy()
+
+        tgt_boxes = target['boxes'].float()
+        tgt_labels = target['labels'].long()
+        for box, label in zip(tgt_boxes, tgt_labels):
+            box_ = box.clone()
+            if normalized_coord:
+                box_[..., [0, 2]] *= img_w
+                box_[..., [1, 3]] *= img_h
+            if box_format == 'xywh':
+                box_x1y1 = box_[..., :2] - box_[..., 2:] * 0.5
+                box_x2y2 = box_[..., :2] + box_[..., 2:] * 0.5
+                box_ = torch.cat([box_x1y1, box_x2y2], dim=-1)
+            x1, y1, x2, y2 = box_.long().cpu().numpy()
+            
+            cls_id = label.item()
+            color = class_colors[cls_id]
+            # draw box
+            cv2.rectangle(image, (x1, y1), (x2, y2), color, 2)
+            if class_labels is not None:
+                class_name = class_labels[cls_id]
+                # plot title bbox
+                t_size = cv2.getTextSize(class_name, 0, fontScale=1, thickness=2)[0]
+                cv2.rectangle(image, (x1, y1-t_size[1]), (int(x1 + t_size[0] * 0.4), y1), color, -1)
+                # put the text on the title bbox
+                cv2.putText(image, class_name, (x1, y1 - 5), 0, 0.4, (0, 0, 0), 1, lineType=cv2.LINE_AA)
+
+        cv2.imshow('train target', image)
+        cv2.waitKey(0)
+
+## plot bbox & label on image
+def plot_bbox_labels(img, bbox, label=None, cls_color=None, text_scale=0.4):
+    x1, y1, x2, y2 = bbox
+    x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
+    t_size = cv2.getTextSize(label, 0, fontScale=1, thickness=2)[0]
+    # plot bbox
+    cv2.rectangle(img, (x1, y1), (x2, y2), cls_color, 2)
+    
+    if label is not None:
+        # plot title bbox
+        cv2.rectangle(img, (x1, y1-t_size[1]), (int(x1 + t_size[0] * text_scale), y1), cls_color, -1)
+        # put the text on the title bbox
+        cv2.putText(img, label, (int(x1), int(y1 - 5)), 0, text_scale, (0, 0, 0), 1, lineType=cv2.LINE_AA)
+
+    return img
+
+## visualize detection
+def visualize(img, 
+              bboxes, 
+              scores, 
+              labels, 
+              vis_thresh, 
+              class_colors, 
+              class_names):
+    ts = 0.4
+    for i, bbox in enumerate(bboxes):
+        if scores[i] > vis_thresh:
+            cls_id = int(labels[i])
+            cls_color = class_colors[cls_id]
+                
+            mess = '%s: %.2f' % (class_names[cls_id], scores[i])
+            img = plot_bbox_labels(img, bbox, mess, cls_color, text_scale=ts)
+
+    return img
+        
+
+## convert a feature map to a heatmap
+def convert_feature_heatmap(feature):
+    """
+        feature: (ndarray) [H, W, C]
+    """
+    # average over the channel dimension and min-max normalize to [0, 1]
+    heatmap = feature.mean(axis=-1)
+    heatmap = (heatmap - heatmap.min()) / (heatmap.max() - heatmap.min() + 1e-8)
+
+    return heatmap
+
+## draw feature on the image
+def draw_feature(img, features, save=None):
+    """
+        img: (ndarray & cv2.Mat) [H, W, C], where the C is 3 for RGB or 1 for Gray.
+        features: (List[ndarray]). It is a list of the multiple feature map whose shape is [H, W, C].
+        save: (bool) save the result or not.
+    """
+    img_h, img_w = img.shape[:2]
+
+    for i, fmp in enumerate(features):
+        hmp = convert_feature_heatmap(fmp)
+        hmp = cv2.resize(hmp, (img_w, img_h))
+        hmp = (hmp * 255).astype(np.uint8)
+        hmp_rgb = cv2.applyColorMap(hmp, cv2.COLORMAP_JET)
+        
+        superimposed_img = np.clip(hmp_rgb * 0.4 + img, 0, 255).astype(np.uint8)
+
+        # show the heatmap
+        plt.imshow(hmp)
+        plt.close()
+
+        # show the image with heatmap
+        cv2.imshow("image with heatmap", superimposed_img)
+        cv2.waitKey(0)
+        cv2.destroyAllWindows()
+
+        if save:
+            save_dir = 'feature_heatmap'
+            os.makedirs(save_dir, exist_ok=True)
+            cv2.imwrite(os.path.join(save_dir, 'feature_{}.png'.format(i) ), superimposed_img)    
+
+
+# -------------------------- For Tracking Task --------------------------
+def get_color(idx):
+    idx = idx * 3
+    color = ((37 * idx) % 255, (17 * idx) % 255, (29 * idx) % 255)
+
+    return color
+
+
+def plot_tracking(image, tlwhs, obj_ids, scores=None, frame_id=0, fps=0., ids2=None):
+    im = np.ascontiguousarray(np.copy(image))
+    im_h, im_w = im.shape[:2]
+
+    top_view = np.zeros([im_w, im_w, 3], dtype=np.uint8) + 255
+
+    #text_scale = max(1, image.shape[1] / 1600.)
+    #text_thickness = 2
+    #line_thickness = max(1, int(image.shape[1] / 500.))
+    text_scale = 2
+    text_thickness = 2
+    line_thickness = 3
+
+    radius = max(5, int(im_w/140.))
+    cv2.putText(im, 'frame: %d fps: %.2f num: %d' % (frame_id, fps, len(tlwhs)),
+                (0, int(15 * text_scale)), cv2.FONT_HERSHEY_PLAIN, 2, (0, 0, 255), thickness=2)
+
+    for i, tlwh in enumerate(tlwhs):
+        x1, y1, w, h = tlwh
+        intbox = tuple(map(int, (x1, y1, x1 + w, y1 + h)))
+        obj_id = int(obj_ids[i])
+        id_text = '{}'.format(int(obj_id))
+        if ids2 is not None:
+            id_text = id_text + ', {}'.format(int(ids2[i]))
+        color = get_color(abs(obj_id))
+        cv2.rectangle(im, intbox[0:2], intbox[2:4], color=color, thickness=line_thickness)
+        cv2.putText(im, id_text, (intbox[0], intbox[1]), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 0, 255),
+                    thickness=text_thickness)
+    return im
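A minimal sketch of the tracking visualizer above, drawing two made-up tlwh boxes on a blank frame:

import numpy as np
import cv2

frame = np.zeros((480, 640, 3), dtype=np.uint8)           # dummy frame
tlwhs = [(50., 60., 80., 120.), (300., 200., 60., 90.)]   # (x, y, w, h) per track
obj_ids = [1, 2]

vis = plot_tracking(frame, tlwhs, obj_ids, frame_id=0, fps=30.0)
cv2.imshow('tracking', vis)
cv2.waitKey(0)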

+ 110 - 0
odlab/utils/weight_init.py

@@ -0,0 +1,110 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+import math
+
+import torch.nn as nn
+
+
+def constant_init(module, val, bias=0):
+    nn.init.constant_(module.weight, val)
+    if hasattr(module, 'bias') and module.bias is not None:
+        nn.init.constant_(module.bias, bias)
+
+
+def xavier_init(module, gain=1, bias=0, distribution='normal'):
+    assert distribution in ['uniform', 'normal']
+    if distribution == 'uniform':
+        nn.init.xavier_uniform_(module.weight, gain=gain)
+    else:
+        nn.init.xavier_normal_(module.weight, gain=gain)
+    if hasattr(module, 'bias') and module.bias is not None:
+        nn.init.constant_(module.bias, bias)
+
+
+def normal_init(module, mean=0, std=1, bias=0):
+    nn.init.normal_(module.weight, mean, std)
+    if hasattr(module, 'bias') and module.bias is not None:
+        nn.init.constant_(module.bias, bias)
+
+
+def uniform_init(module, a=0, b=1, bias=0):
+    nn.init.uniform_(module.weight, a, b)
+    if hasattr(module, 'bias') and module.bias is not None:
+        nn.init.constant_(module.bias, bias)
+
+
+def kaiming_init(module,
+                 a=0,
+                 mode='fan_out',
+                 nonlinearity='relu',
+                 bias=0,
+                 distribution='normal'):
+    assert distribution in ['uniform', 'normal']
+    if distribution == 'uniform':
+        nn.init.kaiming_uniform_(module.weight,
+                                 a=a,
+                                 mode=mode,
+                                 nonlinearity=nonlinearity)
+    else:
+        nn.init.kaiming_normal_(module.weight,
+                                a=a,
+                                mode=mode,
+                                nonlinearity=nonlinearity)
+    if hasattr(module, 'bias') and module.bias is not None:
+        nn.init.constant_(module.bias, bias)
+
+
+def caffe2_xavier_init(module, bias=0):
+    # `XavierFill` in Caffe2 corresponds to `kaiming_uniform_` in PyTorch
+    # Acknowledgment to FAIR's internal code
+    kaiming_init(module,
+                 a=1,
+                 mode='fan_in',
+                 nonlinearity='leaky_relu',
+                 bias=bias,
+                 distribution='uniform')
+
+
+def c2_xavier_fill(module: nn.Module):
+    """
+    Initialize `module.weight` using the "XavierFill" implemented in Caffe2.
+    Also initializes `module.bias` to 0.
+
+    Args:
+        module (torch.nn.Module): module to initialize.
+    """
+    # Caffe2 implementation of XavierFill in fact
+    # corresponds to kaiming_uniform_ in PyTorch
+    nn.init.kaiming_uniform_(module.weight, a=1)
+    if module.bias is not None:
+        nn.init.constant_(module.bias, 0)
+
+
+def c2_msra_fill(module: nn.Module):
+    """
+    Initialize `module.weight` using the "MSRAFill" implemented in Caffe2.
+    Also initializes `module.bias` to 0.
+
+    Args:
+        module (torch.nn.Module): module to initialize.
+    """
+    nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
+    if module.bias is not None:
+        nn.init.constant_(module.bias, 0)
+
+
+def init_weights(m: nn.Module, zero_init_final_gamma=False):
+    """Performs ResNet-style weight initialization."""
+    if isinstance(m, nn.Conv2d):
+        # Note that there is no bias due to BN
+        fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+        m.weight.data.normal_(mean=0.0, std=math.sqrt(2.0 / fan_out))
+    elif isinstance(m, nn.BatchNorm2d):
+        zero_init_gamma = (
+            hasattr(m, "final_bn") and m.final_bn and zero_init_final_gamma
+        )
+        m.weight.data.fill_(0.0 if zero_init_gamma else 1.0)
+        m.bias.data.zero_()
+    elif isinstance(m, nn.Linear):
+        m.weight.data.normal_(mean=0.0, std=0.01)
+        m.bias.data.zero_()
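The ResNet-style initializer above is meant to be applied module-by-module; a minimal sketch (layer sizes are arbitrary) uses Module.apply to visit every submodule:

import torch.nn as nn

model = nn.Sequential(
    nn.Conv2d(3, 32, 3, padding=1, bias=False),  # hits the Conv2d branch
    nn.BatchNorm2d(32),                          # hits the BatchNorm2d branch
    nn.ReLU(inplace=True),
    nn.Flatten(),
    nn.Linear(32 * 8 * 8, 10),                   # hits the Linear branch
)
model.apply(init_weights)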

+ 10 - 0
yolo/.gitignore

@@ -0,0 +1,10 @@
+*.pt
+*.pth
+*.pkl
+*.onnx
+*.pyc
+*.zip
+weights
+__pycache__
+det_results
+.vscode

+ 0 - 0
LICENSE → yolo/LICENSE


+ 0 - 0
README.md → yolo/README.md


+ 0 - 0
config/__init__.py → yolo/config/__init__.py


+ 0 - 0
config/gelan_config.py → yolo/config/gelan_config.py


+ 0 - 0
config/rtdetr_config.py → yolo/config/rtdetr_config.py


+ 0 - 0
config/yolov1_config.py → yolo/config/yolov1_config.py


+ 0 - 0
config/yolov2_config.py → yolo/config/yolov2_config.py


+ 0 - 0
config/yolov3_config.py → yolo/config/yolov3_config.py


+ 0 - 0
config/yolov5_af_config.py → yolo/config/yolov5_af_config.py


+ 0 - 0
config/yolov5_config.py → yolo/config/yolov5_config.py


+ 0 - 0
config/yolov6_config.py → yolo/config/yolov6_config.py


+ 0 - 0
config/yolov7_af_config.py → yolo/config/yolov7_af_config.py


+ 0 - 0
config/yolov8_config.py → yolo/config/yolov8_config.py


+ 0 - 0
dataset/__init__.py → yolo/dataset/__init__.py


+ 0 - 0
dataset/build.py → yolo/dataset/build.py


+ 0 - 0
dataset/coco.py → yolo/dataset/coco.py


+ 0 - 0
dataset/customed.py → yolo/dataset/customed.py


+ 0 - 0
dataset/data_augment/ssd_augment.py → yolo/dataset/data_augment/ssd_augment.py


+ 0 - 0
dataset/data_augment/strong_augment.py → yolo/dataset/data_augment/strong_augment.py


+ 0 - 0
dataset/data_augment/yolo_augment.py → yolo/dataset/data_augment/yolo_augment.py


+ 0 - 0
dataset/scripts/COCO2017.sh → yolo/dataset/scripts/COCO2017.sh


+ 0 - 0
dataset/scripts/VOC2007.sh → yolo/dataset/scripts/VOC2007.sh


+ 0 - 0
dataset/scripts/VOC2012.sh → yolo/dataset/scripts/VOC2012.sh


Some files were not shown because too many files changed in this diff