release YOLOv8-s

yjh0410 · 1 year ago · parent commit 5183498321

+ 3 - 6
config/__init__.py

@@ -33,8 +33,7 @@ from .data_config.transform_config import (
     # SSD-Style
     ssd_trans_config,
     # RT-DETR style
-    rtdetr_s_trans_config,
-    rtdetr_l_trans_config,
+    rtdetr_base_trans_config,
 )
 
 def build_trans_config(trans_config='ssd'):
@@ -74,10 +73,8 @@ def build_trans_config(trans_config='ssd'):
         cfg = yolox_x_trans_config
 
     # RT-DETR style
-    elif trans_config == 'rtdetr_s':
-        cfg = rtdetr_s_trans_config
-    elif trans_config == 'rtdetr_l':
-        cfg = rtdetr_l_trans_config
+    elif trans_config == 'rtdetr_base':
+        cfg = rtdetr_base_trans_config
 
     print('Transform Config: {} \n'.format(cfg))
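The two RT-DETR transform presets collapse into a single entry, so every RT-DETR variant now resolves to the same augmentation recipe. A minimal usage sketch of the lookup after this change, assuming the script is run from the repository root so the config package shown above is importable:

# Sketch: resolving the unified RT-DETR transform preset.
from config import build_trans_config

trans_cfg = build_trans_config('rtdetr_base')  # prints the config and should return rtdetr_base_trans_config
print(trans_cfg['aug_type'])                   # expected: 'rtdetr'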
 

+ 1 - 26
config/data_config/transform_config.py

@@ -296,32 +296,7 @@ ssd_trans_config = {
 
 
 # ----------------------- SSD-Style Transform -----------------------
-rtdetr_s_trans_config = {
-    'aug_type': 'rtdetr',
-    'use_ablu': True,
-    'pixel_mean': [123.675, 116.28, 103.53],  # IN-1K statistics
-    'pixel_std':  [58.395, 57.12, 57.375],    # IN-1K statistics
-    # Basic Augment
-    'affine_params': {
-        'degrees': 0.0,
-        'translate': 0.2,
-        'scale': [0.1, 2.0],
-        'shear': 0.0,
-        'perspective': 0.0,
-        'hsv_h': 0.015,
-        'hsv_s': 0.7,
-        'hsv_v': 0.4,
-    },
-    # Mosaic & Mixup
-    'mosaic_keep_ratio': False,
-    'mosaic_prob': 0.0,
-    'mixup_prob':  0.0,
-    'mosaic_type': 'yolov5',
-    'mixup_type':  'yolov5',
-    'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp, just for YOLOXMixup
-}
-
-rtdetr_l_trans_config = {
+rtdetr_base_trans_config = {
     'aug_type': 'rtdetr',
     'use_ablu': True,
     'pixel_mean': [123.675, 116.28, 103.53],  # IN-1K statistics
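The surviving preset normalizes with ImageNet-1K statistics (the usual torchvision means/stds of 0.485/0.456/0.406 and 0.229/0.224/0.225 expressed on the 0-255 scale) rather than a plain /255 scaling. A quick sketch of what that normalization amounts to, assuming images arrive as HWC RGB uint8 arrays (the channel order is an assumption, not shown in this hunk):

import numpy as np

pixel_mean = np.array([123.675, 116.28, 103.53], dtype=np.float32)  # IN-1K mean, 0-255 scale
pixel_std  = np.array([58.395, 57.12, 57.375], dtype=np.float32)    # IN-1K std,  0-255 scale

def normalize(image_u8: np.ndarray) -> np.ndarray:
    """Sketch: per-channel (x - mean) / std, assuming an HWC RGB uint8 input."""
    return (image_u8.astype(np.float32) - pixel_mean) / pixel_std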

+ 3 - 3
config/model_config/rtdetr_config.py

@@ -53,7 +53,7 @@ rtdetr_cfg = {
         # ---------------- Train config ----------------
         ## input
         'multi_scale': [0.5, 1.25],   # 320 -> 800
-        'trans_type': 'rtdetr_s',
+        'trans_type': 'rtdetr_base',
         # ---------------- Train config ----------------
         'trainer_type': 'rtdetr',
     },
@@ -110,7 +110,7 @@ rtdetr_cfg = {
         # ---------------- Train config ----------------
         ## input
         'multi_scale': [0.5, 1.25],   # 320 -> 800
-        'trans_type': 'rtdetr_l',
+        'trans_type': 'rtdetr_base',
         # ---------------- Train config ----------------
         'trainer_type': 'rtdetr',
     },
@@ -167,7 +167,7 @@ rtdetr_cfg = {
         # ---------------- Train config ----------------
         ## input
         'multi_scale': [0.5, 1.25],   # 320 -> 800
-        'trans_type': 'rtdetr_l',
+        'trans_type': 'rtdetr_base',
         # ---------------- Train config ----------------
         'trainer_type': 'rtdetr',
     },

+ 2 - 2
config/model_config/yolov3_config.py

@@ -35,7 +35,7 @@ yolov3_cfg = {
         # ---------------- Train config ----------------
         ## input
         'trans_type': 'yolov5_l',
-        'multi_scale': [0.5, 1.0],
+        'multi_scale': [0.5, 1.25],  # 320 -> 800
         # ---------------- Assignment config ----------------
         ## matcher
         'iou_thresh': 0.5,
@@ -82,7 +82,7 @@ yolov3_cfg = {
         # ---------------- Train config ----------------
         ## input
         'trans_type': 'yolov5_n',
-        'multi_scale': [0.5, 1.0],
+        'multi_scale': [0.5, 1.25],  # 320 -> 800
         # ---------------- Assignment config ----------------
         ## matcher
         'iou_thresh': 0.5,

+ 2 - 2
config/model_config/yolov4_config.py

@@ -35,7 +35,7 @@ yolov4_cfg = {
         # ---------------- Train config ----------------
         ## input
         'trans_type': 'yolov5_l',
-        'multi_scale': [0.5, 1.0],
+        'multi_scale': [0.5, 1.25],  # 320 -> 800
         # ---------------- Assignment config ----------------
         ## matcher
         'iou_thresh': 0.5,
@@ -82,7 +82,7 @@ yolov4_cfg = {
         # ---------------- Train config ----------------
         ## input
         'trans_type': 'yolov5_n',
-        'multi_scale': [0.5, 1.0],
+        'multi_scale': [0.5, 1.25],  # 320 -> 800
         # ---------------- Assignment config ----------------
         ## matcher
         'iou_thresh': 0.5,
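The widened multi_scale range, applied identically to the RT-DETR, YOLOv3 and YOLOv4 configs in this commit, matches the "320 -> 800" comment: with a 640 base resolution, the two factors give 640 * 0.5 = 320 and 640 * 1.25 = 800. A sketch of how such a range is typically sampled; the 640 base size and the 32-pixel rounding are assumptions (32 is also the value passed to build_transform in dataset/coco.py below, presumably the maximum stride):

import random

def sample_train_size(base_size: int = 640, multi_scale=(0.5, 1.25), stride: int = 32) -> int:
    """Sketch: pick a training resolution from the multi-scale range,
    rounded to a multiple of the network stride."""
    low  = int(base_size * multi_scale[0])   # 640 * 0.5  = 320
    high = int(base_size * multi_scale[1])   # 640 * 1.25 = 800
    return random.randint(low // stride, high // stride) * stride

print(sample_train_size())  # some multiple of 32 in [320, 800]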

+ 16 - 14
dataset/coco.py

@@ -272,24 +272,26 @@ if __name__ == "__main__":
 
     trans_config = {
         'aug_type': args.aug_type,    # optional: ssd, yolov5
-        'pixel_mean': [0., 0., 0.],
-        'pixel_std':  [255., 255., 255.],
-        # Basic Augment
-        'degrees': 0.0,
-        'translate': 0.2,
-        'scale': [0.1, 2.0],
-        'shear': 0.0,
-        'perspective': 0.0,
-        'hsv_h': 0.015,
-        'hsv_s': 0.7,
-        'hsv_v': 0.4,
+        'pixel_mean': [123.675, 116.28, 103.53],
+        'pixel_std':  [58.395, 57.12, 57.375],
         'use_ablu': True,
+        # Basic Augment
+        'affine_params': {
+            'degrees': 0.0,
+            'translate': 0.2,
+            'scale': [0.1, 2.0],
+            'shear': 0.0,
+            'perspective': 0.0,
+            'hsv_h': 0.015,
+            'hsv_s': 0.7,
+            'hsv_v': 0.4,
+        },
         # Mosaic & Mixup
+        'mosaic_keep_ratio': False,
         'mosaic_prob': args.mosaic,
         'mixup_prob': args.mixup,
-        'mosaic_type': 'yolov5_mosaic',
-        'mixup_type': args.mixup_type,   # optional: yolov5_mixup, yolox_mixup
-        'mosaic_keep_ratio': False,
+        'mosaic_type': 'yolov5',
+        'mixup_type':  'yolov5',
         'mixup_scale': [0.5, 1.5]
     }
     transform, trans_cfg = build_transform(args, trans_config, 32, args.is_train)
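Note the schema change for downstream code: the affine/HSV knobs that used to sit at the top level of the transform config now live under a nested 'affine_params' dict, and the normalization switches to the IN-1K statistics. A minimal before/after sketch of the access pattern (variable names other than those shown above are illustrative):

# Old schema: flat keys on the transform config.
degrees = trans_config['degrees']
hsv_h   = trans_config['hsv_h']

# New schema (this commit): the same knobs nested under 'affine_params'.
affine  = trans_config['affine_params']
degrees = affine['degrees']
hsv_h   = affine['hsv_h']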

+ 1 - 0
demo.py

@@ -277,6 +277,7 @@ def run():
     model_cfg = build_model_config(args)
     trans_cfg = build_trans_config(model_cfg['trans_type'])
     data_cfg  = build_dataset_config(args)
+    
     ## Data info
     num_classes = data_cfg['num_classes']
     class_names = data_cfg['class_names']

+ 1 - 0
models/detectors/rtdetr/README.md

@@ -10,6 +10,7 @@ This model is not yet complete.
 
 - For the backbone of the image encoder, we use the IN-1K classification pretrained weight from torchvision, which is different from the official
 RT-DETR. It might be hard to train RT-DETR from scratch without IN-1K pretrained weight.
+- For the HybridEncoder, we use the C2f of YOLOv8 rather than the CSPRepLayer.
 - For training, we train RT-DETR series with 6x (~72 epochs) schedule on COCO and use ModelEMA trick. We close the fp16 training trick.
 - For data augmentation, we use the `color jitter`, `random hflip`, `random crop`, and multi-scale training trick.
 - For optimizer, we use AdamW with weight decay 0.0001 and base per image lr 0.0001 / 16.
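As I read the "base per image lr 0.0001 / 16" convention in the line above, the actual learning rate is scaled linearly with the total batch size. A quick sketch of that scaling (the helper itself is illustrative, not code from this repository):

def scaled_lr(total_batch_size: int, base_lr: float = 0.0001, reference_batch: int = 16) -> float:
    """Linear LR scaling: the learning rate grows in proportion to the total batch size."""
    return base_lr * total_batch_size / reference_batch

print(scaled_lr(16))  # 0.0001  (reference setting)
print(scaled_lr(64))  # 0.0004  (4x larger batch -> 4x larger lr)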

+ 1 - 1
models/detectors/yolov8/README.md

@@ -3,7 +3,7 @@
 |   Model   |  Batch | Scale | AP<sup>val<br>0.5:0.95 | AP<sup>val<br>0.5 | FLOPs<br><sup>(G) | Params<br><sup>(M) | Weight |
 |-----------|--------|-------|------------------------|-------------------|-------------------|--------------------|--------|
 | YOLOv8-N  | 8xb16  |  640  |          37.0          |        52.9       |        8.8        |         3.2        | [ckpt](https://github.com/yjh0410/RT-ODLab/releases/download/yolo_tutorial_ckpt/yolov8_n_coco.pth) |
-| YOLOv8-S  | 8xb16  |  640  |                        |                   |                   |                    |  |
+| YOLOv8-S  | 8xb16  |  640  |          43.5          |        60.4       |       28.8        |         11.2       | [ckpt](https://github.com/yjh0410/RT-ODLab/releases/download/yolo_tutorial_ckpt/yolov8_s_coco.pth) |
 | YOLOv8-M  | 8xb16  |  640  |                        |                   |                   |                    |  |
 | YOLOv8-L  | 8xb16  |  640  |          50.7          |        68.3       |       165.7       |         43.7       | [ckpt](https://github.com/yjh0410/RT-ODLab/releases/download/yolo_tutorial_ckpt/yolov8_l_coco.pth) |
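The newly released YOLOv8-S weights can be fetched from the ckpt link in the table. A short loading sketch; whether the file stores a raw state dict or wraps it under a key such as 'model' is an assumption, so adjust to the actual checkpoint layout:

import torch

# Assumes the checkpoint was downloaded from the release link above.
ckpt = torch.load("yolov8_s_coco.pth", map_location="cpu")

# Layout assumption: some checkpoints store the weights directly, others nest them.
state_dict = ckpt["model"] if isinstance(ckpt, dict) and "model" in ckpt else ckpt
print(list(state_dict)[:5])  # peek at the first few parameter names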
 

+ 3 - 3
models/detectors/yolov8/yolov8_backbone.py

@@ -105,9 +105,9 @@ if __name__ == '__main__':
         'bk_act': 'silu',
         'bk_norm': 'BN',
         'bk_depthwise': False,
-        'width': 1.0,
-        'depth': 1.0,
-        'ratio': 1.0,
+        'width': 0.25,
+        'depth': 0.34,
+        'ratio': 2.0,
     }
     model, feats = build_backbone(cfg)
     x = torch.randn(1, 3, 640, 640)

+ 5 - 5
models/detectors/yolov8/yolov8_pafpn.py

@@ -129,12 +129,12 @@ if __name__ == '__main__':
         'fpn_act': 'silu',
         'fpn_norm': 'BN',
         'fpn_depthwise': False,
-        'width': 1.0,
-        'depth': 1.0,
-        'ratio': 1.0,
+        'width': 0.25,
+        'depth': 0.34,
+        'ratio': 2.0,
     }
-    model = build_fpn(cfg, in_dims=[256, 512, 512])
-    pyramid_feats = [torch.randn(1, 256, 80, 80), torch.randn(1, 512, 40, 40), torch.randn(1, 512, 20, 20)]
+    model = build_fpn(cfg, in_dims=[64, 128, 256])
+    pyramid_feats = [torch.randn(1, 64, 80, 80), torch.randn(1, 128, 40, 40), torch.randn(1, 256, 20, 20)]
     t0 = time.time()
     outputs = model(pyramid_feats)
     t1 = time.time()
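The new test inputs in the backbone and FPN scripts are consistent with each other: with width 0.25 and ratio 2.0, the last three backbone stages come out at 64, 128 and 256 channels, which is exactly the in_dims fed to the FPN here. A sketch of that arithmetic; the 256/512/512 base widths are an assumption inferred from the old in_dims above:

def scaled_channels(width: float, ratio: float, base=(256, 512, 512)):
    """Sketch of YOLOv8-style width scaling: the final stage is additionally
    widened by `ratio` before the global `width` factor is applied."""
    c3, c4, c5 = base
    return (round(c3 * width), round(c4 * width), round(c5 * ratio * width))

print(scaled_channels(width=1.0,  ratio=1.0))   # (256, 512, 512) -- old test config
print(scaled_channels(width=0.25, ratio=2.0))   # (64, 128, 256)  -- new test config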

+ 6 - 2
models/detectors/yolov8/yolov8_pred.py

@@ -111,9 +111,13 @@ class MultiLevelPredLayer(nn.Module):
                                   for level in range(num_levels)
                                   ])
         ## proj conv
-        proj_init = torch.arange(reg_max, dtype=torch.float)
         self.proj_conv = nn.Conv2d(self.reg_max, 1, kernel_size=1, bias=False).requires_grad_(False)
-        self.proj_conv.weight.data[:] = nn.Parameter(proj_init.view([1, reg_max, 1, 1]))
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        proj_init = torch.arange(self.reg_max, dtype=torch.float)
+        self.proj_conv.weight.data[:] = nn.Parameter(proj_init.view([1, self.reg_max, 1, 1]), requires_grad=False)
 
     def forward(self, cls_feats, reg_feats):
         all_anchors = []
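The refactor only moves the initialization into _reset_parameters: proj_conv is the frozen integral projection of the DFL-style box head, its 1x1 weights fixed to 0..reg_max-1 so that convolving the per-bin probabilities yields the expected bin index. A standalone sketch of what that projection computes, assuming reg_max = 16 and a softmax over the bin dimension (both assumptions, not shown in this hunk):

import torch
import torch.nn as nn
import torch.nn.functional as F

reg_max = 16  # assumed value

# Frozen 1x1 conv whose weights are simply the bin indices 0..reg_max-1.
proj_conv = nn.Conv2d(reg_max, 1, kernel_size=1, bias=False).requires_grad_(False)
proj_conv.weight.data[:] = torch.arange(reg_max, dtype=torch.float).view(1, reg_max, 1, 1)

# Fake per-bin logits for one box side: [B, reg_max, H, W].
logits = torch.randn(1, reg_max, 20, 20)
probs = F.softmax(logits, dim=1)      # distribution over the reg_max bins
expected_bin = proj_conv(probs)       # [B, 1, H, W] = sum_i i * p_i, the distribution's mean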

+ 0 - 15
models/detectors/yolox/README.md

@@ -12,21 +12,6 @@
 - For optimizer, we use SGD with weight decay 0.0005 and base per image lr 0.01 / 64.
 - For learning rate scheduler, we use Cosine decay scheduler.
 
-On the other hand, we are trying to use **AdamW** to train our reproduced YOLOX. We will update the new results as soon as possible.
-
-|   Model | Batch | Scale | AP<sup>val<br>0.5:0.95 | AP<sup>val<br>0.5 | FLOPs<br><sup>(G) | Params<br><sup>(M) | Weight |
-|---------|-------|-------|------------------------|-------------------|-------------------|--------------------|--------|
-| YOLOX-N | 8xb16 |  640  |                        |                   |                   |                    |  |
-| YOLOX-S | 8xb16 |  640  |                        |                   |                   |                    |  |
-| YOLOX-M | 8xb16 |  640  |                        |                   |                   |                    |  |
-| YOLOX-L | 8xb16 |  640  |                        |                   |                   |                    |  |
-| YOLOX-X | 8xb16 |  640  |                        |                   |                   |                    |  |
-
-- For training, we train YOLOX series with 300 epochs on COCO.
-- For data augmentation, we use the large scale jitter (LSJ), Mosaic augmentation and Mixup augmentation.
-- For optimizer, we use AdamW with weight decay 0.05 and base per image lr 0.001 / 64,.
-- For learning rate scheduler, we use linear decay scheduler.
-
 ## Train YOLOX
 ### Single GPU
 Taking training YOLOX-S on COCO as the example,

+ 9 - 32
test.py

@@ -70,10 +70,6 @@ def parse_args():
     parser.add_argument('--load_cache', action='store_true', default=False,
                         help='load data into memory.')
 
-    # Task setting
-    parser.add_argument('-t', '--task', default='det', choices=['det', 'det_seg', 'det_pos', 'det_seg_pos'],
-                        help='task type.')
-
     return parser.parse_args()
 
 
@@ -131,18 +127,6 @@ def test_det(args,
             # save result
             cv2.imwrite(os.path.join(save_path, str(index).zfill(6) +'.jpg'), img_processed)
 
-@torch.no_grad()
-def test_det_seg():
-    pass
-
-@torch.no_grad()
-def test_det_pos():
-    pass
-
-@torch.no_grad()
-def test_det_seg_pos():
-    pass
-
 
 if __name__ == '__main__':
     args = parse_args()
@@ -199,19 +183,12 @@ if __name__ == '__main__':
         
     print("================= DETECT =================")
     # run
-    if args.task == "det":
-        test_det(args=args,
-                model=model, 
-                device=device, 
-                dataset=dataset,
-                transform=val_transform,
-                class_colors=class_colors,
-                class_names=dataset_info['class_names'],
-                class_indexs=dataset_info['class_indexs'],
-                )
-    elif args.task == "det_seg":
-        test_det_seg()
-    elif args.task == "det_pos":
-        test_det_pos()
-    elif args.task == "det_seg_pos":
-        test_det_seg_pos()
+    test_det(args=args,
+             model=model, 
+             device=device, 
+             dataset=dataset,
+             transform=val_transform,
+             class_colors=class_colors,
+             class_names=dataset_info['class_names'],
+             class_indexs=dataset_info['class_indexs'],
+             )