yjh0410 1 yıl önce
ebeveyn
işleme
3ddb801333

+ 3 - 6
config/__init__.py

@@ -33,9 +33,8 @@ from .data_config.transform_config import (
     # SSD-Style
     ssd_trans_config,
     # RT-DETR style
-    rtdetr_base_trans_config,
+    rtdetr_s_trans_config,
     rtdetr_l_trans_config,
-    rtdetr_x_trans_config
 )
 
 def build_trans_config(trans_config='ssd'):
@@ -75,12 +74,10 @@ def build_trans_config(trans_config='ssd'):
         cfg = yolox_x_trans_config
 
     # RT-DETR style
-    elif trans_config == 'rtdetr_base':
-        cfg = rtdetr_base_trans_config
+    elif trans_config == 'rtdetr_s':
+        cfg = rtdetr_s_trans_config
     elif trans_config == 'rtdetr_l':
         cfg = rtdetr_l_trans_config
-    elif trans_config == 'rtdetr_x':
-        cfg = rtdetr_x_trans_config
 
     print('Transform Config: {} \n'.format(cfg))
 

+ 236 - 204
config/data_config/transform_config.py

@@ -4,256 +4,280 @@
 # ----------------------- YOLOv5-Style Transform -----------------------
 yolov5_x_trans_config = {
     'aug_type': 'yolov5',
-    # Basic Augment
-    'degrees': 0.0,
-    'translate': 0.2,
-    'scale': [0.1, 2.0],
-    'shear': 0.0,
-    'perspective': 0.0,
-    'hsv_h': 0.015,
-    'hsv_s': 0.7,
-    'hsv_v': 0.4,
     'use_ablu': True,
+    # Basic Augment
+    'affine_params': {
+        'degrees': 0.0,
+        'translate': 0.2,
+        'scale': [0.1, 2.0],
+        'shear': 0.0,
+        'perspective': 0.0,
+        'hsv_h': 0.015,
+        'hsv_s': 0.7,
+        'hsv_v': 0.4,
+    },
     # Mosaic & Mixup
-    'mosaic_prob': 1.0,
-    'mixup_prob': 0.2,
-    'mosaic_type': 'yolov5_mosaic',
-    'mixup_type': 'yolov5_mixup',
     'mosaic_keep_ratio': True,
-    'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp
+    'mosaic_prob': 1.0,
+    'mixup_prob':  0.2,
+    'mosaic_type': 'yolov5',
+    'mixup_type':  'yolov5',
+    'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp, just for YOLOXMixup
 }
 
 yolov5_l_trans_config = {
     'aug_type': 'yolov5',
-    # Basic Augment
-    'degrees': 0.0,
-    'translate': 0.2,
-    'scale': [0.1, 2.0],
-    'shear': 0.0,
-    'perspective': 0.0,
-    'hsv_h': 0.015,
-    'hsv_s': 0.7,
-    'hsv_v': 0.4,
     'use_ablu': True,
+    # Basic Augment
+    'affine_params': {
+        'degrees': 0.0,
+        'translate': 0.2,
+        'scale': [0.1, 2.0],
+        'shear': 0.0,
+        'perspective': 0.0,
+        'hsv_h': 0.015,
+        'hsv_s': 0.7,
+        'hsv_v': 0.4,
+    },
     # Mosaic & Mixup
-    'mosaic_prob': 1.0,
-    'mixup_prob': 0.15,
-    'mosaic_type': 'yolov5_mosaic',
-    'mixup_type': 'yolov5_mixup',
     'mosaic_keep_ratio': True,
-    'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp
+    'mosaic_prob': 1.0,
+    'mixup_prob':  0.15,
+    'mosaic_type': 'yolov5',
+    'mixup_type':  'yolov5',
+    'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp, just for YOLOXMixup
 }
 
 yolov5_m_trans_config = {
     'aug_type': 'yolov5',
-    # Basic Augment
-    'degrees': 0.0,
-    'translate': 0.2,
-    'scale': [0.1, 2.0],
-    'shear': 0.0,
-    'perspective': 0.0,
-    'hsv_h': 0.015,
-    'hsv_s': 0.7,
-    'hsv_v': 0.4,
     'use_ablu': True,
+    # Basic Augment
+    'affine_params': {
+        'degrees': 0.0,
+        'translate': 0.2,
+        'scale': [0.1, 2.0],
+        'shear': 0.0,
+        'perspective': 0.0,
+        'hsv_h': 0.015,
+        'hsv_s': 0.7,
+        'hsv_v': 0.4,
+    },
     # Mosaic & Mixup
-    'mosaic_prob': 1.0,
-    'mixup_prob': 0.10,
-    'mosaic_type': 'yolov5_mosaic',
-    'mixup_type': 'yolov5_mixup',
     'mosaic_keep_ratio': True,
-    'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp
+    'mosaic_prob': 1.0,
+    'mixup_prob':  0.10,
+    'mosaic_type': 'yolov5',
+    'mixup_type':  'yolov5',
+    'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp, just for YOLOXMixup
 }
 
 yolov5_s_trans_config = {
     'aug_type': 'yolov5',
-    # Basic Augment
-    'degrees': 0.0,
-    'translate': 0.2,
-    'scale': [0.1, 2.0],
-    'shear': 0.0,
-    'perspective': 0.0,
-    'hsv_h': 0.015,
-    'hsv_s': 0.7,
-    'hsv_v': 0.4,
     'use_ablu': True,
+    # Basic Augment
+    'affine_params': {
+        'degrees': 0.0,
+        'translate': 0.2,
+        'scale': [0.1, 2.0],
+        'shear': 0.0,
+        'perspective': 0.0,
+        'hsv_h': 0.015,
+        'hsv_s': 0.7,
+        'hsv_v': 0.4,
+    },
     # Mosaic & Mixup
-    'mosaic_prob': 1.0,
-    'mixup_prob': 0.0,
-    'mosaic_type': 'yolov5_mosaic',
-    'mixup_type': 'yolov5_mixup',
     'mosaic_keep_ratio': True,
-    'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp
+    'mosaic_prob': 1.0,
+    'mixup_prob':  0.0,
+    'mosaic_type': 'yolov5',
+    'mixup_type':  'yolov5',
+    'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp, just for YOLOXMixup
 }
 
 yolov5_n_trans_config = {
     'aug_type': 'yolov5',
-    # Basic Augment
-    'degrees': 0.0,
-    'translate': 0.1,
-    'scale': [0.5, 1.5],
-    'shear': 0.0,
-    'perspective': 0.0,
-    'hsv_h': 0.015,
-    'hsv_s': 0.7,
-    'hsv_v': 0.4,
     'use_ablu': True,
+    # Basic Augment
+    'affine_params': {
+        'degrees': 0.0,
+        'translate': 0.1,
+        'scale': [0.5, 1.5],
+        'shear': 0.0,
+        'perspective': 0.0,
+        'hsv_h': 0.015,
+        'hsv_s': 0.7,
+        'hsv_v': 0.4,
+    },
     # Mosaic & Mixup
-    'mosaic_prob': 1.0,
-    'mixup_prob': 0.0,
-    'mosaic_type': 'yolov5_mosaic',
-    'mixup_type': 'yolov5_mixup',
     'mosaic_keep_ratio': True,
-    'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp
+    'mosaic_prob': 1.0,
+    'mixup_prob':  0.0,
+    'mosaic_type': 'yolov5',
+    'mixup_type':  'yolov5',
+    'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp, just for YOLOXMixup
 }
 
 yolov5_p_trans_config = {
     'aug_type': 'yolov5',
-    # Basic Augment
-    'degrees': 0.0,
-    'translate': 0.1,
-    'scale': [0.5, 1.5],
-    'shear': 0.0,
-    'perspective': 0.0,
-    'hsv_h': 0.015,
-    'hsv_s': 0.7,
-    'hsv_v': 0.4,
     'use_ablu': True,
+    # Basic Augment
+    'affine_params': {
+        'degrees': 0.0,
+        'translate': 0.1,
+        'scale': [0.5, 1.5],
+        'shear': 0.0,
+        'perspective': 0.0,
+        'hsv_h': 0.015,
+        'hsv_s': 0.7,
+        'hsv_v': 0.4,
+    },
     # Mosaic & Mixup
-    'mosaic_prob': 0.5,
-    'mixup_prob': 0.0,
-    'mosaic_type': 'yolov5_mosaic',
-    'mixup_type': 'yolov5_mixup',
     'mosaic_keep_ratio': True,
-    'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp
+    'mosaic_prob': 0.5,
+    'mixup_prob':  0.0,
+    'mosaic_type': 'yolov5',
+    'mixup_type':  'yolov5',
+    'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp, just for YOLOXMixup
 }
 
 
 # ----------------------- YOLOX-Style Transform -----------------------
 yolox_x_trans_config = {
     'aug_type': 'yolov5',
-    # Basic Augment
-    'degrees': 10.0,
-    'translate': 0.1,
-    'scale': [0.1, 2.0],
-    'shear': 2.0,
-    'perspective': 0.0,
-    'hsv_h': 0.015,
-    'hsv_s': 0.7,
-    'hsv_v': 0.4,
     'use_ablu': False,
+    # Basic Augment
+    'affine_params': {
+        'degrees': 10.0,
+        'translate': 0.1,
+        'scale': [0.1, 2.0],
+        'shear': 2.0,
+        'perspective': 0.0,
+        'hsv_h': 0.015,
+        'hsv_s': 0.7,
+        'hsv_v': 0.4,
+    },
     # Mosaic & Mixup
-    'mosaic_prob': 1.0,
-    'mixup_prob': 1.0,
-    'mosaic_type': 'yolov5_mosaic',
-    'mixup_type': 'yolox_mixup',
     'mosaic_keep_ratio': True,
-    'mixup_scale': [0.5, 1.5]
+    'mosaic_prob': 1.0,
+    'mixup_prob':  1.0,
+    'mosaic_type': 'yolov5',
+    'mixup_type':  'yolox',
+    'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp, just for YOLOXMixup
 }
 
 yolox_l_trans_config = {
     'aug_type': 'yolov5',
-    # Basic Augment
-    'degrees': 10.0,
-    'translate': 0.1,
-    'scale': [0.1, 2.0],
-    'shear': 2.0,
-    'perspective': 0.0,
-    'hsv_h': 0.015,
-    'hsv_s': 0.7,
-    'hsv_v': 0.4,
     'use_ablu': False,
+    # Basic Augment
+    'affine_params': {
+        'degrees': 10.0,
+        'translate': 0.1,
+        'scale': [0.1, 2.0],
+        'shear': 2.0,
+        'perspective': 0.0,
+        'hsv_h': 0.015,
+        'hsv_s': 0.7,
+        'hsv_v': 0.4,
+    },
     # Mosaic & Mixup
-    'mosaic_prob': 1.0,
-    'mixup_prob': 1.0,
-    'mosaic_type': 'yolov5_mosaic',
-    'mixup_type': 'yolox_mixup',
     'mosaic_keep_ratio': True,
-    'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp
+    'mosaic_prob': 1.0,
+    'mixup_prob':  1.0,
+    'mosaic_type': 'yolov5',
+    'mixup_type':  'yolox',
+    'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp, just for YOLOXMixup
 }
 
 yolox_m_trans_config = {
     'aug_type': 'yolov5',
-    # Basic Augment
-    'degrees': 10.0,
-    'translate': 0.1,
-    'scale': [0.1, 2.0],
-    'shear': 2.0,
-    'perspective': 0.0,
-    'hsv_h': 0.015,
-    'hsv_s': 0.7,
-    'hsv_v': 0.4,
     'use_ablu': False,
+    # Basic Augment
+    'affine_params': {
+        'degrees': 10.0,
+        'translate': 0.1,
+        'scale': [0.1, 2.0],
+        'shear': 2.0,
+        'perspective': 0.0,
+        'hsv_h': 0.015,
+        'hsv_s': 0.7,
+        'hsv_v': 0.4,
+    },
     # Mosaic & Mixup
-    'mosaic_prob': 1.0,
-    'mixup_prob': 1.0,
-    'mosaic_type': 'yolov5_mosaic',
-    'mixup_type': 'yolox_mixup',
     'mosaic_keep_ratio': True,
-    'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp
+    'mosaic_prob': 1.0,
+    'mixup_prob':  1.0,
+    'mosaic_type': 'yolov5',
+    'mixup_type':  'yolox',
+    'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp, just for YOLOXMixup
 }
 
 yolox_s_trans_config = {
     'aug_type': 'yolov5',
-    # Basic Augment
-    'degrees': 10.0,
-    'translate': 0.1,
-    'scale': [0.1, 2.0],
-    'shear': 2.0,
-    'perspective': 0.0,
-    'hsv_h': 0.015,
-    'hsv_s': 0.7,
-    'hsv_v': 0.4,
     'use_ablu': False,
+    # Basic Augment
+    'affine_params': {
+        'degrees': 10.0,
+        'translate': 0.1,
+        'scale': [0.1, 2.0],
+        'shear': 2.0,
+        'perspective': 0.0,
+        'hsv_h': 0.015,
+        'hsv_s': 0.7,
+        'hsv_v': 0.4,
+    },
     # Mosaic & Mixup
-    'mosaic_prob': 1.0,
-    'mixup_prob': 1.0,
-    'mosaic_type': 'yolov5_mosaic',
-    'mixup_type': 'yolox_mixup',
     'mosaic_keep_ratio': True,
-    'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp
+    'mosaic_prob': 1.0,
+    'mixup_prob':  1.0,
+    'mosaic_type': 'yolov5',
+    'mixup_type':  'yolox',
+    'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp, just for YOLOXMixup
 }
 
 yolox_n_trans_config = {
     'aug_type': 'yolov5',
-    # Basic Augment
-    'degrees': 10.0,
-    'translate': 0.1,
-    'scale': [0.5, 1.5],
-    'shear': 2.0,
-    'perspective': 0.0,
-    'hsv_h': 0.015,
-    'hsv_s': 0.7,
-    'hsv_v': 0.4,
     'use_ablu': False,
+    # Basic Augment
+    'affine_params': {
+        'degrees': 10.0,
+        'translate': 0.1,
+        'scale': [0.1, 2.0],
+        'shear': 2.0,
+        'perspective': 0.0,
+        'hsv_h': 0.015,
+        'hsv_s': 0.7,
+        'hsv_v': 0.4,
+    },
     # Mosaic & Mixup
-    'mosaic_prob': 1.0,
-    'mixup_prob': 0.5,
-    'mosaic_type': 'yolov5_mosaic',
-    'mixup_type': 'yolox_mixup',
     'mosaic_keep_ratio': True,
-    'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp
+    'mosaic_prob': 1.0,
+    'mixup_prob':  0.5,
+    'mosaic_type': 'yolov5',
+    'mixup_type':  'yolox',
+    'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp, just for YOLOXMixup
 }
 
 yolox_p_trans_config = {
     'aug_type': 'yolov5',
-    # Basic Augment
-    'degrees': 10.0,
-    'translate': 0.1,
-    'scale': [0.5, 1.5],
-    'shear': 2.0,
-    'perspective': 0.0,
-    'hsv_h': 0.015,
-    'hsv_s': 0.7,
-    'hsv_v': 0.4,
     'use_ablu': False,
+    # Basic Augment
+    'affine_params': {
+        'degrees': 10.0,
+        'translate': 0.1,
+        'scale': [0.1, 2.0],
+        'shear': 2.0,
+        'perspective': 0.0,
+        'hsv_h': 0.015,
+        'hsv_s': 0.7,
+        'hsv_v': 0.4,
+    },
     # Mosaic & Mixup
-    'mosaic_prob': 0.5,
-    'mixup_prob': 0.0,
-    'mosaic_type': 'yolov5_mosaic',
-    'mixup_type': 'yolox_mixup',
     'mosaic_keep_ratio': True,
-    'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp
+    'mosaic_prob': 0.5,
+    'mixup_prob':  0.0,
+    'mosaic_type': 'yolov5',
+    'mixup_type':  'yolox',
+    'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp, just for YOLOXMixup
 }
 
 
@@ -262,54 +286,62 @@ ssd_trans_config = {
     'aug_type': 'ssd',
     'use_ablu': False,
     # Mosaic & Mixup are not used for SSD-style augmentation
-    'mosaic_prob': 0.,
-    'mixup_prob': 0.,
-    'mosaic_type': 'yolov5_mosaic',
-    'mixup_type': 'yolov5_mixup',
     'mosaic_keep_ratio': False,
-    'mixup_scale': [0.5, 1.5]
+    'mosaic_prob': 0.0,
+    'mixup_prob':  0.0,
+    'mosaic_type': 'yolov5',
+    'mixup_type':  'yolov5',
+    'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp, just for YOLOXMixup
 }
 
 
 # ----------------------- RT-DETR-Style Transform -----------------------
-rtdetr_base_trans_config = {
+rtdetr_s_trans_config = {
     'aug_type': 'rtdetr',
-    'use_ablu': False,
+    'use_ablu': True,
     'pixel_mean': [123.675, 116.28, 103.53],  # IN-1K statistics
     'pixel_std':  [58.395, 57.12, 57.375],    # IN-1K statistics
-    # Mosaic & Mixup are not used for RT_DETR-style augmentation
-    'mosaic_prob': 0.,
-    'mixup_prob': 0.,
-    'mosaic_type': 'yolov5_mosaic',
-    'mixup_type': 'yolov5_mixup',
+    # Basic Augment
+    'affine_params': {
+        'degrees': 0.0,
+        'translate': 0.2,
+        'scale': [0.1, 2.0],
+        'shear': 0.0,
+        'perspective': 0.0,
+        'hsv_h': 0.015,
+        'hsv_s': 0.7,
+        'hsv_v': 0.4,
+    },
+    # Mosaic & Mixup
     'mosaic_keep_ratio': False,
-    'mixup_scale': [0.5, 1.5]
+    'mosaic_prob': 1.0,
+    'mixup_prob':  0.0,
+    'mosaic_type': 'yolov5',
+    'mixup_type':  'yolov5',
+    'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp, just for YOLOXMixup
 }
 
 rtdetr_l_trans_config = {
     'aug_type': 'rtdetr',
-    'use_ablu': False,
-    'pixel_mean': [0., 0., 0.],
-    'pixel_std':  [255., 255., 255.],
-    # Mosaic & Mixup are not used for RT_DETR-style augmentation
-    'mosaic_prob': 0.,
-    'mixup_prob': 0.,
-    'mosaic_type': 'yolov5_mosaic',
-    'mixup_type': 'yolov5_mixup',
-    'mosaic_keep_ratio': False,
-    'mixup_scale': [0.5, 1.5]
-}
-
-rtdetr_x_trans_config = {
-    'aug_type': 'rtdetr',
-    'use_ablu': False,
-    'pixel_mean': [0., 0., 0.],
-    'pixel_std':  [255., 255., 255.],
-    # Mosaic & Mixup are not used for RT_DETR-style augmentation
-    'mosaic_prob': 0.,
-    'mixup_prob': 0.,
-    'mosaic_type': 'yolov5_mosaic',
-    'mixup_type': 'yolov5_mixup',
+    'use_ablu': True,
+    'pixel_mean': [123.675, 116.28, 103.53],  # IN-1K statistics
+    'pixel_std':  [58.395, 57.12, 57.375],    # IN-1K statistics
+    # Basic Augment
+    'affine_params': {
+        'degrees': 0.0,
+        'translate': 0.2,
+        'scale': [0.1, 2.0],
+        'shear': 0.0,
+        'perspective': 0.0,
+        'hsv_h': 0.015,
+        'hsv_s': 0.7,
+        'hsv_v': 0.4,
+    },
+    # Mosaic & Mixup
     'mosaic_keep_ratio': False,
-    'mixup_scale': [0.5, 1.5]
+    'mosaic_prob': 1.0,
+    'mixup_prob':  0.15,
+    'mosaic_type': 'yolov5',
+    'mixup_type':  'yolov5',
+    'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp, just for YOLOXMixup
 }

+ 2 - 2
config/model_config/rtdetr_config.py

@@ -54,7 +54,7 @@ rtdetr_cfg = {
         # ---------------- Train config ----------------
         ## input
         'multi_scale': [0.5, 1.25],   # 320 -> 800
-        'trans_type': 'rtdetr_base',
+        'trans_type': 'rtdetr_s',
         # ---------------- Train config ----------------
         'trainer_type': 'rtdetr',
     },
@@ -112,7 +112,7 @@ rtdetr_cfg = {
         # ---------------- Train config ----------------
         ## input
         'multi_scale': [0.5, 1.25],   # 320 -> 800
-        'trans_type': 'rtdetr_base',
+        'trans_type': 'rtdetr_l',
         # ---------------- Train config ----------------
         'trainer_type': 'rtdetr',
     },

+ 8 - 7
dataset/build.py

@@ -111,23 +111,24 @@ def build_transform(args, trans_config, max_stride=32, is_train=False):
     ## SSD style transform
     if trans_config['aug_type'] == 'ssd':
         if is_train:
-            transform = SSDAugmentation(img_size=args.img_size,)
+            transform = SSDAugmentation(args.img_size)
         else:
-            transform = SSDBaseTransform(img_size=args.img_size,)
+            transform = SSDBaseTransform(args.img_size)
     ## YOLO style transform
     elif trans_config['aug_type'] == 'yolov5':
         if is_train:
-            transform = YOLOv5Augmentation(img_size=args.img_size, trans_config=trans_config, use_ablu=trans_config['use_ablu'])
+            transform = YOLOv5Augmentation(args.img_size, trans_config['affine_params'], trans_config['use_ablu'])
         else:
-            transform = YOLOv5BaseTransform(img_size=args.img_size,max_stride=max_stride)
+            transform = YOLOv5BaseTransform(args.img_size, max_stride)
     ## RT-DETR style transform
     elif trans_config['aug_type'] == 'rtdetr':
         if is_train:
-            use_mosaic = False if trans_config['mosaic_prob'] < 0.2 else True
             transform = RTDetrAugmentation(
-                img_size=args.img_size, pixel_mean=trans_config['pixel_mean'], pixel_std=trans_config['pixel_std'], use_mosaic=use_mosaic)
+                args.img_size, trans_config['pixel_mean'], trans_config['pixel_std'])
+            if trans_config["mosaic_prob"] > 0:
+                transform.reset_weak_augment()
         else:
             transform = RTDetrBaseTransform(
-                img_size=args.img_size, pixel_mean=trans_config['pixel_mean'], pixel_std=trans_config['pixel_std'])
+                args.img_size, trans_config['pixel_mean'], trans_config['pixel_std'])
 
     return transform, trans_config

+ 25 - 22
dataset/coco.py

@@ -3,8 +3,6 @@ import cv2
 import time
 import random
 import numpy as np
-
-import torch
 from torch.utils.data import Dataset
 
 try:
@@ -13,9 +11,9 @@ except:
     print("It seems that the COCOAPI is not installed.")
 
 try:
-    from .data_augment.yolov5_augment import yolov5_mosaic_augment, yolov5_mixup_augment, yolox_mixup_augment
+    from .data_augment.strong_augment import MosaicAugment, MixupAugment
 except:
-    from data_augment.yolov5_augment import yolov5_mosaic_augment, yolov5_mixup_augment, yolox_mixup_augment
+    from  data_augment.strong_augment import MosaicAugment, MixupAugment
 
 
 coco_class_index = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90]
@@ -52,10 +50,19 @@ class COCODataset(Dataset):
         self.class_ids = sorted(self.coco.getCatIds())
         self.dataset_size = len(self.ids)
         # ----------- Transform parameters -----------
-        self.transform = transform
-        self.mosaic_prob = trans_config['mosaic_prob'] if trans_config else 0.0
-        self.mixup_prob = trans_config['mixup_prob'] if trans_config else 0.0
         self.trans_config = trans_config
+        self.transform = transform
+        # ----------- Strong augmentation -----------
+        if is_train:
+            self.mosaic_prob = trans_config['mosaic_prob'] if trans_config else 0.0
+            self.mixup_prob  = trans_config['mixup_prob']  if trans_config else 0.0
+            self.mosaic_augment = MosaicAugment(img_size, trans_config, is_train)
+            self.mixup_augment  = MixupAugment(img_size, trans_config)
+        else:
+            self.mosaic_prob = 0.0
+            self.mixup_prob  = 0.0
+            self.mosaic_augment = None
+            self.mixup_augment  = None
         print('==============================')
         print('use Mosaic Augmentation: {}'.format(self.mosaic_prob))
         print('use Mixup Augmentation: {}'.format(self.mixup_prob))
@@ -66,7 +73,6 @@ class COCODataset(Dataset):
         if self.load_cache:
             self.cached_datas = self._load_cache()
 
-
     # ------------ Basic dataset function ------------
     def __len__(self):
         return len(self.ids)
@@ -108,13 +114,14 @@ class COCODataset(Dataset):
 
     # ------------ Mosaic & Mixup ------------
     def load_mosaic(self, index):
-        # load 4x mosaic image
+        # ------------ Prepare 4 indexes of images ------------
+        ## Load 4x mosaic image
         index_list = np.arange(index).tolist() + np.arange(index+1, len(self.ids)).tolist()
         id1 = index
         id2, id3, id4 = random.sample(index_list, 3)
         indexs = [id1, id2, id3, id4]
 
-        # load images and targets
+        ## Load images and targets
         image_list = []
         target_list = []
         for index in indexs:
@@ -122,26 +129,22 @@ class COCODataset(Dataset):
             image_list.append(img_i)
             target_list.append(target_i)
 
-        # Mosaic
-        if self.trans_config['mosaic_type'] == 'yolov5_mosaic':
-            image, target = yolov5_mosaic_augment(
-                image_list, target_list, self.img_size, self.trans_config, self.trans_config['mosaic_keep_ratio'], self.is_train)
+        # ------------ Mosaic augmentation ------------
+        image, target = self.mosaic_augment(image_list, target_list)
 
         return image, target
 
     def load_mixup(self, origin_image, origin_target):
-        # YOLOv5 type Mixup
-        if self.trans_config['mixup_type'] == 'yolov5_mixup':
+        # ------------ Load a new image & target ------------
+        if self.mixup_augment.mixup_type == 'yolov5':
             new_index = np.random.randint(0, len(self.ids))
             new_image, new_target = self.load_mosaic(new_index)
-            image, target = yolov5_mixup_augment(
-                origin_image, origin_target, new_image, new_target)
-        # YOLOX type Mixup
-        elif self.trans_config['mixup_type'] == 'yolox_mixup':
+        elif self.mixup_augment.mixup_type == 'yolox':
             new_index = np.random.randint(0, len(self.ids))
             new_image, new_target = self.load_image_target(new_index)
-            image, target = yolox_mixup_augment(
-                origin_image, origin_target, new_image, new_target, self.img_size, self.trans_config['mixup_scale'])
+            
+        # ------------ Mixup augmentation ------------
+        image, target = self.mixup_augment(origin_image, origin_target, new_image, new_target)
 
         return image, target
     

+ 25 - 20
dataset/crowdhuman.py

@@ -11,9 +11,9 @@ except:
     print("It seems that the COCOAPI is not installed.")
 
 try:
-    from .data_augment.yolov5_augment import yolov5_mosaic_augment, yolov5_mixup_augment, yolox_mixup_augment
+    from .data_augment.strong_augment import MosaicAugment, MixupAugment
 except:
-    from data_augment.yolov5_augment import yolov5_mosaic_augment, yolov5_mixup_augment, yolox_mixup_augment
+    from  data_augment.strong_augment import MosaicAugment, MixupAugment
 
 
 crowd_class_labels = ('person',)
@@ -40,12 +40,20 @@ class CrowdHumanDataset(Dataset):
         self.coco = COCO(os.path.join(self.data_dir, 'annotations', self.json_file))
         self.ids = self.coco.getImgIds()
         self.class_ids = sorted(self.coco.getCatIds())
-
         # ----------- Transform parameters -----------
-        self.transform = transform
-        self.mosaic_prob = trans_config['mosaic_prob'] if trans_config else 0.0
-        self.mixup_prob = trans_config['mixup_prob'] if trans_config else 0.0
         self.trans_config = trans_config
+        self.transform = transform
+        # ----------- Strong augmentation -----------
+        if is_train:
+            self.mosaic_prob = trans_config['mosaic_prob'] if trans_config else 0.0
+            self.mixup_prob  = trans_config['mixup_prob']  if trans_config else 0.0
+            self.mosaic_augment = MosaicAugment(img_size, trans_config, is_train)
+            self.mixup_augment  = MixupAugment(img_size, trans_config)
+        else:
+            self.mosaic_prob = 0.0
+            self.mixup_prob  = 0.0
+            self.mosaic_augment = None
+            self.mixup_augment  = None
         print('==============================')
         print('use Mosaic Augmentation: {}'.format(self.mosaic_prob))
         print('use Mixup Augmentation: {}'.format(self.mixup_prob))
@@ -60,13 +68,14 @@ class CrowdHumanDataset(Dataset):
 
     # ------------ Mosaic & Mixup ------------
     def load_mosaic(self, index):
-        # load 4x mosaic image
+        # ------------ Prepare 4 indexes of images ------------
+        ## Load 4x mosaic image
         index_list = np.arange(index).tolist() + np.arange(index+1, len(self.ids)).tolist()
         id1 = index
         id2, id3, id4 = random.sample(index_list, 3)
         indexs = [id1, id2, id3, id4]
 
-        # load images and targets
+        ## Load images and targets
         image_list = []
         target_list = []
         for index in indexs:
@@ -74,26 +83,22 @@ class CrowdHumanDataset(Dataset):
             image_list.append(img_i)
             target_list.append(target_i)
 
-        # Mosaic
-        if self.trans_config['mosaic_type'] == 'yolov5_mosaic':
-            image, target = yolov5_mosaic_augment(
-                image_list, target_list, self.img_size, self.trans_config, self.trans_config['mosaic_keep_ratio'], self.is_train)
+        # ------------ Mosaic augmentation ------------
+        image, target = self.mosaic_augment(image_list, target_list)
 
         return image, target
 
     def load_mixup(self, origin_image, origin_target):
-        # YOLOv5 type Mixup
-        if self.trans_config['mixup_type'] == 'yolov5_mixup':
+        # ------------ Load a new image & target ------------
+        if self.mixup_augment.mixup_type == 'yolov5':
             new_index = np.random.randint(0, len(self.ids))
             new_image, new_target = self.load_mosaic(new_index)
-            image, target = yolov5_mixup_augment(
-                origin_image, origin_target, new_image, new_target)
-        # YOLOX type Mixup
-        elif self.trans_config['mixup_type'] == 'yolox_mixup':
+        elif self.mixup_augment.mixup_type == 'yolox':
             new_index = np.random.randint(0, len(self.ids))
             new_image, new_target = self.load_image_target(new_index)
-            image, target = yolox_mixup_augment(
-                origin_image, origin_target, new_image, new_target, self.img_size, self.trans_config['mixup_scale'])
+            
+        # ------------ Mixup augmentation ------------
+        image, target = self.mixup_augment(origin_image, origin_target, new_image, new_target)
 
         return image, target
     

+ 25 - 24
dataset/customed.py

@@ -4,7 +4,6 @@ import time
 import random
 import numpy as np
 
-import torch
 from torch.utils.data import Dataset
 
 try:
@@ -13,9 +12,9 @@ except:
     print("It seems that the COCOAPI is not installed.")
 
 try:
-    from .data_augment.yolov5_augment import yolov5_mosaic_augment, yolov5_mixup_augment, yolox_mixup_augment
+    from .data_augment.strong_augment import MosaicAugment, MixupAugment
 except:
-    from data_augment.yolov5_augment import yolov5_mosaic_augment, yolov5_mixup_augment, yolox_mixup_augment
+    from  data_augment.strong_augment import MosaicAugment, MixupAugment
 
 
 class CustomedDataset(Dataset):
@@ -41,14 +40,19 @@ class CustomedDataset(Dataset):
         self.class_ids = sorted(self.coco.getCatIds())
         self.dataset_size = len(self.ids)
         # ----------- Transform parameters -----------
-        self.transform = transform
-        self.mosaic_prob = 0
-        self.mixup_prob = 0
         self.trans_config = trans_config
-        if trans_config is not None:
-            self.mosaic_prob = trans_config['mosaic_prob']
-            self.mixup_prob = trans_config['mixup_prob']
-
+        self.transform = transform
+        # ----------- Strong augmentation -----------
+        if is_train:
+            self.mosaic_prob = trans_config['mosaic_prob'] if trans_config else 0.0
+            self.mixup_prob  = trans_config['mixup_prob']  if trans_config else 0.0
+            self.mosaic_augment = MosaicAugment(img_size, trans_config, is_train)
+            self.mixup_augment  = MixupAugment(img_size, trans_config)
+        else:
+            self.mosaic_prob = 0.0
+            self.mixup_prob  = 0.0
+            self.mosaic_augment = None
+            self.mixup_augment  = None
         print('==============================')
         print('Image Set: {}'.format(image_set))
         print('Json file: {}'.format(self.json_file))
@@ -103,13 +107,14 @@ class CustomedDataset(Dataset):
 
     # ------------ Mosaic & Mixup ------------
     def load_mosaic(self, index):
-        # load 4x mosaic image
+        # ------------ Prepare 4 indexes of images ------------
+        ## Load 4x mosaic image
         index_list = np.arange(index).tolist() + np.arange(index+1, len(self.ids)).tolist()
         id1 = index
         id2, id3, id4 = random.sample(index_list, 3)
         indexs = [id1, id2, id3, id4]
 
-        # load images and targets
+        ## Load images and targets
         image_list = []
         target_list = []
         for index in indexs:
@@ -117,26 +122,22 @@ class CustomedDataset(Dataset):
             image_list.append(img_i)
             target_list.append(target_i)
 
-        # Mosaic
-        if self.trans_config['mosaic_type'] == 'yolov5_mosaic':
-            image, target = yolov5_mosaic_augment(
-                image_list, target_list, self.img_size, self.trans_config, self.trans_config['mosaic_keep_ratio'], self.is_train)
+        # ------------ Mosaic augmentation ------------
+        image, target = self.mosaic_augment(image_list, target_list)
 
         return image, target
 
     def load_mixup(self, origin_image, origin_target):
-        # YOLOv5 type Mixup
-        if self.trans_config['mixup_type'] == 'yolov5_mixup':
+        # ------------ Load a new image & target ------------
+        if self.mixup_augment.mixup_type == 'yolov5':
             new_index = np.random.randint(0, len(self.ids))
             new_image, new_target = self.load_mosaic(new_index)
-            image, target = yolov5_mixup_augment(
-                origin_image, origin_target, new_image, new_target)
-        # YOLOX type Mixup
-        elif self.trans_config['mixup_type'] == 'yolox_mixup':
+        elif self.mixup_augment.mixup_type == 'yolox':
             new_index = np.random.randint(0, len(self.ids))
             new_image, new_target = self.load_image_target(new_index)
-            image, target = yolox_mixup_augment(
-                origin_image, origin_target, new_image, new_target, self.img_size, self.trans_config['mixup_scale'])
+            
+        # ------------ Mixup augmentation ------------
+        image, target = self.mixup_augment(origin_image, origin_target, new_image, new_target)
 
         return image, target
     

+ 14 - 26
dataset/data_augment/rtdetr_augment.py

@@ -401,10 +401,9 @@ class ToTensor(object):
 # ------------------------- Preprocessers -------------------------
 ## Transform for Train
 class RTDetrAugmentation(object):
-    def __init__(self, img_size=640, pixel_mean=[123.675, 116.28, 103.53], pixel_std=[58.395, 57.12, 57.375], use_mosaic=False):
+    def __init__(self, img_size=640, pixel_mean=[123.675, 116.28, 103.53], pixel_std=[58.395, 57.12, 57.375]):
         # ----------------- Basic parameters -----------------
         self.img_size = img_size
-        self.use_mosaic = use_mosaic
         self.pixel_mean = pixel_mean  # RGB format
         self.pixel_std = pixel_std    # RGB format
         self.color_format = 'rgb'
@@ -413,29 +412,18 @@ class RTDetrAugmentation(object):
         print("Pixel std:  {}".format(self.pixel_std))
 
         # ----------------- Transforms -----------------
-        if use_mosaic:
-            # For use-mosaic setting, we do not use RandomSampleCrop processor.
-            self.augment = Compose([
-                RandomPhotometricDistort(hue=0.5, saturation=1.5, exposure=1.5),
-                RandomHorizontalFlip(p=0.5),
-                Resize(img_size=self.img_size),
-                ConvertColorFormat(self.color_format),
-                Normalize(self.pixel_mean, self.pixel_std),
-                ToTensor()
-            ])
-        else:
-            # For no-mosaic setting, we use RandomExpand & RandomSampleCrop processor.
-            self.augment = Compose([
-                RandomPhotometricDistort(hue=0.5, saturation=1.5, exposure=1.5),
-                RandomJitterCrop(p=0.8, jitter_ratio=0.3, fill_value=self.pixel_mean[::-1]),
-                RandomHorizontalFlip(p=0.5),
-                Resize(img_size=self.img_size),
-                ConvertColorFormat(self.color_format),
-                Normalize(self.pixel_mean, self.pixel_std),
-                ToTensor()
-            ])
-
-    def set_weak_augment(self):
+        self.augment = Compose([
+            RandomPhotometricDistort(hue=0.5, saturation=1.5, exposure=1.5),
+            RandomJitterCrop(p=0.8, jitter_ratio=0.3, fill_value=self.pixel_mean[::-1]),
+            RandomHorizontalFlip(p=0.5),
+            Resize(img_size=self.img_size),
+            ConvertColorFormat(self.color_format),
+            Normalize(self.pixel_mean, self.pixel_std),
+            ToTensor()
+        ])
+
+    def reset_weak_augment(self):
+        print("Reset transform with weak augmentation ...")
         self.augment = Compose([
             RandomHorizontalFlip(p=0.5),
             Resize(img_size=self.img_size),
@@ -444,6 +432,7 @@ class RTDetrAugmentation(object):
             ToTensor()
         ])
 
+
     def __call__(self, image, target, mosaic=False):
         orig_h, orig_w = image.shape[:2]
         ratio = [self.img_size / orig_w, self.img_size / orig_h]
@@ -452,7 +441,6 @@ class RTDetrAugmentation(object):
 
         return image, target, ratio
 
-
 ## Transform for Eval
 class RTDetrBaseTransform(object):
     def __init__(self, img_size=640, pixel_mean=[123.675, 116.28, 103.53], pixel_std=[58.395, 57.12, 57.375]):

+ 250 - 0
dataset/data_augment/strong_augment.py

@@ -0,0 +1,250 @@
+import random
+import cv2
+import numpy as np
+
+from .yolov5_augment import random_perspective
+
+
+# ------------------------- Strong augmentations -------------------------
+## Mosaic Augmentation
+class MosaicAugment(object):
+    def __init__(self,
+                 img_size,
+                 transform_config,
+                 is_train=False,
+                 ) -> None:
+        self.img_size = img_size
+        self.is_train = is_train
+        self.keep_ratio    = transform_config['mosaic_keep_ratio']
+        self.affine_params = transform_config['affine_params']
+        self.mosaic_type   = transform_config['mosaic_type']
+
+    def yolov5_mosaic_augment(self, image_list, target_list):
+        assert len(image_list) == 4
+
+        mosaic_img = np.ones([self.img_size*2, self.img_size*2, image_list[0].shape[2]], dtype=np.uint8) * 114
+        # mosaic center
+        yc, xc = [int(random.uniform(-x, 2*self.img_size + x)) for x in [-self.img_size // 2, -self.img_size // 2]]
+        # yc = xc = self.img_size
+
+        mosaic_bboxes = []
+        mosaic_labels = []
+        for i in range(4):
+            img_i, target_i = image_list[i], target_list[i]
+            bboxes_i = target_i["boxes"]
+            labels_i = target_i["labels"]
+
+            orig_h, orig_w, _ = img_i.shape
+
+            # resize
+            if self.keep_ratio:
+                r = self.img_size / max(orig_h, orig_w)
+                if r != 1: 
+                    interp = cv2.INTER_LINEAR if (self.is_train or r > 1) else cv2.INTER_AREA
+                    img_i = cv2.resize(img_i, (int(orig_w * r), int(orig_h * r)), interpolation=interp)
+            else:
+                interp = cv2.INTER_LINEAR if self.is_train else cv2.INTER_AREA
+                img_i = cv2.resize(img_i, (self.img_size, self.img_size), interpolation=interp)
+            h, w, _ = img_i.shape
+
+            # place img_i into the mosaic canvas (mosaic_img)
+            if i == 0:  # top left
+                x1a, y1a, x2a, y2a = max(xc - w, 0), max(yc - h, 0), xc, yc  # xmin, ymin, xmax, ymax (large image)
+                x1b, y1b, x2b, y2b = w - (x2a - x1a), h - (y2a - y1a), w, h  # xmin, ymin, xmax, ymax (small image)
+            elif i == 1:  # top right
+                x1a, y1a, x2a, y2a = xc, max(yc - h, 0), min(xc + w, self.img_size * 2), yc
+                x1b, y1b, x2b, y2b = 0, h - (y2a - y1a), min(w, x2a - x1a), h
+            elif i == 2:  # bottom left
+                x1a, y1a, x2a, y2a = max(xc - w, 0), yc, xc, min(self.img_size * 2, yc + h)
+                x1b, y1b, x2b, y2b = w - (x2a - x1a), 0, w, min(y2a - y1a, h)
+            elif i == 3:  # bottom right
+                x1a, y1a, x2a, y2a = xc, yc, min(xc + w, self.img_size * 2), min(self.img_size * 2, yc + h)
+                x1b, y1b, x2b, y2b = 0, 0, min(w, x2a - x1a), min(y2a - y1a, h)
+
+            mosaic_img[y1a:y2a, x1a:x2a] = img_i[y1b:y2b, x1b:x2b]
+            padw = x1a - x1b
+            padh = y1a - y1b
+
+            # labels
+            bboxes_i_ = bboxes_i.copy()
+            if len(bboxes_i) > 0:
+                # non-empty target: rescale boxes to the resized image and shift them by the paste offset
+                bboxes_i_[:, 0] = (w * bboxes_i[:, 0] / orig_w + padw)
+                bboxes_i_[:, 1] = (h * bboxes_i[:, 1] / orig_h + padh)
+                bboxes_i_[:, 2] = (w * bboxes_i[:, 2] / orig_w + padw)
+                bboxes_i_[:, 3] = (h * bboxes_i[:, 3] / orig_h + padh)    
+
+                mosaic_bboxes.append(bboxes_i_)
+                mosaic_labels.append(labels_i)
+
+        if len(mosaic_bboxes) == 0:
+            mosaic_bboxes = np.array([]).reshape(-1, 4)
+            mosaic_labels = np.array([]).reshape(-1)
+        else:
+            mosaic_bboxes = np.concatenate(mosaic_bboxes)
+            mosaic_labels = np.concatenate(mosaic_labels)
+
+        # clip
+        mosaic_bboxes = mosaic_bboxes.clip(0, self.img_size * 2)
+
+        # random perspective
+        mosaic_targets = np.concatenate([mosaic_labels[..., None], mosaic_bboxes], axis=-1)
+        mosaic_img, mosaic_targets = random_perspective(
+            mosaic_img,
+            mosaic_targets,
+            self.affine_params['degrees'],
+            translate=self.affine_params['translate'],
+            scale=self.affine_params['scale'],
+            shear=self.affine_params['shear'],
+            perspective=self.affine_params['perspective'],
+            border=[-self.img_size//2, -self.img_size//2]
+            )
+
+        # target
+        mosaic_target = {
+            "boxes": mosaic_targets[..., 1:],
+            "labels": mosaic_targets[..., 0],
+            "orig_size": [self.img_size, self.img_size]
+        }
+
+        return mosaic_img, mosaic_target
+
+    def __call__(self, image_list, target_list):
+        if self.mosaic_type == 'yolov5':
+            return self.yolov5_mosaic_augment(image_list, target_list)
+        else:
+            raise NotImplementedError("Unknown mosaic type: {}".format(self.mosaic_type))
+
+## Mixup Augmentation
+class MixupAugment(object):
+    def __init__(self,
+                 img_size,
+                 transform_config,
+                 ) -> None:
+        self.img_size = img_size
+        self.mixup_type  = transform_config['mixup_type']
+        self.mixup_scale = transform_config['mixup_scale']
+
+    def yolov5_mixup_augment(self, origin_image, origin_target, new_image, new_target):
+        if origin_image.shape[:2] != new_image.shape[:2]:
+            img_size = max(new_image.shape[:2])
+            # origin_image is not a mosaic image
+            orig_h, orig_w = origin_image.shape[:2]
+            scale_ratio = img_size / max(orig_h, orig_w)
+            if scale_ratio != 1: 
+                interp = cv2.INTER_LINEAR if scale_ratio > 1 else cv2.INTER_AREA
+                resize_size = (int(orig_w * scale_ratio), int(orig_h * scale_ratio))
+                origin_image = cv2.resize(origin_image, resize_size, interpolation=interp)
+
+            # pad origin image onto a square (img_size x img_size) canvas filled with 114
+            pad_origin_image = np.ones([img_size, img_size, origin_image.shape[2]], dtype=np.uint8) * 114
+            pad_origin_image[:resize_size[1], :resize_size[0]] = origin_image
+            origin_image = pad_origin_image.copy()
+            del pad_origin_image
+
+        r = np.random.beta(32.0, 32.0)  # mixup ratio, alpha=beta=32.0
+        mixup_image = r * origin_image.astype(np.float32) + \
+                    (1.0 - r)* new_image.astype(np.float32)
+        mixup_image = mixup_image.astype(np.uint8)
+        
+        cls_labels = new_target["labels"].copy()
+        box_labels = new_target["boxes"].copy()
+
+        mixup_bboxes = np.concatenate([origin_target["boxes"], box_labels], axis=0)
+        mixup_labels = np.concatenate([origin_target["labels"], cls_labels], axis=0)
+
+        mixup_target = {
+            "boxes": mixup_bboxes,
+            "labels": mixup_labels,
+            'orig_size': mixup_image.shape[:2]
+        }
+        
+        return mixup_image, mixup_target
+
+    def yolox_mixup_augment(self, origin_image, origin_target, new_image, new_target):
+        assert self.mixup_scale is not None, "You should set mixup_scale as a List type, such as [0.5, 1.5], not a NoneType."
+
+        jit_factor = random.uniform(*self.mixup_scale)
+        FLIP = random.uniform(0, 1) > 0.5
+
+        # resize new image
+        orig_h, orig_w = new_image.shape[:2]
+        cp_scale_ratio = self.img_size / max(orig_h, orig_w)
+        if cp_scale_ratio != 1: 
+            interp = cv2.INTER_LINEAR if cp_scale_ratio > 1 else cv2.INTER_AREA
+            resized_new_img = cv2.resize(
+                new_image, (int(orig_w * cp_scale_ratio), int(orig_h * cp_scale_ratio)), interpolation=interp)
+        else:
+            resized_new_img = new_image
+
+        # pad new image
+        cp_img = np.ones([self.img_size, self.img_size, new_image.shape[2]], dtype=np.uint8) * 114
+        new_shape = (resized_new_img.shape[1], resized_new_img.shape[0])
+        cp_img[:new_shape[1], :new_shape[0]] = resized_new_img
+
+        # resize padded new image
+        cp_img_h, cp_img_w = cp_img.shape[:2]
+        cp_new_shape = (int(cp_img_w * jit_factor),
+                        int(cp_img_h * jit_factor))
+        cp_img = cv2.resize(cp_img, (cp_new_shape[0], cp_new_shape[1]))
+        cp_scale_ratio *= jit_factor
+
+        # flip new image
+        if FLIP:
+            cp_img = cp_img[:, ::-1, :]
+
+        # paste the jittered image onto a zero canvas at least as large as origin_image
+        origin_h, origin_w = cp_img.shape[:2]
+        target_h, target_w = origin_image.shape[:2]
+        padded_img = np.zeros(
+            (max(origin_h, target_h), max(origin_w, target_w), 3), dtype=np.uint8
+        )
+        padded_img[:origin_h, :origin_w] = cp_img
+
+        # crop padded image
+        x_offset, y_offset = 0, 0
+        if padded_img.shape[0] > target_h:
+            y_offset = random.randint(0, padded_img.shape[0] - target_h - 1)
+        if padded_img.shape[1] > target_w:
+            x_offset = random.randint(0, padded_img.shape[1] - target_w - 1)
+        padded_cropped_img = padded_img[
+            y_offset: y_offset + target_h, x_offset: x_offset + target_w
+        ]
+
+        # process target
+        new_boxes = new_target["boxes"]
+        new_labels = new_target["labels"]
+        new_boxes[:, 0::2] = np.clip(new_boxes[:, 0::2] * cp_scale_ratio, 0, origin_w)
+        new_boxes[:, 1::2] = np.clip(new_boxes[:, 1::2] * cp_scale_ratio, 0, origin_h)
+        if FLIP:
+            new_boxes[:, 0::2] = (
+                origin_w - new_boxes[:, 0::2][:, ::-1]
+            )
+        new_boxes[:, 0::2] = np.clip(
+            new_boxes[:, 0::2] - x_offset, 0, target_w
+        )
+        new_boxes[:, 1::2] = np.clip(
+            new_boxes[:, 1::2] - y_offset, 0, target_h
+        )
+
+        # mixup target
+        mixup_boxes = np.concatenate([new_boxes, origin_target['boxes']], axis=0)
+        mixup_labels = np.concatenate([new_labels, origin_target['labels']], axis=0)
+        mixup_target = {
+            'boxes': mixup_boxes,
+            'labels': mixup_labels
+        }
+
+        # mixup images
+        origin_image = origin_image.astype(np.float32)
+        origin_image = 0.5 * origin_image + 0.5 * padded_cropped_img.astype(np.float32)
+
+        return origin_image.astype(np.uint8), mixup_target
+            
+    def __call__(self, origin_image, origin_target, new_image, new_target):
+        if self.mixup_type == "yolov5":
+            return self.yolov5_mixup_augment(origin_image, origin_target, new_image, new_target)
+        elif self.mixup_type == "yolox":
+            return self.yolox_mixup_augment(origin_image, origin_target, new_image, new_target)
+        else:
+            raise NotImplementedError("Unknown mixup type: {}".format(self.mixup_type))

+ 11 - 219
dataset/data_augment/yolov5_augment.py

@@ -121,225 +121,16 @@ class Albumentations(object):
         return image, target
 
 
-# ------------------------- Strong augmentations -------------------------
-## YOLOv5-Mosaic
-def yolov5_mosaic_augment(image_list, target_list, img_size, affine_params, keep_ratio=True, is_train=False):
-    assert len(image_list) == 4
-
-    mosaic_img = np.ones([img_size*2, img_size*2, image_list[0].shape[2]], dtype=np.uint8) * 114
-    # mosaic center
-    yc, xc = [int(random.uniform(-x, 2*img_size + x)) for x in [-img_size // 2, -img_size // 2]]
-    # yc = xc = self.img_size
-
-    mosaic_bboxes = []
-    mosaic_labels = []
-    for i in range(4):
-        img_i, target_i = image_list[i], target_list[i]
-        bboxes_i = target_i["boxes"]
-        labels_i = target_i["labels"]
-
-        orig_h, orig_w, _ = img_i.shape
-
-        # resize
-        if keep_ratio:
-            r = img_size / max(orig_h, orig_w)
-            if r != 1: 
-                interp = cv2.INTER_LINEAR if (is_train or r > 1) else cv2.INTER_AREA
-                img_i = cv2.resize(img_i, (int(orig_w * r), int(orig_h * r)), interpolation=interp)
-        else:
-            interp = cv2.INTER_LINEAR if is_train else cv2.INTER_AREA
-            img_i = cv2.resize(img_i, (img_size, img_size), interpolation=interp)
-        h, w, _ = img_i.shape
-
-        # place img in img4
-        if i == 0:  # top left
-            x1a, y1a, x2a, y2a = max(xc - w, 0), max(yc - h, 0), xc, yc  # xmin, ymin, xmax, ymax (large image)
-            x1b, y1b, x2b, y2b = w - (x2a - x1a), h - (y2a - y1a), w, h  # xmin, ymin, xmax, ymax (small image)
-        elif i == 1:  # top right
-            x1a, y1a, x2a, y2a = xc, max(yc - h, 0), min(xc + w, img_size * 2), yc
-            x1b, y1b, x2b, y2b = 0, h - (y2a - y1a), min(w, x2a - x1a), h
-        elif i == 2:  # bottom left
-            x1a, y1a, x2a, y2a = max(xc - w, 0), yc, xc, min(img_size * 2, yc + h)
-            x1b, y1b, x2b, y2b = w - (x2a - x1a), 0, w, min(y2a - y1a, h)
-        elif i == 3:  # bottom right
-            x1a, y1a, x2a, y2a = xc, yc, min(xc + w, img_size * 2), min(img_size * 2, yc + h)
-            x1b, y1b, x2b, y2b = 0, 0, min(w, x2a - x1a), min(y2a - y1a, h)
-
-        mosaic_img[y1a:y2a, x1a:x2a] = img_i[y1b:y2b, x1b:x2b]
-        padw = x1a - x1b
-        padh = y1a - y1b
-
-        # labels
-        bboxes_i_ = bboxes_i.copy()
-        if len(bboxes_i) > 0:
-            # a valid target, and modify it.
-            bboxes_i_[:, 0] = (w * bboxes_i[:, 0] / orig_w + padw)
-            bboxes_i_[:, 1] = (h * bboxes_i[:, 1] / orig_h + padh)
-            bboxes_i_[:, 2] = (w * bboxes_i[:, 2] / orig_w + padw)
-            bboxes_i_[:, 3] = (h * bboxes_i[:, 3] / orig_h + padh)    
-
-            mosaic_bboxes.append(bboxes_i_)
-            mosaic_labels.append(labels_i)
-
-    if len(mosaic_bboxes) == 0:
-        mosaic_bboxes = np.array([]).reshape(-1, 4)
-        mosaic_labels = np.array([]).reshape(-1)
-    else:
-        mosaic_bboxes = np.concatenate(mosaic_bboxes)
-        mosaic_labels = np.concatenate(mosaic_labels)
-
-    # clip
-    mosaic_bboxes = mosaic_bboxes.clip(0, img_size * 2)
-
-    # random perspective
-    mosaic_targets = np.concatenate([mosaic_labels[..., None], mosaic_bboxes], axis=-1)
-    mosaic_img, mosaic_targets = random_perspective(
-        mosaic_img,
-        mosaic_targets,
-        affine_params['degrees'],
-        translate=affine_params['translate'],
-        scale=affine_params['scale'],
-        shear=affine_params['shear'],
-        perspective=affine_params['perspective'],
-        border=[-img_size//2, -img_size//2]
-        )
-
-    # target
-    mosaic_target = {
-        "boxes": mosaic_targets[..., 1:],
-        "labels": mosaic_targets[..., 0],
-        "orig_size": [img_size, img_size]
-    }
-
-    return mosaic_img, mosaic_target
-
-## YOLOv5-Mixup
-def yolov5_mixup_augment(origin_image, origin_target, new_image, new_target):
-    if origin_image.shape[:2] != new_image.shape[:2]:
-        img_size = max(new_image.shape[:2])
-        # origin_image is not a mosaic image
-        orig_h, orig_w = origin_image.shape[:2]
-        scale_ratio = img_size / max(orig_h, orig_w)
-        if scale_ratio != 1: 
-            interp = cv2.INTER_LINEAR if scale_ratio > 1 else cv2.INTER_AREA
-            resize_size = (int(orig_w * scale_ratio), int(orig_h * scale_ratio))
-            origin_image = cv2.resize(origin_image, resize_size, interpolation=interp)
-
-        # pad new image
-        pad_origin_image = np.ones([img_size, img_size, origin_image.shape[2]], dtype=np.uint8) * 114
-        pad_origin_image[:resize_size[1], :resize_size[0]] = origin_image
-        origin_image = pad_origin_image.copy()
-        del pad_origin_image
-
-    r = np.random.beta(32.0, 32.0)  # mixup ratio, alpha=beta=32.0
-    mixup_image = r * origin_image.astype(np.float32) + \
-                  (1.0 - r)* new_image.astype(np.float32)
-    mixup_image = mixup_image.astype(np.uint8)
-    
-    cls_labels = new_target["labels"].copy()
-    box_labels = new_target["boxes"].copy()
-
-    mixup_bboxes = np.concatenate([origin_target["boxes"], box_labels], axis=0)
-    mixup_labels = np.concatenate([origin_target["labels"], cls_labels], axis=0)
-
-    mixup_target = {
-        "boxes": mixup_bboxes,
-        "labels": mixup_labels,
-        'orig_size': mixup_image.shape[:2]
-    }
-    
-    return mixup_image, mixup_target
-    
-## YOLOX-Mixup
-def yolox_mixup_augment(origin_img, origin_target, new_img, new_target, img_size, mixup_scale):
-    jit_factor = random.uniform(*mixup_scale)
-    FLIP = random.uniform(0, 1) > 0.5
-
-    # resize new image
-    orig_h, orig_w = new_img.shape[:2]
-    cp_scale_ratio = img_size / max(orig_h, orig_w)
-    if cp_scale_ratio != 1: 
-        interp = cv2.INTER_LINEAR if cp_scale_ratio > 1 else cv2.INTER_AREA
-        resized_new_img = cv2.resize(
-            new_img, (int(orig_w * cp_scale_ratio), int(orig_h * cp_scale_ratio)), interpolation=interp)
-    else:
-        resized_new_img = new_img
-
-    # pad new image
-    cp_img = np.ones([img_size, img_size, new_img.shape[2]], dtype=np.uint8) * 114
-    new_shape = (resized_new_img.shape[1], resized_new_img.shape[0])
-    cp_img[:new_shape[1], :new_shape[0]] = resized_new_img
-
-    # resize padded new image
-    cp_img_h, cp_img_w = cp_img.shape[:2]
-    cp_new_shape = (int(cp_img_w * jit_factor),
-                    int(cp_img_h * jit_factor))
-    cp_img = cv2.resize(cp_img, (cp_new_shape[0], cp_new_shape[1]))
-    cp_scale_ratio *= jit_factor
-
-    # flip new image
-    if FLIP:
-        cp_img = cp_img[:, ::-1, :]
-
-    # pad image
-    origin_h, origin_w = cp_img.shape[:2]
-    target_h, target_w = origin_img.shape[:2]
-    padded_img = np.zeros(
-        (max(origin_h, target_h), max(origin_w, target_w), 3), dtype=np.uint8
-    )
-    padded_img[:origin_h, :origin_w] = cp_img
-
-    # crop padded image
-    x_offset, y_offset = 0, 0
-    if padded_img.shape[0] > target_h:
-        y_offset = random.randint(0, padded_img.shape[0] - target_h - 1)
-    if padded_img.shape[1] > target_w:
-        x_offset = random.randint(0, padded_img.shape[1] - target_w - 1)
-    padded_cropped_img = padded_img[
-        y_offset: y_offset + target_h, x_offset: x_offset + target_w
-    ]
-
-    # process target
-    new_boxes = new_target["boxes"]
-    new_labels = new_target["labels"]
-    new_boxes[:, 0::2] = np.clip(new_boxes[:, 0::2] * cp_scale_ratio, 0, origin_w)
-    new_boxes[:, 1::2] = np.clip(new_boxes[:, 1::2] * cp_scale_ratio, 0, origin_h)
-    if FLIP:
-        new_boxes[:, 0::2] = (
-            origin_w - new_boxes[:, 0::2][:, ::-1]
-        )
-    new_boxes[:, 0::2] = np.clip(
-        new_boxes[:, 0::2] - x_offset, 0, target_w
-    )
-    new_boxes[:, 1::2] = np.clip(
-        new_boxes[:, 1::2] - y_offset, 0, target_h
-    )
-
-    # mixup target
-    mixup_boxes = np.concatenate([new_boxes, origin_target['boxes']], axis=0)
-    mixup_labels = np.concatenate([new_labels, origin_target['labels']], axis=0)
-    mixup_target = {
-        'boxes': mixup_boxes,
-        'labels': mixup_labels
-    }
-
-    # mixup images
-    origin_img = origin_img.astype(np.float32)
-    origin_img = 0.5 * origin_img + 0.5 * padded_cropped_img.astype(np.float32)
-
-    return origin_img.astype(np.uint8), mixup_target
-        
-
 # ------------------------- Preprocessers -------------------------
 ## YOLOv5-style Transform for Train
 class YOLOv5Augmentation(object):
-    def __init__(self, img_size=640, trans_config=None, use_ablu=False):
+    def __init__(self, img_size=640, affine_params=None, use_ablu=False):
         # Basic parameters
         self.img_size = img_size
         self.pixel_mean = [0., 0., 0.]
         self.pixel_std  = [255., 255., 255.]
         self.color_format = 'bgr'
-        self.trans_config = trans_config
+        self.affine_params = affine_params
         # Albumentations
         self.ablu_trans = Albumentations(img_size) if use_ablu else None
 
@@ -367,9 +158,10 @@ class YOLOv5Augmentation(object):
             img, target = self.ablu_trans(img, target)
 
         # --------------- HSV augmentations ---------------
-        augment_hsv(img, hgain=self.trans_config['hsv_h'], 
-                    sgain=self.trans_config['hsv_s'], 
-                    vgain=self.trans_config['hsv_v'])
+        augment_hsv(img,
+                    hgain=self.affine_params['hsv_h'], 
+                    sgain=self.affine_params['hsv_s'], 
+                    vgain=self.affine_params['hsv_v'])
         
         # --------------- Spatial augmentations ---------------
         ## Random perspective
@@ -384,11 +176,11 @@ class YOLOv5Augmentation(object):
                 (target['labels'][..., None], target['boxes']), axis=-1)
             img, target_ = random_perspective(
                 img, target_,
-                degrees=self.trans_config['degrees'],
-                translate=self.trans_config['translate'],
-                scale=self.trans_config['scale'],
-                shear=self.trans_config['shear'],
-                perspective=self.trans_config['perspective']
+                degrees     = self.affine_params['degrees'],
+                translate   = self.affine_params['translate'],
+                scale       = self.affine_params['scale'],
+                shear       = self.affine_params['shear'],
+                perspective = self.affine_params['perspective']
                 )
             target['boxes'] = target_[..., 1:]
             target['labels'] = target_[..., 0]

+ 40 - 33
dataset/voc.py

@@ -3,13 +3,12 @@ import random
 import numpy as np
 import os.path as osp
 import xml.etree.ElementTree as ET
-
-import torch
 import torch.utils.data as data
+
 try:
-    from .data_augment.yolov5_augment import yolov5_mosaic_augment, yolov5_mixup_augment, yolox_mixup_augment
+    from .data_augment.strong_augment import MosaicAugment, MixupAugment
 except:
-    from data_augment.yolov5_augment import yolov5_mosaic_augment, yolov5_mixup_augment, yolox_mixup_augment
+    from  data_augment.strong_augment import MosaicAugment, MixupAugment
 
 
 # VOC class names
@@ -90,10 +89,19 @@ class VOCDataset(data.Dataset):
                 self.ids.append((rootpath, line.strip()))
         self.dataset_size = len(self.ids)
         # ----------- Transform parameters -----------
-        self.transform = transform
-        self.mosaic_prob = trans_config['mosaic_prob'] if trans_config else 0.0
-        self.mixup_prob = trans_config['mixup_prob'] if trans_config else 0.0
         self.trans_config = trans_config
+        self.transform = transform
+        # ----------- Strong augmentation -----------
+        if is_train:
+            self.mosaic_prob = trans_config['mosaic_prob'] if trans_config else 0.0
+            self.mixup_prob  = trans_config['mixup_prob']  if trans_config else 0.0
+            self.mosaic_augment = MosaicAugment(img_size, trans_config, is_train)
+            self.mixup_augment  = MixupAugment(img_size, trans_config)
+        else:
+            self.mosaic_prob = 0.0
+            self.mixup_prob  = 0.0
+            self.mosaic_augment = None
+            self.mixup_augment  = None
         print('==============================')
         print('use Mosaic Augmentation: {}'.format(self.mosaic_prob))
         print('use Mixup Augmentation: {}'.format(self.mixup_prob))
@@ -147,13 +155,14 @@ class VOCDataset(data.Dataset):
 
     # ------------ Mosaic & Mixup ------------
     def load_mosaic(self, index):
-        # load 4x mosaic image
+        # ------------ Prepare 4 indexes of images ------------
+        ## Load 4x mosaic image
         index_list = np.arange(index).tolist() + np.arange(index+1, len(self.ids)).tolist()
         id1 = index
         id2, id3, id4 = random.sample(index_list, 3)
         indexs = [id1, id2, id3, id4]
 
-        # load images and targets
+        ## Load images and targets
         image_list = []
         target_list = []
         for index in indexs:
@@ -161,26 +170,22 @@ class VOCDataset(data.Dataset):
             image_list.append(img_i)
             target_list.append(target_i)
 
-        # Mosaic
-        if self.trans_config['mosaic_type'] == 'yolov5_mosaic':
-            image, target = yolov5_mosaic_augment(
-                image_list, target_list, self.img_size, self.trans_config, self.trans_config['mosaic_keep_ratio'], self.is_train)
+        # ------------ Mosaic augmentation ------------
+        image, target = self.mosaic_augment(image_list, target_list)
 
         return image, target
 
     def load_mixup(self, origin_image, origin_target):
-        # YOLOv5 type Mixup
-        if self.trans_config['mixup_type'] == 'yolov5_mixup':
+        # ------------ Load a new image & target ------------
+        if self.mixup_augment.mixup_type == 'yolov5':
             new_index = np.random.randint(0, len(self.ids))
             new_image, new_target = self.load_mosaic(new_index)
-            image, target = yolov5_mixup_augment(
-                origin_image, origin_target, new_image, new_target)
-        # YOLOX type Mixup
-        elif self.trans_config['mixup_type'] == 'yolox_mixup':
+        elif self.mixup_augment.mixup_type == 'yolox':
             new_index = np.random.randint(0, len(self.ids))
             new_image, new_target = self.load_image_target(new_index)
-            image, target = yolox_mixup_augment(
-                origin_image, origin_target, new_image, new_target, self.img_size, self.trans_config['mixup_scale'])
+            
+        # ------------ Mixup augmentation ------------
+        image, target = self.mixup_augment(origin_image, origin_target, new_image, new_target)
 
         return image, target
     
@@ -275,22 +280,24 @@ if __name__ == "__main__":
         'aug_type': args.aug_type,    # optional: ssd, yolov5
         'pixel_mean': [123.675, 116.28, 103.53],
         'pixel_std':  [58.395, 57.12, 57.375],
-        # Basic Augment
-        'degrees': 0.0,
-        'translate': 0.2,
-        'scale': [0.1, 2.0],
-        'shear': 0.0,
-        'perspective': 0.0,
-        'hsv_h': 0.015,
-        'hsv_s': 0.7,
-        'hsv_v': 0.4,
         'use_ablu': True,
+        # Basic Augment
+        'affine_params': {
+            'degrees': 0.0,
+            'translate': 0.2,
+            'scale': [0.1, 2.0],
+            'shear': 0.0,
+            'perspective': 0.0,
+            'hsv_h': 0.015,
+            'hsv_s': 0.7,
+            'hsv_v': 0.4,
+        },
         # Mosaic & Mixup
+        'mosaic_keep_ratio': False,
         'mosaic_prob': args.mosaic,
         'mixup_prob': args.mixup,
-        'mosaic_type': 'yolov5_mosaic',
-        'mixup_type': args.mixup_type,   # optional: yolov5_mixup, yolox_mixup
-        'mosaic_keep_ratio': False,
+        'mosaic_type': 'yolov5',
+        'mixup_type':  'yolov5',
         'mixup_scale': [0.5, 1.5]
     }
     transform, trans_cfg = build_transform(args, trans_config, 32, args.is_train)

+ 25 - 20
dataset/widerface.py

@@ -12,9 +12,9 @@ except:
     print("It seems that the COCOAPI is not installed.")
 
 try:
-    from .data_augment.yolov5_augment import yolov5_mosaic_augment, yolov5_mixup_augment, yolox_mixup_augment
+    from .data_augment.strong_augment import MosaicAugment, MixupAugment
 except:
-    from data_augment.yolov5_augment import yolov5_mosaic_augment, yolov5_mixup_augment, yolox_mixup_augment
+    from  data_augment.strong_augment import MosaicAugment, MixupAugment
 
 
 widerface_class_labels = ('face',)
@@ -44,12 +44,20 @@ class WiderFaceDataset(Dataset):
         self.coco = COCO(os.path.join(self.data_dir, 'annotations', self.json_file))
         self.ids = self.coco.getImgIds()
         self.class_ids = sorted(self.coco.getCatIds())
-
         # ----------- Transform parameters -----------
-        self.transform = transform
-        self.mosaic_prob = trans_config['mosaic_prob'] if trans_config else 0.0
-        self.mixup_prob = trans_config['mixup_prob'] if trans_config else 0.0
         self.trans_config = trans_config
+        self.transform = transform
+        # ----------- Strong augmentation -----------
+        if is_train:
+            self.mosaic_prob = trans_config['mosaic_prob'] if trans_config else 0.0
+            self.mixup_prob  = trans_config['mixup_prob']  if trans_config else 0.0
+            self.mosaic_augment = MosaicAugment(img_size, trans_config, is_train)
+            self.mixup_augment  = MixupAugment(img_size, trans_config)
+        else:
+            self.mosaic_prob = 0.0
+            self.mixup_prob  = 0.0
+            self.mosaic_augment = None
+            self.mixup_augment  = None
         print('==============================')
         print('use Mosaic Augmentation: {}'.format(self.mosaic_prob))
         print('use Mixup Augmentation: {}'.format(self.mixup_prob))
@@ -64,13 +72,14 @@ class WiderFaceDataset(Dataset):
 
     # ------------ Mosaic & Mixup ------------
     def load_mosaic(self, index):
-        # load 4x mosaic image
+        # ------------ Prepare 4 indexes of images ------------
+        ## Load 4x mosaic image
         index_list = np.arange(index).tolist() + np.arange(index+1, len(self.ids)).tolist()
         id1 = index
         id2, id3, id4 = random.sample(index_list, 3)
         indexs = [id1, id2, id3, id4]
 
-        # load images and targets
+        ## Load images and targets
         image_list = []
         target_list = []
         for index in indexs:
@@ -78,26 +87,22 @@ class WiderFaceDataset(Dataset):
             image_list.append(img_i)
             target_list.append(target_i)
 
-        # Mosaic
-        if self.trans_config['mosaic_type'] == 'yolov5_mosaic':
-            image, target = yolov5_mosaic_augment(
-                image_list, target_list, self.img_size, self.trans_config, self.trans_config['mosaic_keep_ratio'], self.is_train)
+        # ------------ Mosaic augmentation ------------
+        image, target = self.mosaic_augment(image_list, target_list)
 
         return image, target
 
     def load_mixup(self, origin_image, origin_target):
-        # YOLOv5 type Mixup
-        if self.trans_config['mixup_type'] == 'yolov5_mixup':
+        # ------------ Load a new image & target ------------
+        if self.mixup_augment.mixup_type == 'yolov5':
             new_index = np.random.randint(0, len(self.ids))
             new_image, new_target = self.load_mosaic(new_index)
-            image, target = yolov5_mixup_augment(
-                origin_image, origin_target, new_image, new_target)
-        # YOLOX type Mixup
-        elif self.trans_config['mixup_type'] == 'yolox_mixup':
+        elif self.mixup_augment.mixup_type == 'yolox':
             new_index = np.random.randint(0, len(self.ids))
             new_image, new_target = self.load_image_target(new_index)
-            image, target = yolox_mixup_augment(
-                origin_image, origin_target, new_image, new_target, self.img_size, self.trans_config['mixup_scale'])
+            
+        # ------------ Mixup augmentation ------------
+        image, target = self.mixup_augment(origin_image, origin_target, new_image, new_target)
 
         return image, target
     

+ 1 - 4
engine.py

@@ -1140,7 +1140,7 @@ class RTDetrTrainer(object):
         self.args.fp16 = False
         # weak augmentation stage
         self.second_stage = False
-        self.second_stage_epoch = -1
+        self.second_stage_epoch = 5
         # path to save model
         self.path_to_save = os.path.join(args.save_folder, args.dataset, args.model)
         os.makedirs(self.path_to_save, exist_ok=True)
@@ -1160,8 +1160,6 @@ class RTDetrTrainer(object):
             args=args, trans_config=self.trans_cfg, max_stride=self.model_cfg['max_stride'], is_train=True)
         self.val_transform, _ = build_transform(
             args=args, trans_config=self.trans_cfg, max_stride=self.model_cfg['max_stride'], is_train=False)
-        if self.trans_cfg["mosaic_prob"] > 0.5:
-            self.second_stage_epoch = 5
 
         # ---------------------------- Build Dataset & Dataloader ----------------------------
         self.dataset, self.dataset_info = build_dataset(args, self.data_cfg, self.trans_cfg, self.train_transform, is_train=True)
@@ -1488,7 +1486,6 @@ class RTDetrTrainer(object):
         self.train_transform, self.trans_cfg = build_transform(
             args=self.args, trans_config=self.trans_cfg, max_stride=self.model_cfg['max_stride'], is_train=True)
         
-        self.train_transform.set_weak_augment()
         self.train_loader.dataset.transform = self.train_transform
 
 ## Real-time PlainDETR Trainer

+ 1 - 1
models/detectors/rtcdet/loss.py

@@ -82,7 +82,7 @@ class Criterion(object):
                                  'labels': [...], 
                                  'orig_size': ...}, ...]
         """
-        bs, num_anchors = outputs['pred_cls'][0].shape[:2]
+        bs = outputs['pred_cls'][0].shape[0]
         device = outputs['pred_cls'][0].device
         anchors = outputs['anchors']
         fpn_strides = outputs['strides']

+ 0 - 2
models/detectors/rtdetr/basic_modules/fpn.py

@@ -66,8 +66,6 @@ class HybridEncoder(nn.Module):
         self.input_proj_2 = BasicConv(c4, self.out_dim, kernel_size=1, act_type=None, norm_type=norm_type)
         self.input_proj_3 = BasicConv(c3, self.out_dim, kernel_size=1, act_type=None, norm_type=norm_type)
 
-        # ---------------- Downsample ----------------
-
         # ---------------- Transformer Encoder ----------------
         self.transformer_encoder = TransformerEncoder(d_model        = self.out_dim,
                                                       num_heads      = num_heads,

+ 3 - 2
models/detectors/rtdetr/basic_modules/transformer.py

@@ -316,7 +316,7 @@ class TransformerEncoder(nn.Module):
         # -------- Transformer encoder --------
         channels, fmp_h, fmp_w = src.shape[1:]
         # [B, C, H, W] -> [B, N, C], N=HxW
-        src_flatten = src.flatten(2).permute(0, 2, 1)
+        src_flatten = src.flatten(2).permute(0, 2, 1).contiguous()
         memory = src_flatten
 
         # PosEmbed: [1, N, C]
@@ -328,7 +328,8 @@ class TransformerEncoder(nn.Module):
             memory = encoder(memory, pos_embed=pos_embed)
 
         # Output: [B, N, C] -> [B, C, N] -> [B, C, H, W]
-        src = memory.permute(0, 2, 1).reshape([-1, channels, fmp_h, fmp_w])
+        src = memory.permute(0, 2, 1).contiguous()
+        src = src.view([-1, channels, fmp_h, fmp_w])
 
         return src
 

+ 1 - 1
models/detectors/rtdetr/rtdetr_decoder.py

@@ -201,7 +201,7 @@ class RTDETRTransformer(nn.Module):
             # [l], start index of each level
             level_start_index.append(h * w + level_start_index[-1])
             # [B, C, H, W] -> [B, N, C], N=HxW
-            feat_flatten.append(feat.flatten(2).permute(0, 2, 1))
+            feat_flatten.append(feat.flatten(2).permute(0, 2, 1).contiguous())
 
         # [B, N, C], N = N_0 + N_1 + ...
         feat_flatten = torch.cat(feat_flatten, dim=1)