
add RT-DETR transform

yjh0410 · 1 year ago
commit b77da8e973

+ 27 - 0
config/data_config/transform_config.py

@@ -19,6 +19,7 @@ yolov5_x_trans_config = {
     'mixup_prob': 0.2,
     'mosaic_type': 'yolov5_mosaic',
     'mixup_type': 'yolov5_mixup',
+    'mosaic_keep_ratio': True,
     'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp
 }
 
@@ -39,6 +40,7 @@ yolov5_l_trans_config = {
     'mixup_prob': 0.15,
     'mosaic_type': 'yolov5_mosaic',
     'mixup_type': 'yolov5_mixup',
+    'mosaic_keep_ratio': True,
     'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp
 }
 
@@ -59,6 +61,7 @@ yolov5_m_trans_config = {
     'mixup_prob': 0.10,
     'mosaic_type': 'yolov5_mosaic',
     'mixup_type': 'yolov5_mixup',
+    'mosaic_keep_ratio': True,
     'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp
 }
 
@@ -79,6 +82,7 @@ yolov5_s_trans_config = {
     'mixup_prob': 0.0,
     'mosaic_type': 'yolov5_mosaic',
     'mixup_type': 'yolov5_mixup',
+    'mosaic_keep_ratio': True,
     'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp
 }
 
@@ -99,6 +103,7 @@ yolov5_n_trans_config = {
     'mixup_prob': 0.0,
     'mosaic_type': 'yolov5_mosaic',
     'mixup_type': 'yolov5_mixup',
+    'mosaic_keep_ratio': True,
     'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp
 }
 
@@ -119,6 +124,7 @@ yolov5_p_trans_config = {
     'mixup_prob': 0.0,
     'mosaic_type': 'yolov5_mosaic',
     'mixup_type': 'yolov5_mixup',
+    'mosaic_keep_ratio': True,
     'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp
 }
 
@@ -141,6 +147,7 @@ yolox_x_trans_config = {
     'mixup_prob': 1.0,
     'mosaic_type': 'yolov5_mosaic',
     'mixup_type': 'yolox_mixup',
+    'mosaic_keep_ratio': True,
     'mixup_scale': [0.5, 1.5]
 }
 
@@ -161,6 +168,7 @@ yolox_l_trans_config = {
     'mixup_prob': 1.0,
     'mosaic_type': 'yolov5_mosaic',
     'mixup_type': 'yolox_mixup',
+    'mosaic_keep_ratio': True,
     'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp
 }
 
@@ -181,6 +189,7 @@ yolox_m_trans_config = {
     'mixup_prob': 1.0,
     'mosaic_type': 'yolov5_mosaic',
     'mixup_type': 'yolox_mixup',
+    'mosaic_keep_ratio': True,
     'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp
 }
 
@@ -201,6 +210,7 @@ yolox_s_trans_config = {
     'mixup_prob': 1.0,
     'mosaic_type': 'yolov5_mosaic',
     'mixup_type': 'yolox_mixup',
+    'mosaic_keep_ratio': True,
     'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp
 }
 
@@ -221,6 +231,7 @@ yolox_n_trans_config = {
     'mixup_prob': 0.5,
     'mosaic_type': 'yolov5_mosaic',
     'mixup_type': 'yolox_mixup',
+    'mosaic_keep_ratio': True,
     'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp
 }
 
@@ -241,6 +252,7 @@ yolox_p_trans_config = {
     'mixup_prob': 0.0,
     'mosaic_type': 'yolov5_mosaic',
     'mixup_type': 'yolox_mixup',
+    'mosaic_keep_ratio': True,
     'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp
 }
 
@@ -254,5 +266,20 @@ ssd_trans_config = {
     'mixup_prob': 0.,
     'mosaic_type': 'yolov5_mosaic',
     'mixup_type': 'yolov5_mixup',
+    'mosaic_keep_ratio': False,
+    'mixup_scale': [0.5, 1.5]
+}
+
+
+# ----------------------- RT_DETR-Style Transform -----------------------
+rtdetr_trans_config = {
+    'aug_type': 'rtdetr',
+    'use_ablu': False,
+    # Mosaic & Mixup are not used for RT_DETR-style augmentation
+    'mosaic_prob': 0.,
+    'mixup_prob': 0.,
+    'mosaic_type': 'yolov5_mosaic',
+    'mixup_type': 'yolov5_mixup',
+    'mosaic_keep_ratio': False,
     'mixup_scale': [0.5, 1.5]
 }
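For reference, a minimal sketch of how these config dicts might be selected by name. The `build_trans_config` helper below is hypothetical and only illustrates the lookup; the repo may wire this differently:

    from config.data_config.transform_config import (
        yolov5_s_trans_config, ssd_trans_config, rtdetr_trans_config)

    # Hypothetical lookup helper, for illustration only.
    def build_trans_config(name='rtdetr'):
        cfg_map = {
            'yolov5_s': yolov5_s_trans_config,
            'ssd':      ssd_trans_config,
            'rtdetr':   rtdetr_trans_config,
        }
        if name not in cfg_map:
            raise NotImplementedError('Unknown transform config: <{}>'.format(name))
        return cfg_map[name]

    trans_config = build_trans_config('rtdetr')
    assert trans_config['mosaic_prob'] == 0.   # Mosaic is off for RT_DETR-style augmentation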

+ 21 - 11
dataset/build.py

@@ -1,22 +1,28 @@
 import os
 
 try:
+    # dataset class
     from .voc import VOCDataset
     from .coco import COCODataset
     from .crowdhuman import CrowdHumanDataset
     from .widerface import WiderFaceDataset
     from .customed import CustomedDataset
+    # transform class
     from .data_augment.ssd_augment import SSDAugmentation, SSDBaseTransform
     from .data_augment.yolov5_augment import YOLOv5Augmentation, YOLOv5BaseTransform
+    from .data_augment.rtdetr_augment import RTDetrAugmentation, RTDetrBaseTransform
 
 except:
+    # dataset class
     from voc import VOCDataset
     from coco import COCODataset
     from crowdhuman import CrowdHumanDataset
     from widerface import WiderFaceDataset
     from customed import CustomedDataset
+    # transform class
     from data_augment.ssd_augment import SSDAugmentation, SSDBaseTransform
     from data_augment.yolov5_augment import YOLOv5Augmentation, YOLOv5BaseTransform
+    from data_augment.rtdetr_augment import RTDetrAugmentation, RTDetrBaseTransform
 
 
 # ------------------------------ Dataset ------------------------------
@@ -92,28 +98,23 @@ def build_dataset(args, data_cfg, trans_config, transform, is_train=False):
 
 # ------------------------------ Transform ------------------------------
 def build_transform(args, trans_config, max_stride=32, is_train=False):
-    # Modify trans_config
+    # ---------------- Modify trans_config ----------------
     if is_train:
         ## mosaic prob.
         if args.mosaic is not None:
-            trans_config['mosaic_prob']=args.mosaic if is_train else 0.0
-        else:
-            trans_config['mosaic_prob']=trans_config['mosaic_prob'] if is_train else 0.0
+            trans_config['mosaic_prob'] = args.mosaic
         ## mixup prob.
         if args.mixup is not None:
-            trans_config['mixup_prob']=args.mixup if is_train else 0.0
-        else:
-            trans_config['mixup_prob']=trans_config['mixup_prob']  if is_train else 0.0
+            trans_config['mixup_prob'] = args.mixup
 
-    # Transform
+    # ---------------- Build transform ----------------
+    ## SSD-style transform
     if trans_config['aug_type'] == 'ssd':
         if is_train:
             transform = SSDAugmentation(img_size=args.img_size,)
         else:
             transform = SSDBaseTransform(img_size=args.img_size,)
-        trans_config['mosaic_prob'] = 0.0
-        trans_config['mixup_prob'] = 0.0
-
+    ## YOLO-style transform
     elif trans_config['aug_type'] == 'yolov5':
         if is_train:
             transform = YOLOv5Augmentation(
@@ -126,5 +127,14 @@ def build_transform(args, trans_config, max_stride=32, is_train=False):
                 img_size=args.img_size,
                 max_stride=max_stride
                 )
+    ## RT_DETR-style transform
+    elif trans_config['aug_type'] == 'rtdetr':
+        if is_train:
+            use_mosaic = trans_config['mosaic_prob'] >= 0.2
+            transform = RTDetrAugmentation(
+                img_size=args.img_size, pixel_mean=[123.675, 116.28, 103.53], pixel_std=[58.395, 57.12, 57.375], use_mosaic=use_mosaic)
+        else:
+            transform = RTDetrBaseTransform(
+                img_size=args.img_size, pixel_mean=[123.675, 116.28, 103.53], pixel_std=[58.395, 57.12, 57.375])
 
     return transform, trans_config
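A quick usage sketch for the updated `build_transform`. The `args` namespace below is a hand-built stand-in for the repo's argparse arguments, so treat its fields as an assumption:

    from types import SimpleNamespace
    from dataset.build import build_transform

    # Hand-built stand-in for the parsed command-line args (assumption).
    args = SimpleNamespace(img_size=640, mosaic=None, mixup=None)
    trans_config = {'aug_type': 'rtdetr', 'mosaic_prob': 0., 'mixup_prob': 0.}

    transform, trans_config = build_transform(args, trans_config, max_stride=32, is_train=True)
    # mosaic_prob (0.) < 0.2, so RTDetrAugmentation is built with use_mosaic=False
    # and therefore includes the RandomSampleCrop processor.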

+ 14 - 4
dataset/coco.py

@@ -125,7 +125,7 @@ class COCODataset(Dataset):
         # Mosaic
         if self.trans_config['mosaic_type'] == 'yolov5_mosaic':
             image, target = yolov5_mosaic_augment(
-                image_list, target_list, self.img_size, self.trans_config, self.is_train)
+                image_list, target_list, self.img_size, self.trans_config, self.trans_config['mosaic_keep_ratio'], self.is_train)
 
         return image, target
 
@@ -253,7 +253,7 @@ if __name__ == "__main__":
     parser.add_argument('-size', '--img_size', default=640, type=int,
                         help='input image size.')
     parser.add_argument('--aug_type', type=str, default='ssd',
-                        help='augmentation type')
+                        help='augmentation type: ssd, yolov5, rtdetr.')
     parser.add_argument('--mosaic', default=0., type=float,
                         help='mosaic augmentation.')
     parser.add_argument('--mixup', default=0., type=float,
@@ -284,10 +284,13 @@ if __name__ == "__main__":
         'mixup_prob': args.mixup,
         'mosaic_type': 'yolov5_mosaic',
         'mixup_type': args.mixup_type,   # optional: yolov5_mixup, yolox_mixup
+        'mosaic_keep_ratio': False,
         'mixup_scale': [0.5, 1.5]
     }
-
     transform, trans_cfg = build_transform(args, trans_config, 32, args.is_train)
+    pixel_mean = transform.pixel_mean
+    pixel_std  = transform.pixel_std
+    color_format = transform.color_format
 
     dataset = COCODataset(
         img_size=args.img_size,
@@ -312,6 +315,13 @@ if __name__ == "__main__":
 
         # to numpy
         image = image.permute(1, 2, 0).numpy()
+        
+        # denormalize
+        image = image * pixel_std + pixel_mean
+        if color_format == 'rgb':
+            # RGB to BGR
+            image = image[..., (2, 1, 0)]
+
         # to uint8
         image = image.astype(np.uint8)
         image = image.copy()
@@ -326,7 +336,7 @@ if __name__ == "__main__":
             color = class_colors[cls_id]
             # class name
             label = coco_class_labels[coco_class_index[cls_id]]
-            image = cv2.rectangle(image, (int(x1), int(y1)), (int(x2), int(y2)), (0,0,255), 2)
+            image = cv2.rectangle(image, (int(x1), int(y1)), (int(x2), int(y2)), color, 2)
             # put the text on the bbox
             cv2.putText(image, label, (int(x1), int(y1 - 5)), 0, 0.5, color, 1, lineType=cv2.LINE_AA)
         cv2.imshow('gt', image)
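The new denormalize step simply inverts what `Normalize` did, then swaps RGB back to BGR for `cv2.imshow`. A self-contained round-trip check, using the RT_DETR mean/std from this commit:

    import numpy as np

    pixel_mean = np.array([123.675, 116.28, 103.53])   # RGB, as in RTDetrAugmentation
    pixel_std  = np.array([58.395, 57.12, 57.375])

    rgb = np.random.randint(0, 256, (4, 4, 3)).astype(np.float32)
    normed = (rgb - pixel_mean) / pixel_std        # what Normalize() does
    restored = normed * pixel_std + pixel_mean     # what the viewer loop does
    assert np.allclose(restored, rgb, atol=1e-3)
    bgr = restored[..., (2, 1, 0)]                 # back to OpenCV's BGR order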

+ 12 - 2
dataset/crowdhuman.py

@@ -77,7 +77,7 @@ class CrowdHumanDataset(Dataset):
         # Mosaic
         if self.trans_config['mosaic_type'] == 'yolov5_mosaic':
             image, target = yolov5_mosaic_augment(
-                image_list, target_list, self.img_size, self.trans_config, self.is_train)
+                image_list, target_list, self.img_size, self.trans_config, self.trans_config['mosaic_keep_ratio'], self.is_train)
 
         return image, target
 
@@ -219,10 +219,13 @@ if __name__ == "__main__":
         'mixup_prob': args.mixup,
         'mosaic_type': 'yolov5_mosaic',
         'mixup_type': args.mixup_type,   # optional: yolov5_mixup, yolox_mixup
+        'mosaic_keep_ratio': False,
         'mixup_scale': [0.5, 1.5]
     }
-
     transform, trans_cfg = build_transform(args, trans_config, 32, args.is_train)
+    pixel_mean = transform.pixel_mean
+    pixel_std  = transform.pixel_std
+    color_format = transform.color_format
 
     dataset = CrowdHumanDataset(
         img_size=args.img_size,
@@ -245,6 +248,13 @@ if __name__ == "__main__":
 
         # to numpy
         image = image.permute(1, 2, 0).numpy()
+        
+        # denormalize
+        image = image * pixel_std + pixel_mean
+        if color_format == 'rgb':
+            # RGB to BGR
+            image = image[..., (2, 1, 0)]
+
         # to uint8
         image = image.astype(np.uint8)
         image = image.copy()

+ 19 - 7
dataset/customed.py

@@ -120,7 +120,7 @@ class CustomedDataset(Dataset):
         # Mosaic
         if self.trans_config['mosaic_type'] == 'yolov5_mosaic':
             image, target = yolov5_mosaic_augment(
-                image_list, target_list, self.img_size, self.trans_config, self.is_train)
+                image_list, target_list, self.img_size, self.trans_config, self.trans_config['mosaic_keep_ratio'], self.is_train)
 
         return image, target
 
@@ -262,25 +262,29 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     trans_config = {
-        'aug_type': 'yolov5',  # optional: ssd, yolov5
+        'aug_type': args.aug_type,    # optional: ssd, yolov5
         # Basic Augment
         'degrees': 0.0,
         'translate': 0.2,
-        'scale': [0.5, 2.0],
+        'scale': [0.1, 2.0],
         'shear': 0.0,
         'perspective': 0.0,
         'hsv_h': 0.015,
         'hsv_s': 0.7,
         'hsv_v': 0.4,
+        'use_ablu': True,
         # Mosaic & Mixup
-        'mosaic_prob': 1.0,
-        'mixup_prob': 1.0,
+        'mosaic_prob': args.mosaic,
+        'mixup_prob': args.mixup,
         'mosaic_type': 'yolov5_mosaic',
-        'mixup_type': 'yolov5_mixup',
+        'mixup_type': args.mixup_type,   # optional: yolov5_mixup, yolox_mixup
+        'mosaic_keep_ratio': False,
         'mixup_scale': [0.5, 1.5]
     }
-
     transform, trans_cfg = build_transform(args, trans_config, 32, args.is_train)
+    pixel_mean = transform.pixel_mean
+    pixel_std  = transform.pixel_std
+    color_format = transform.color_format
 
     dataset = CustomedDataset(
         img_size=args.img_size,
@@ -305,6 +309,14 @@ if __name__ == "__main__":
 
         # to numpy
         image = image.permute(1, 2, 0).numpy()
+        
+        # denormalize
+        image = image * pixel_std + pixel_mean
+        if color_format == 'rgb':
+            # RGB to BGR
+            image = image[..., (2, 1, 0)]
+
+        # to uint8
         image = image.astype(np.uint8)
         image = image.copy()
         img_h, img_w = image.shape[:2]

+ 355 - 11
dataset/data_augment/rtdetr_augment.py

@@ -1,23 +1,367 @@
+# ------------------------------------------------------------
 # Data preprocessor for Real-time DETR
+# ------------------------------------------------------------
+import cv2
+import numpy as np
+from numpy import random
+
+import torch
+import torch.nn.functional as F
 
 
 # ------------------------- Augmentations -------------------------
+class Compose(object):
+    """Composes several augmentations together.
+    Args:
+        transforms (List[Transform]): list of transforms to compose.
+    Example:
+        >>> augmentations.Compose([
+        >>>     transforms.CenterCrop(10),
+        >>>     transforms.ToTensor(),
+        >>> ])
+    """
+
+    def __init__(self, transforms):
+        self.transforms = transforms
+
+    def __call__(self, image, target=None):
+        for t in self.transforms:
+            image, target = t(image, target)
+        return image, target
+
+## Convert color format
+class ConvertColorFormat(object):
+    def __init__(self, color_format='rgb'):
+        self.color_format = color_format
+
+    def __call__(self, image, target=None):
+        """
+        Input:
+            image: (np.array) a OpenCV image with BGR color format.
+            target: None
+        Output:
+            image: (np.array) a OpenCV image with given color format.
+            target: None
+        """
+        # Convert color format
+        if self.color_format == 'rgb':
+            image = image[..., (2, 1, 0)]    # BGR -> RGB
+        elif self.color_format == 'bgr':
+            image = image
+        else:
+            raise NotImplementedError("Unknown color format: <{}>".format(self.color_format))
+
+        return image, target
+
+## Random Photometric Distort
+class RandomPhotometricDistort(object):
+    """
+    Distort image w.r.t hue, saturation and exposure.
+    """
+
+    def __init__(self, hue=0.1, saturation=1.5, exposure=1.5):
+        super().__init__()
+        self.hue = hue
+        self.saturation = saturation
+        self.exposure = exposure
+
+    def __call__(self, image: np.ndarray, target=None):
+        """
+        Args:
+            image (ndarray): of shape HxWxC, of type uint8 in range [0, 255].
+            target: annotation dict, passed through unchanged.
+
+        Returns:
+            (ndarray, target): the distorted image and the unchanged target.
+        """
+        if random.random() < 0.5:
+            dhue = np.random.uniform(low=-self.hue, high=self.hue)
+            dsat = self._rand_scale(self.saturation)
+            dexp = self._rand_scale(self.exposure)
+
+            image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
+            image = np.asarray(image, dtype=np.float32) / 255.
+            image[:, :, 1] *= dsat
+            image[:, :, 2] *= dexp
+            H = image[:, :, 0] + dhue * 179 / 255.
+
+            if dhue > 0:
+                H[H > 1.0] -= 1.0
+            else:
+                H[H < 0.0] += 1.0
+
+            image[:, :, 0] = H
+            image = (image * 255).clip(0, 255).astype(np.uint8)
+            image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR)
+            image = np.asarray(image, dtype=np.uint8)
+
+        return image, target
+
+    def _rand_scale(self, upper_bound):
+        """
+        Calculate random scaling factor.
+
+        Args:
+            upper_bound (float): range of the random scale.
+        Returns:
+            random scaling factor (float) whose range is
+            from 1 / s to s .
+        """
+        scale = np.random.uniform(low=1, high=upper_bound)
+        if np.random.rand() > 0.5:
+            return scale
+        return 1 / scale
+
+## Random IoU based Sample Crop
+class RandomSampleCrop(object):
+    def __init__(self):
+        self.sample_options = (
+            # using entire original input image
+            None,
+            # sample a patch s.t. MIN jaccard w/ obj in .1,.3,.4,.7,.9
+            (0.1, None),
+            (0.3, None),
+            (0.7, None),
+            (0.9, None),
+            # randomly sample a patch
+            (None, None),
+        )
+
+    def intersect(self, box_a, box_b):
+        max_xy = np.minimum(box_a[:, 2:], box_b[2:])
+        min_xy = np.maximum(box_a[:, :2], box_b[:2])
+        inter = np.clip((max_xy - min_xy), a_min=0, a_max=np.inf)
+
+        return inter[:, 0] * inter[:, 1]
+
+    def compute_iou(self, box_a, box_b):
+        inter = self.intersect(box_a, box_b)
+        area_a = ((box_a[:, 2]-box_a[:, 0]) *
+                (box_a[:, 3]-box_a[:, 1]))  # [A,B]
+        area_b = ((box_b[2]-box_b[0]) *
+                (box_b[3]-box_b[1]))  # [A,B]
+        union = area_a + area_b - inter
+        return inter / union  # [A,B]
+
+    def __call__(self, image, target=None):
+        height, width, _ = image.shape
+
+        # check target
+        if len(target["boxes"]) == 0:
+            return image, target
+
+        while True:
+            # randomly choose a mode
+            sample_id = np.random.randint(len(self.sample_options))
+            mode = self.sample_options[sample_id]
+            if mode is None:
+                return image, target
+
+            boxes = target["boxes"]
+            labels = target["labels"]
+
+            min_iou, max_iou = mode
+            if min_iou is None:
+                min_iou = float('-inf')
+            if max_iou is None:
+                max_iou = float('inf')
+
+            # max trials (50)
+            for _ in range(50):
+                current_image = image
+
+                w = random.uniform(0.3 * width, width)
+                h = random.uniform(0.3 * height, height)
+
+                # aspect ratio constraint b/t .5 & 2
+                if h / w < 0.5 or h / w > 2:
+                    continue
+
+                left = random.uniform(width - w)
+                top = random.uniform(height - h)
+
+                # convert to integer rect x1,y1,x2,y2
+                rect = np.array([int(left), int(top), int(left+w), int(top+h)])
+
+                # calculate IoU (jaccard overlap) b/t the cropped and gt boxes
+                overlap = self.compute_iou(boxes, rect)
+
+                # is min and max overlap constraint satisfied? if not try again
+                if overlap.min() < min_iou and max_iou < overlap.max():
+                    continue
+
+                # cut the crop from the image
+                current_image = current_image[rect[1]:rect[3], rect[0]:rect[2],
+                                              :]
+
+                # keep overlap with gt box IF center in sampled patch
+                centers = (boxes[:, :2] + boxes[:, 2:]) / 2.0
+
+                # mask in all gt boxes whose centers are right of and below the crop's top-left
+                m1 = (rect[0] < centers[:, 0]) * (rect[1] < centers[:, 1])
+
+                # mask in all gt boxes whose centers are left of and above the crop's bottom-right
+                m2 = (rect[2] > centers[:, 0]) * (rect[3] > centers[:, 1])
+
+                # mask in that both m1 and m2 are true
+                mask = m1 * m2
+
+                # have any valid boxes? try again if not
+                if not mask.any():
+                    continue
+
+                # take only matching gt boxes
+                current_boxes = boxes[mask, :].copy()
+
+                # take only matching gt labels
+                current_labels = labels[mask]
+
+                # clip box corners to the crop window, then shift into crop coordinates
+                current_boxes[:, :2] = np.maximum(current_boxes[:, :2],
+                                                  rect[:2])
+                # adjust to crop (by subtracting crop's left,top)
+                current_boxes[:, :2] -= rect[:2]
+
+                current_boxes[:, 2:] = np.minimum(current_boxes[:, 2:],
+                                                  rect[2:])
+                # adjust to crop (by subtracting crop's left,top)
+                current_boxes[:, 2:] -= rect[:2]
+
+                # update target
+                target["boxes"] = current_boxes
+                target["labels"] = current_labels
+
+                return current_image, target
+
+## Random HFlip
+class RandomHorizontalFlip(object):
+    def __init__(self, p=0.5):
+        self.p = p
+
+    def __call__(self, image, target=None):
+        if random.random() < self.p:
+            orig_h, orig_w = image.shape[:2]
+            image = image[:, ::-1]
+            if target is not None:
+                if "boxes" in target:
+                    boxes = target["boxes"].copy()
+                    boxes[..., [0, 2]] = orig_w - boxes[..., [2, 0]]
+                    target["boxes"] = boxes
+
+        return image, target
+
+## Resize image
+class Resize(object):
+    def __init__(self, img_size=640):
+        self.img_size = img_size
+
+    def __call__(self, image, target=None):
+        orig_h, orig_w = image.shape[:2]
+
+        # resize
+        image = cv2.resize(image, (self.img_size, self.img_size)).astype(np.float32)
+        img_h, img_w = image.shape[:2]
+
+        # rescale bboxes
+        if target is not None:
+            boxes = target["boxes"]
+            boxes[:, [0, 2]] = boxes[:, [0, 2]] / orig_w * img_w
+            boxes[:, [1, 3]] = boxes[:, [1, 3]] / orig_h * img_h
+            target["boxes"] = boxes
+
+        return image, target
+
+## Normalize image
+class Normalize(object):
+    def __init__(self, pixel_mean, pixel_std):
+        self.pixel_mean = pixel_mean
+        self.pixel_std = pixel_std
+
+    def __call__(self, image, target=None):
+        # normalize image
+        image = (image - self.pixel_mean) / self.pixel_std
+
+        return image, target
+
+## Convert ndarray to torch.Tensor
+class ToTensor(object):
+    def __call__(self, image, target=None):        
+        # Convert torch.Tensor
+        image = torch.from_numpy(image).permute(2, 0, 1).contiguous().float()
+
+        if target is not None:
+            target["boxes"] = torch.as_tensor(target["boxes"]).float()
+            target["labels"] = torch.as_tensor(target["labels"]).long()
+
+        return image, target
 
 
 # ------------------------- Preprocessers -------------------------
 ## Transform for Train
 class RTDetrAugmentation(object):
-    def __init__(self):
-        return
-    
-    def __call__(self,):
-        pass
+    def __init__(self, img_size=640, pixel_mean=[123.675, 116.28, 103.53], pixel_std=[58.395, 57.12, 57.375], use_mosaic=False):
+        # ----------------- Basic parameters -----------------
+        self.img_size = img_size
+        self.use_mosaic = use_mosaic
+        self.pixel_mean = pixel_mean  # RGB format
+        self.pixel_std = pixel_std    # RGB format
+        self.color_format = 'rgb'
+
+        # ----------------- Transforms -----------------
+        if use_mosaic:
+            # When mosaic is enabled, we do not use the RandomSampleCrop processor.
+            self.augment = Compose([
+                RandomPhotometricDistort(hue=0.5, saturation=1.5, exposure=1.5),
+                RandomHorizontalFlip(p=0.5),
+                Resize(img_size=self.img_size),
+                ConvertColorFormat(self.color_format),
+                Normalize(self.pixel_mean, self.pixel_std),
+                ToTensor()
+            ])
+        else:
+            # When mosaic is disabled, we use the RandomSampleCrop processor.
+            self.augment = Compose([
+                RandomPhotometricDistort(hue=0.5, saturation=1.5, exposure=1.5),
+                RandomSampleCrop(),
+                RandomHorizontalFlip(p=0.5),
+                Resize(img_size=self.img_size),
+                ConvertColorFormat(self.color_format),
+                Normalize(self.pixel_mean, self.pixel_std),
+                ToTensor()
+            ])
+
+    def __call__(self, image, target, mosaic=False):
+        orig_h, orig_w = image.shape[:2]
+        ratio = [self.img_size / orig_w, self.img_size / orig_h]
 
-## Transform for Val
+        image, target = self.augment(image, target)
+
+        return image, target, ratio
+
+
+## Transform for Eval
 class RTDetrBaseTransform(object):
-    def __init__(self):
-        return
-    
-    def __call__(self,):
-        pass
+    def __init__(self, img_size=640, pixel_mean=[123.675, 116.28, 103.53], pixel_std=[58.395, 57.12, 57.375]):
+        # ----------------- Basic parameters -----------------
+        self.img_size = img_size
+        self.pixel_mean = pixel_mean  # RGB format
+        self.pixel_std = pixel_std    # RGB format
+        self.color_format = 'rgb'
+
+        # ----------------- Transforms -----------------
+        self.transform = Compose([
+            Resize(img_size=self.img_size),
+            ConvertColorFormat(self.color_format),
+            Normalize(self.pixel_mean, self.pixel_std),
+            ToTensor()
+        ])
+
+
+    def __call__(self, image, target, mosaic=False):
+        orig_h, orig_w = image.shape[:2]
+        ratio = [self.img_size / orig_w, self.img_size / orig_h]
+
+        image, target = self.transform(image, target)
 
+        return image, target, ratio
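A minimal sketch of driving the two new preprocessors on a dummy image, assuming the `{"boxes", "labels"}` target format used above:

    import numpy as np
    from dataset.data_augment.rtdetr_augment import RTDetrAugmentation, RTDetrBaseTransform

    image = np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8)   # dummy BGR image
    target = {"boxes": np.array([[50., 60., 200., 220.]]), "labels": np.array([1])}

    eval_tf = RTDetrBaseTransform(img_size=640)
    img_t, tgt_t, ratio = eval_tf(image, target)
    print(img_t.shape)   # torch.Size([3, 640, 640]) -- normalized RGB tensor
    print(ratio)         # [1.0, 1.333...] -- per-axis (x, y) resize ratios

    train_tf = RTDetrAugmentation(img_size=640, use_mosaic=False)      # adds RandomSampleCrop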

+ 40 - 42
dataset/data_augment/ssd_augment.py

@@ -1,36 +1,13 @@
+# ------------------------------------------------------------
+# Data preprocessor for SSD
+# ------------------------------------------------------------
 import cv2
 import numpy as np
 import torch
 from numpy import random
 
 
-def intersect(box_a, box_b):
-    max_xy = np.minimum(box_a[:, 2:], box_b[2:])
-    min_xy = np.maximum(box_a[:, :2], box_b[:2])
-    inter = np.clip((max_xy - min_xy), a_min=0, a_max=np.inf)
-    return inter[:, 0] * inter[:, 1]
-
-
-def jaccard_numpy(box_a, box_b):
-    """Compute the jaccard overlap of two sets of boxes.  The jaccard overlap
-    is simply the intersection over union of two boxes.
-    E.g.:
-        A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)
-    Args:
-        box_a: Multiple bounding boxes, Shape: [num_boxes,4]
-        box_b: Single bounding box, Shape: [4]
-    Return:
-        jaccard overlap: Shape: [box_a.shape[0], box_a.shape[1]]
-    """
-    inter = intersect(box_a, box_b)
-    area_a = ((box_a[:, 2]-box_a[:, 0]) *
-              (box_a[:, 3]-box_a[:, 1]))  # [A,B]
-    area_b = ((box_b[2]-box_b[0]) *
-              (box_b[3]-box_b[1]))  # [A,B]
-    union = area_a + area_b - inter
-    return inter / union  # [A,B]
-
-
+# ------------------------- Augmentations -------------------------
 class Compose(object):
     """Composes several augmentations together.
     Args:
@@ -50,12 +27,12 @@ class Compose(object):
             img, boxes, labels = t(img, boxes, labels)
         return img, boxes, labels
 
-
+## Convert Image to float type
 class ConvertFromInts(object):
     def __call__(self, image, boxes=None, labels=None):
         return image.astype(np.float32), boxes, labels
 
-
+## Convert color format
 class ConvertColor(object):
     def __init__(self, current='BGR', transform='HSV'):
         self.transform = transform
@@ -70,7 +47,7 @@ class ConvertColor(object):
             raise NotImplementedError
         return image, boxes, labels
 
-
+## Resize image
 class Resize(object):
     def __init__(self, img_size=640):
         self.img_size = img_size
@@ -86,7 +63,7 @@ class Resize(object):
 
         return image, boxes, labels
 
-
+## Random Saturation
 class RandomSaturation(object):
     def __init__(self, lower=0.5, upper=1.5):
         self.lower = lower
@@ -100,7 +77,7 @@ class RandomSaturation(object):
 
         return image, boxes, labels
 
-
+## Random Hue
 class RandomHue(object):
     def __init__(self, delta=18.0):
         assert delta >= 0.0 and delta <= 360.0
@@ -113,7 +90,7 @@ class RandomHue(object):
             image[:, :, 0][image[:, :, 0] < 0.0] += 360.0
         return image, boxes, labels
 
-
+## Random Lighting noise
 class RandomLightingNoise(object):
     def __init__(self):
         self.perms = ((0, 1, 2), (0, 2, 1),
@@ -127,7 +104,7 @@ class RandomLightingNoise(object):
             image = shuffle(image)
         return image, boxes, labels
 
-
+## Random Contrast
 class RandomContrast(object):
     def __init__(self, lower=0.5, upper=1.5):
         self.lower = lower
@@ -142,7 +119,7 @@ class RandomContrast(object):
             image *= alpha
         return image, boxes, labels
 
-
+## Random Brightness
 class RandomBrightness(object):
     def __init__(self, delta=32):
         assert delta >= 0.0
@@ -155,7 +132,7 @@ class RandomBrightness(object):
             image += delta
         return image, boxes, labels
 
-
+## Random SampleCrop
 class RandomSampleCrop(object):
     """Crop
     Arguments:
@@ -182,6 +159,21 @@ class RandomSampleCrop(object):
             (None, None),
         )
 
+    def intersect(self, box_a, box_b):
+        max_xy = np.minimum(box_a[:, 2:], box_b[2:])
+        min_xy = np.maximum(box_a[:, :2], box_b[:2])
+        inter = np.clip((max_xy - min_xy), a_min=0, a_max=np.inf)
+        return inter[:, 0] * inter[:, 1]
+
+    def compute_iou(self, box_a, box_b):
+        inter = self.intersect(box_a, box_b)
+        area_a = ((box_a[:, 2]-box_a[:, 0]) *
+                (box_a[:, 3]-box_a[:, 1]))  # [A,B]
+        area_b = ((box_b[2]-box_b[0]) *
+                (box_b[3]-box_b[1]))  # [A,B]
+        union = area_a + area_b - inter
+        return inter / union  # [A,B]
+
     def __call__(self, image, boxes=None, labels=None):
         height, width, _ = image.shape
         # check
@@ -219,7 +211,7 @@ class RandomSampleCrop(object):
                 rect = np.array([int(left), int(top), int(left+w), int(top+h)])
 
                 # calculate IoU (jaccard overlap) b/t the cropped and gt boxes
-                overlap = jaccard_numpy(boxes, rect)
+                overlap = self.compute_iou(boxes, rect)
 
                 # is min and max overlap constraint satisfied? if not try again
                 if overlap.min() < min_iou and max_iou < overlap.max():
@@ -264,7 +256,7 @@ class RandomSampleCrop(object):
 
                 return current_image, current_boxes, current_labels
 
-
+## Random expand
 class Expand(object):
     def __call__(self, image, boxes, labels):
         if random.randint(2):
@@ -288,7 +280,7 @@ class Expand(object):
 
         return image, boxes, labels
 
-
+## Random HFlip
 class RandomHorizontalFlip(object):
     def __call__(self, image, boxes, classes):
         _, width, _ = image.shape
@@ -298,7 +290,7 @@ class RandomHorizontalFlip(object):
             boxes[:, 0::2] = width - boxes[:, 2::-2]
         return image, boxes, classes
 
-
+## Swap image channels
 class SwapChannels(object):
     """Transforms a tensorized image by swapping the channels in the order
      specified in the swap tuple.
@@ -324,7 +316,7 @@ class SwapChannels(object):
         image = image[:, :, self.swaps]
         return image
 
-
+## Random color jitter
 class PhotometricDistort(object):
     def __init__(self):
         self.pd = [
@@ -348,11 +340,14 @@ class PhotometricDistort(object):
         return im, boxes, labels
 
 
-# ----------------------- Main Functions -----------------------
+# ------------------------- Preprocessers -------------------------
 ## SSD-style Augmentation
 class SSDAugmentation(object):
     def __init__(self, img_size=640):
         self.img_size = img_size
+        self.pixel_mean = [0., 0., 0.]
+        self.pixel_std  = [1., 1., 1.]
+        self.color_format = 'bgr'
         self.augment = Compose([
             ConvertFromInts(),                         # convert image dtype from int to float32
             PhotometricDistort(),                      # random color jitter
@@ -384,6 +379,9 @@ class SSDAugmentation(object):
 class SSDBaseTransform(object):
     def __init__(self, img_size):
         self.img_size = img_size
+        self.pixel_mean = [0., 0., 0.]
+        self.pixel_std  = [1., 1., 1.]
+        self.color_format = 'bgr'
 
     def __call__(self, image, target=None, mosaic=False):
         # resize

+ 15 - 6
dataset/data_augment/yolov5_augment.py

@@ -123,7 +123,7 @@ class Albumentations(object):
 
 # ------------------------- Strong augmentations -------------------------
 ## YOLOv5-Mosaic
-def yolov5_mosaic_augment(image_list, target_list, img_size, affine_params, is_train=False):
+def yolov5_mosaic_augment(image_list, target_list, img_size, affine_params, keep_ratio=True, is_train=False):
     assert len(image_list) == 4
 
     mosaic_img = np.ones([img_size*2, img_size*2, image_list[0].shape[2]], dtype=np.uint8) * 114
@@ -141,10 +141,14 @@ def yolov5_mosaic_augment(image_list, target_list, img_size, affine_params, is_t
         orig_h, orig_w, _ = img_i.shape
 
         # resize
-        r = img_size / max(orig_h, orig_w)
-        if r != 1: 
-            interp = cv2.INTER_LINEAR if (is_train or r > 1) else cv2.INTER_AREA
-            img_i = cv2.resize(img_i, (int(orig_w * r), int(orig_h * r)), interpolation=interp)
+        if keep_ratio:
+            r = img_size / max(orig_h, orig_w)
+            if r != 1: 
+                interp = cv2.INTER_LINEAR if (is_train or r > 1) else cv2.INTER_AREA
+                img_i = cv2.resize(img_i, (int(orig_w * r), int(orig_h * r)), interpolation=interp)
+        else:
+            interp = cv2.INTER_LINEAR if is_train else cv2.INTER_AREA
+            img_i = cv2.resize(img_i, (img_size, img_size), interpolation=interp)
         h, w, _ = img_i.shape
 
         # place img in img4
@@ -332,6 +336,9 @@ class YOLOv5Augmentation(object):
     def __init__(self, img_size=640, trans_config=None, use_ablu=False):
         # Basic parameters
         self.img_size = img_size
+        self.pixel_mean = [0., 0., 0.]
+        self.pixel_std  = [1., 1., 1.]
+        self.color_format = 'bgr'
         self.trans_config = trans_config
         # Albumentations
         self.ablu_trans = Albumentations(img_size) if use_ablu else None
@@ -413,7 +420,9 @@ class YOLOv5BaseTransform(object):
     def __init__(self, img_size=640, max_stride=32):
         self.img_size = img_size
         self.max_stride = max_stride
-
+        self.pixel_mean = [0., 0., 0.]
+        self.pixel_std  = [1., 1., 1.]
+        self.color_format = 'bgr'
 
     def __call__(self, image, target=None, mosaic=False):
         # --------------- Keep ratio Resize ---------------
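The new `keep_ratio` flag switches the per-tile resize in `yolov5_mosaic_augment` between an aspect-preserving scale of the longer side and a plain stretch to the square `img_size`. A standalone illustration of the two branches:

    import cv2
    import numpy as np

    img = np.zeros((480, 320, 3), dtype=np.uint8)   # H=480, W=320
    img_size = 640

    # keep_ratio=True: scale the longer side to img_size, aspect preserved.
    r = img_size / max(img.shape[:2])
    kept = cv2.resize(img, (int(320 * r), int(480 * r)))
    print(kept.shape)       # (640, 426, 3)

    # keep_ratio=False: stretch both sides to img_size.
    stretched = cv2.resize(img, (img_size, img_size))
    print(stretched.shape)  # (640, 640, 3)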

+ 14 - 3
dataset/voc.py

@@ -164,7 +164,7 @@ class VOCDataset(data.Dataset):
         # Mosaic
         if self.trans_config['mosaic_type'] == 'yolov5_mosaic':
             image, target = yolov5_mosaic_augment(
-                image_list, target_list, self.img_size, self.trans_config, self.is_train)
+                image_list, target_list, self.img_size, self.trans_config, self.trans_config['mosaic_keep_ratio'], self.is_train)
 
         return image, target
 
@@ -257,7 +257,7 @@ if __name__ == "__main__":
     parser.add_argument('-size', '--img_size', default=640, type=int,
                         help='input image size.')
     parser.add_argument('--aug_type', type=str, default='ssd',
-                        help='augmentation type')
+                        help='augmentation type: ssd, yolov5, rtdetr.')
     parser.add_argument('--mosaic', default=0., type=float,
                         help='mosaic augmentation.')
     parser.add_argument('--mixup', default=0., type=float,
@@ -288,9 +288,13 @@ if __name__ == "__main__":
         'mixup_prob': args.mixup,
         'mosaic_type': 'yolov5_mosaic',
         'mixup_type': args.mixup_type,   # optional: yolov5_mixup, yolox_mixup
+        'mosaic_keep_ratio': False,
         'mixup_scale': [0.5, 1.5]
     }
     transform, trans_cfg = build_transform(args, trans_config, 32, args.is_train)
+    pixel_mean = transform.pixel_mean
+    pixel_std  = transform.pixel_std
+    color_format = transform.color_format
 
     dataset = VOCDataset(
         img_size=args.img_size,
@@ -315,6 +319,13 @@ if __name__ == "__main__":
 
         # to numpy
         image = image.permute(1, 2, 0).numpy()
+        
+        # denormalize
+        image = image * pixel_std + pixel_mean
+        if color_format == 'rgb':
+            # RGB to BGR
+            image = image[..., (2, 1, 0)]
+
         # to uint8
         image = image.astype(np.uint8)
         image = image.copy()
@@ -330,7 +341,7 @@ if __name__ == "__main__":
                 color = class_colors[cls_id]
                 # class name
                 label = VOC_CLASSES[cls_id]
-                image = cv2.rectangle(image, (int(x1), int(y1)), (int(x2), int(y2)), (0,0,255), 2)
+                image = cv2.rectangle(image, (int(x1), int(y1)), (int(x2), int(y2)), color, 2)
                 # put the text on the bbox
                 cv2.putText(image, label, (int(x1), int(y1 - 5)), 0, 0.5, color, 1, lineType=cv2.LINE_AA)
         cv2.imshow('gt', image)

+ 12 - 2
dataset/widerface.py

@@ -81,7 +81,7 @@ class WiderFaceDataset(Dataset):
         # Mosaic
         if self.trans_config['mosaic_type'] == 'yolov5_mosaic':
             image, target = yolov5_mosaic_augment(
-                image_list, target_list, self.img_size, self.trans_config, self.is_train)
+                image_list, target_list, self.img_size, self.trans_config, self.trans_config['mosaic_keep_ratio'], self.is_train)
 
         return image, target
 
@@ -222,10 +222,13 @@ if __name__ == "__main__":
         'mixup_prob': args.mixup,
         'mosaic_type': 'yolov5_mosaic',
         'mixup_type': args.mixup_type,   # optional: yolov5_mixup, yolox_mixup
+        'mosaic_keep_ratio': False,
         'mixup_scale': [0.5, 1.5]
     }
-
     transform, trans_cfg = build_transform(args, trans_config, 32, args.is_train)
+    pixel_mean = transform.pixel_mean
+    pixel_std  = transform.pixel_std
+    color_format = transform.color_format
 
     dataset = WiderFaceDataset(
         img_size=args.img_size,
@@ -248,6 +251,13 @@ if __name__ == "__main__":
 
         # to numpy
         image = image.permute(1, 2, 0).numpy()
+        
+        # denormalize
+        image = image * pixel_std + pixel_mean
+        if color_format == 'rgb':
+            # RGB to BGR
+            image = image[..., (2, 1, 0)]
+
         # to uint8
         image = image.astype(np.uint8)
         image = image.copy()

+ 0 - 0
models/detectors/rtrdet/README.md → models/detectors/rtdetr/README.md


+ 187 - 0
models/detectors/rtdetr/basic_modules/backbone.py

@@ -0,0 +1,187 @@
+import torch
+import torch.nn as nn
+from torch import Tensor
+from typing import Callable, List, Optional, Type, Union
+
+try:
+    from .basic import conv1x1, BasicBlock, Bottleneck
+except:
+    from basic import conv1x1, BasicBlock, Bottleneck
+   
+
+# IN1K pretrained weights
+pretrained_urls = {
+    # ResNet series
+    'resnet18': None,
+    'resnet34': None,
+    'resnet50': None,
+    'resnet101': None,
+    'resnet152': None,
+    # ShuffleNet series
+}
+
+
+# ----------------- Model functions -----------------
+## Build backbone network
+def build_backbone(cfg, pretrained):
+    if 'resnet' in cfg['backbone']:
+        # Build ResNet
+        model, feats = build_resnet(cfg, pretrained)
+    else:
+        raise NotImplementedError("Unknown backbone: <{}>.".format(cfg['backbone']))
+    
+    return model, feats
+
+## Load pretrained weight
+def load_pretrained(model_name):
+    return
+
+
+# ----------------- ResNet Backbone -----------------
+class ResNet(nn.Module):
+    def __init__(self,
+                 block: Type[Union[BasicBlock, Bottleneck]],
+                 layers: List[int],
+                 num_classes: int = 1000,
+                 zero_init_residual: bool = False,
+                 groups: int = 1,
+                 width_per_group: int = 64,
+                 replace_stride_with_dilation: Optional[List[bool]] = None,
+                 norm_layer: Optional[Callable[..., nn.Module]] = None,
+                 ) -> None:
+        super().__init__()
+        # --------------- Basic parameters ----------------
+        self.groups = groups
+        self.base_width = width_per_group
+        self.inplanes = 64
+        self.dilation = 1
+        self.zero_init_residual = zero_init_residual
+        self.replace_stride_with_dilation = [False, False, False] if replace_stride_with_dilation is None else replace_stride_with_dilation
+        if len(self.replace_stride_with_dilation) != 3:
+            raise ValueError(
+                "replace_stride_with_dilation should be None "
+                f"or a 3-element tuple, got {self.replace_stride_with_dilation}"
+            )
+
+        # --------------- Network parameters ----------------
+        self._norm_layer = nn.BatchNorm2d if norm_layer is None else norm_layer
+        ## Stem layer
+        self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False)
+        self.bn1 = self._norm_layer(self.inplanes)
+        self.relu = nn.ReLU(inplace=True)
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+        ## Res Layer
+        self.layer1 = self._make_layer(block, 64, layers[0])
+        self.layer2 = self._make_layer(block, 128, layers[1], stride=2, dilate=self.replace_stride_with_dilation[0])
+        self.layer3 = self._make_layer(block, 256, layers[2], stride=2, dilate=self.replace_stride_with_dilation[1])
+        self.layer4 = self._make_layer(block, 512, layers[3], stride=2, dilate=self.replace_stride_with_dilation[2])
+        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
+        self.fc = nn.Linear(512 * block.expansion, num_classes)
+
+        self._init_layer()
+
+    def _init_layer(self):
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
+            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
+                nn.init.constant_(m.weight, 1)
+                nn.init.constant_(m.bias, 0)
+
+        if self.zero_init_residual:
+            for m in self.modules():
+                if isinstance(m, Bottleneck) and m.bn3.weight is not None:
+                    nn.init.constant_(m.bn3.weight, 0)  # type: ignore[arg-type]
+                elif isinstance(m, BasicBlock) and m.bn2.weight is not None:
+                    nn.init.constant_(m.bn2.weight, 0)  # type: ignore[arg-type]
+
+    def _make_layer(
+        self,
+        block: Type[Union[BasicBlock, Bottleneck]],
+        planes: int,
+        blocks: int,
+        stride: int = 1,
+        dilate: bool = False,
+    ) -> nn.Sequential:
+        norm_layer = self._norm_layer
+        downsample = None
+        previous_dilation = self.dilation
+        if dilate:
+            self.dilation *= stride
+            stride = 1
+        if stride != 1 or self.inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                conv1x1(self.inplanes, planes * block.expansion, stride),
+                norm_layer(planes * block.expansion),
+            )
+
+        layers = []
+        layers.append(
+            block(
+                self.inplanes, planes, stride, downsample, self.groups, self.base_width, previous_dilation, norm_layer
+            )
+        )
+        self.inplanes = planes * block.expansion
+        for _ in range(1, blocks):
+            layers.append(
+                block(
+                    self.inplanes,
+                    planes,
+                    groups=self.groups,
+                    base_width=self.base_width,
+                    dilation=self.dilation,
+                    norm_layer=norm_layer,
+                )
+            )
+
+        return nn.Sequential(*layers)
+
+    def forward(self, x: Tensor) -> Tensor:
+        # See note [TorchScript super()]
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+        x = self.maxpool(x)
+
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+
+        x = self.avgpool(x)
+        x = torch.flatten(x, 1)
+        x = self.fc(x)
+
+        return x
+
+def _resnet(block: Type[Union[BasicBlock, Bottleneck]], layers: List[int], **kwargs) -> ResNet:
+    return ResNet(block, layers, **kwargs)
+
+def build_resnet(cfg, pretrained=False, **kwargs):
+    # ---------- Build ResNet ----------
+    if   cfg['backbone'] == 'resnet18':
+        model = _resnet(BasicBlock, [2, 2, 2, 2], **kwargs)
+        feats = [128, 256, 512]
+    elif cfg['backbone'] == 'resnet34':
+        model = _resnet(BasicBlock, [3, 4, 6, 3], **kwargs)
+        feats = [128, 256, 512]
+    elif cfg['backbone'] == 'resnet50':
+        model = _resnet(Bottleneck, [3, 4, 6, 3], **kwargs)
+        feats = [512, 1024, 2048]
+    elif cfg['backbone'] == 'resnet101':
+        model = _resnet(Bottleneck, [3, 4, 23, 3], **kwargs)
+        feats = [512, 1024, 2048]
+    elif cfg['backbone'] == 'resnet152':
+        model = _resnet(Bottleneck, [3, 8, 36, 3], **kwargs)
+        feats = [512, 1024, 2048]
+
+    # ---------- Load pretrained ----------
+    if pretrained:
+        # TODO: load IN1K pretrained
+        pass
+
+    return model, feats
+
+
+# ----------------- ShuffleNet Backbone -----------------
+## TODO: Add shufflenet-v2
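A quick shape check for the new ResNet builder; the import path assumes the package layout introduced in this commit:

    import torch
    from models.detectors.rtdetr.basic_modules.backbone import build_backbone

    cfg = {'backbone': 'resnet50'}
    model, feats = build_backbone(cfg, pretrained=False)
    print(feats)                     # [512, 1024, 2048]
    x = torch.randn(1, 3, 224, 224)
    print(model(x).shape)            # torch.Size([1, 1000]); still ends in a classification head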

+ 195 - 0
models/detectors/rtdetr/basic_modules/basic.py

@@ -0,0 +1,195 @@
+import torch
+import torch.nn as nn
+from torch import Tensor
+from typing import List, Optional, Callable
+
+
+# ----------------- CNN modules -----------------
+def get_conv2d(c1, c2, k, p, s, d, g, bias=False):
+    conv = nn.Conv2d(c1, c2, k, stride=s, padding=p, dilation=d, groups=g, bias=bias)
+
+    return conv
+
+def get_activation(act_type=None):
+    if act_type == 'relu':
+        return nn.ReLU(inplace=True)
+    elif act_type == 'lrelu':
+        return nn.LeakyReLU(0.1, inplace=True)
+    elif act_type == 'mish':
+        return nn.Mish(inplace=True)
+    elif act_type == 'silu':
+        return nn.SiLU(inplace=True)
+    elif act_type is None:
+        return nn.Identity()
+    else:
+        raise NotImplementedError
+        
+def get_norm(norm_type, dim):
+    if norm_type == 'BN':
+        return nn.BatchNorm2d(dim)
+    elif norm_type == 'GN':
+        return nn.GroupNorm(num_groups=32, num_channels=dim)
+    elif norm_type is None:
+        return nn.Identity()
+    else:
+        raise NotImplementedError
+
+def conv3x3(in_planes: int, out_planes: int, stride: int = 1, groups: int = 1, dilation: int = 1) -> nn.Conv2d:
+    """3x3 convolution with padding"""
+    return nn.Conv2d(
+        in_planes,
+        out_planes,
+        kernel_size=3,
+        stride=stride,
+        padding=dilation,
+        groups=groups,
+        bias=False,
+        dilation=dilation,
+    )
+
+def conv1x1(in_planes: int, out_planes: int, stride: int = 1) -> nn.Conv2d:
+    """1x1 convolution"""
+    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
+
+class Conv(nn.Module):
+    def __init__(self, 
+                 c1,                   # in channels
+                 c2,                   # out channels 
+                 k=1,                  # kernel size 
+                 p=0,                  # padding
+                 s=1,                  # stride
+                 d=1,                  # dilation
+                 act_type  :str  = 'lrelu',   # activation
+                 norm_type :str  ='BN',       # normalization
+                 depthwise :bool =False):
+        super(Conv, self).__init__()
+        convs = []
+        add_bias = False if norm_type else True
+        if depthwise:
+            # depthwise conv
+            convs.append(get_conv2d(c1, c1, k=k, p=p, s=s, d=d, g=c1, bias=add_bias))
+            if norm_type:
+                convs.append(get_norm(norm_type, c1))
+            if act_type:
+                convs.append(get_activation(act_type))
+            # pointwise conv
+            convs.append(get_conv2d(c1, c2, k=1, p=0, s=1, d=d, g=1, bias=add_bias))
+            if norm_type:
+                convs.append(get_norm(norm_type, c2))
+            if act_type:
+                convs.append(get_activation(act_type))
+
+        else:
+            convs.append(get_conv2d(c1, c2, k=k, p=p, s=s, d=d, g=1, bias=add_bias))
+            if norm_type:
+                convs.append(get_norm(norm_type, c2))
+            if act_type:
+                convs.append(get_activation(act_type))
+            
+        self.convs = nn.Sequential(*convs)
+
+
+    def forward(self, x):
+        return self.convs(x)
+
+class BasicBlock(nn.Module):
+    expansion: int = 1
+
+    def __init__(
+        self,
+        inplanes: int,
+        planes: int,
+        stride: int = 1,
+        downsample: Optional[nn.Module] = None,
+        groups: int = 1,
+        base_width: int = 64,
+        dilation: int = 1,
+        norm_layer: Optional[Callable[..., nn.Module]] = None,
+    ) -> None:
+        super().__init__()
+        if norm_layer is None:
+            norm_layer = nn.BatchNorm2d
+        if groups != 1 or base_width != 64:
+            raise ValueError("BasicBlock only supports groups=1 and base_width=64")
+        if dilation > 1:
+            raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
+        # Both self.conv1 and self.downsample layers downsample the input when stride != 1
+        self.conv1 = conv3x3(inplanes, planes, stride)
+        self.bn1 = norm_layer(planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.conv2 = conv3x3(planes, planes)
+        self.bn2 = norm_layer(planes)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x: Tensor) -> Tensor:
+        identity = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+
+        if self.downsample is not None:
+            identity = self.downsample(x)
+
+        out += identity
+        out = self.relu(out)
+
+        return out
+
+class Bottleneck(nn.Module):
+    expansion: int = 4
+
+    def __init__(
+        self,
+        inplanes: int,
+        planes: int,
+        stride: int = 1,
+        downsample: Optional[nn.Module] = None,
+        groups: int = 1,
+        base_width: int = 64,
+        dilation: int = 1,
+        norm_layer: Optional[Callable[..., nn.Module]] = None,
+    ) -> None:
+        super().__init__()
+        if norm_layer is None:
+            norm_layer = nn.BatchNorm2d
+        width = int(planes * (base_width / 64.0)) * groups
+        # Both self.conv2 and self.downsample layers downsample the input when stride != 1
+        self.conv1 = conv1x1(inplanes, width)
+        self.bn1 = norm_layer(width)
+        self.conv2 = conv3x3(width, width, stride, groups, dilation)
+        self.bn2 = norm_layer(width)
+        self.conv3 = conv1x1(width, planes * self.expansion)
+        self.bn3 = norm_layer(planes * self.expansion)
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x: Tensor) -> Tensor:
+        identity = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.relu(out)
+
+        out = self.conv3(out)
+        out = self.bn3(out)
+
+        if self.downsample is not None:
+            identity = self.downsample(x)
+
+        out += identity
+        out = self.relu(out)
+
+        return out
+
+
+# ----------------- Transformer modules -----------------
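A sanity check for the `Conv` wrapper and the residual blocks above (import path per this commit's layout):

    import torch
    from models.detectors.rtdetr.basic_modules.basic import Conv, BasicBlock

    conv = Conv(3, 64, k=3, p=1, s=2, act_type='silu', norm_type='BN')
    x = torch.randn(1, 3, 64, 64)
    y = conv(x)
    print(y.shape)                              # torch.Size([1, 64, 32, 32])

    block = BasicBlock(inplanes=64, planes=64)  # stride 1, identity shortcut
    print(block(y).shape)                       # torch.Size([1, 64, 32, 32])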

+ 7 - 0
models/detectors/rtdetr/basic_modules/neck.py

@@ -0,0 +1,7 @@
+import torch
+import torch.nn as nn
+
+
+# Build neck
+def build_neck(cfg, in_dim, out_dim):
+    return

+ 110 - 0
models/detectors/rtdetr/basic_modules/pafpn.py

@@ -0,0 +1,110 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .basic import Conv, RTCBlock
+
+
+# Build PaFPN
+def build_pafpn(cfg, in_dims, out_dim):
+    return
+
+
+# ----------------- Feature Pyramid Network -----------------
+## Real-time Convolutional PaFPN
+class RTCPaFPN(nn.Module):
+    def __init__(self, 
+                 in_dims   = [256, 512, 512],
+                 width     = 1.0,
+                 depth     = 1.0,
+                 ratio     = 1.0,
+                 act_type  = 'silu',
+                 norm_type = 'BN',
+                 depthwise = False):
+        super(RTCPaFPN, self).__init__()
+        print('==============================')
+        print('FPN: {}'.format("RTC-PaFPN"))
+        # ---------------- Basic parameters ----------------
+        self.in_dims = in_dims
+        self.width = width
+        self.depth = depth
+        self.out_dim = [round(256 * width), round(512 * width), round(512 * width * ratio)]
+        c3, c4, c5 = in_dims
+
+        # ---------------- Top down ----------------
+        ## P5 -> P4
+        self.top_down_layer_1 = RTCBlock(in_dim       = c5 + c4,
+                                         out_dim      = round(512*width),
+                                         num_blocks   = round(3*depth),
+                                         shortcut     = False,
+                                         act_type     = act_type,
+                                         norm_type    = norm_type,
+                                         depthwise    = depthwise,
+                                         )
+        ## P4 -> P3
+        self.top_down_layer_2 = RTCBlock(in_dim       = round(512*width) + c3,
+                                         out_dim      = round(256*width),
+                                         num_blocks   = round(3*depth),
+                                         shortcut     = False,
+                                         act_type     = act_type,
+                                         norm_type    = norm_type,
+                                         depthwise    = depthwise,
+                                         )
+        # ---------------- Bottom up ----------------
+        ## P3 -> P4
+        self.downsample_layer_1 = Conv(round(256*width), round(256*width), k=3, p=1, s=2, act_type=act_type, norm_type=norm_type, depthwise=depthwise)
+        self.bottom_up_layer_1  = RTCBlock(in_dim       = round(256*width) + round(512*width),
+                                           out_dim      = round(512*width),
+                                           num_blocks   = round(3*depth),
+                                           shortcut     = False,
+                                           act_type     = act_type,
+                                           norm_type    = norm_type,
+                                           depthwise    = depthwise,
+                                           )
+        ## P4 -> P5
+        self.downsample_layer_2 = Conv(round(512*width), round(512*width), k=3, p=1, s=2, act_type=act_type, norm_type=norm_type, depthwise=depthwise)
+        self.bottom_up_layer_2 = RTCBlock(in_dim       = round(512 * width) + c5,
+                                          out_dim      = round(512 * width * ratio),
+                                          num_blocks   = round(3*depth),
+                                          shortcut     = False,
+                                          act_type     = act_type,
+                                          norm_type    = norm_type,
+                                          depthwise    = depthwise,
+                                          )
+
+        self.init_weights()
+        
+    def init_weights(self):
+        """Initialize the parameters."""
+        for m in self.modules():
+            if isinstance(m, torch.nn.Conv2d):
+                # In order to be consistent with the source code,
+                # reset the Conv2d initialization parameters
+                m.reset_parameters()
+
+    def forward(self, features):
+        c3, c4, c5 = features
+
+        # Top down
+        ## P5 -> P4
+        c6 = F.interpolate(c5, scale_factor=2.0)
+        c7 = torch.cat([c6, c4], dim=1)
+        c8 = self.top_down_layer_1(c7)
+        ## P4 -> P3
+        c9 = F.interpolate(c8, scale_factor=2.0)
+        c10 = torch.cat([c9, c3], dim=1)
+        c11 = self.top_down_layer_2(c10)
+
+        # Bottom up
+        ## P3 -> P4
+        c12 = self.downsample_layer_1(c11)
+        c13 = torch.cat([c12, c8], dim=1)
+        c14 = self.bottom_up_layer_1(c13)
+        ## P4 -> P5
+        c15 = self.downsample_layer_2(c14)
+        c16 = torch.cat([c15, c5], dim=1)
+        c17 = self.bottom_up_layer_2(c16)
+
+        out_feats = [c11, c14, c17] # [P3, P4, P5]
+        
+        return out_feats
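As a quick sanity check of the channel bookkeeping above, here is a minimal smoke test (not part of the commit; the input shapes assume stride-8/16/32 backbone features):

import torch

# Illustrative smoke test for RTCPaFPN; assumes the class above is in scope.
fpn = RTCPaFPN(in_dims=[256, 512, 512], width=1.0, depth=1.0, ratio=1.0)
c3 = torch.randn(1, 256, 80, 80)   # stride 8
c4 = torch.randn(1, 512, 40, 40)   # stride 16
c5 = torch.randn(1, 512, 20, 20)   # stride 32
p3, p4, p5 = fpn([c3, c4, c5])
print(p3.shape, p4.shape, p5.shape)
# expected, per self.out_dim: [1, 256, 80, 80], [1, 512, 40, 40], [1, 512, 20, 20]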

+ 0 - 0
models/detectors/rtrdet/build.py → models/detectors/rtdetr/build.py


+ 0 - 0
models/detectors/rtrdet/loss.py → models/detectors/rtdetr/loss.py


+ 0 - 0
models/detectors/rtrdet/matcher.py → models/detectors/rtdetr/matcher.py


+ 1 - 1
models/detectors/rtrdet/rtrdet.py → models/detectors/rtdetr/rtdetr.py

@@ -1,5 +1,5 @@
 # Real-time Transformer-based Object Detector
 
 
-class RTRDet():
+class RT_DETR():
     pass

+ 35 - 0
models/detectors/rtdetr/rtdetr_decoder.py

@@ -0,0 +1,35 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+# ----------------- Decoder for Detection task -----------------
+class DetDecoder(nn.Module):
+    def __init__(self, ):
+        super().__init__()
+        self.backbone = None
+        self.neck = None
+        self.fpn = None
+
+    def forward(self, x):
+        return
+
+
+# ----------------- Decoder for Segmentation task -----------------
+class SegDecoder(nn.Module):
+    def __init__(self, ):
+        super().__init__()
+        # TODO: design seg-decoder
+
+    def forward(self, x):
+        return
+
+
+# ----------------- Decoder for Pose estimation task -----------------
+class PosDecoder(nn.Module):
+    def __init__(self, ):
+        super().__init__()
+        # TODO: design pose-decoder
+
+    def forward(self, x):
+        return
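The three stubs suggest a shared-encoder, per-task-decoder layout. A hypothetical composition, for illustration only (the wrapper class and its arguments are assumptions, not part of this commit):

import torch.nn as nn

# Hypothetical multi-task wrapper: one shared image encoder feeding
# whichever task decoder is requested (illustrative sketch only).
class MultiTaskRTDETR(nn.Module):
    def __init__(self, encoder, task='det'):
        super().__init__()
        decoders = {'det': DetDecoder, 'seg': SegDecoder, 'pos': PosDecoder}
        self.encoder = encoder
        self.decoder = decoders[task]()

    def forward(self, x):
        feats = self.encoder(x)
        return self.decoder(feats)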

+ 19 - 0
models/detectors/rtdetr/rtdetr_encoder.py

@@ -0,0 +1,19 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .basic_modules.backbone import build_backbone
+from .basic_modules.pafpn    import build_pafpn
+
+
+# ----------------- Image Encoder -----------------
+class ImageEncoder(nn.Module):
+    def __init__(self, ):
+        super().__init__()
+        self.backbone = None
+        self.neck = None
+        self.fpn = None
+
+    def forward(self, x):
+        return
+    
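Given the build_backbone / build_pafpn imports, one plausible way the stub gets filled in, reusing the file's existing imports (the cfg layout and the builders' return signatures below are assumptions):

# Sketch of how ImageEncoder might compose the imported builders
# (cfg keys and builder signatures are assumptions, not from this commit).
class ImageEncoder(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.backbone, feat_dims = build_backbone(cfg)   # C3/C4/C5 extractor
        self.fpn = build_pafpn(cfg, in_dims=feat_dims)   # e.g. RTCPaFPN above

    def forward(self, x):
        pyramid_feats = self.backbone(x)   # [C3, C4, C5]
        return self.fpn(pyramid_feats)     # [P3, P4, P5]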

+ 0 - 129
models/detectors/rtrdet/rtrdet_basic.py

@@ -1,129 +0,0 @@
-import torch
-import torch.nn as nn
-from typing import List
-
-
-# ----------------- CNN modules -----------------
-def get_conv2d(c1, c2, k, p, s, d, g, bias=False):
-    conv = nn.Conv2d(c1, c2, k, stride=s, padding=p, dilation=d, groups=g, bias=bias)
-
-    return conv
-
-def get_activation(act_type=None):
-    if act_type == 'relu':
-        return nn.ReLU(inplace=True)
-    elif act_type == 'lrelu':
-        return nn.LeakyReLU(0.1, inplace=True)
-    elif act_type == 'mish':
-        return nn.Mish(inplace=True)
-    elif act_type == 'silu':
-        return nn.SiLU(inplace=True)
-    elif act_type is None:
-        return nn.Identity()
-    else:
-        raise NotImplementedError
-        
-def get_norm(norm_type, dim):
-    if norm_type == 'BN':
-        return nn.BatchNorm2d(dim)
-    elif norm_type == 'GN':
-        return nn.GroupNorm(num_groups=32, num_channels=dim)
-    elif norm_type is None:
-        return nn.Identity()
-    else:
-        raise NotImplementedError
-
-class Conv(nn.Module):
-    def __init__(self, 
-                 c1,                   # in channels
-                 c2,                   # out channels 
-                 k=1,                  # kernel size 
-                 p=0,                  # padding
-                 s=1,                  # stride
-                 d=1,                  # dilation
-                 act_type  :str  = 'lrelu',   # activation
-                 norm_type :str  ='BN',       # normalization
-                 depthwise :bool =False):
-        super(Conv, self).__init__()
-        convs = []
-        add_bias = False if norm_type else True
-        if depthwise:
-            # depthwise conv
-            convs.append(get_conv2d(c1, c1, k=k, p=p, s=s, d=d, g=c1, bias=add_bias))
-            if norm_type:
-                convs.append(get_norm(norm_type, c1))
-            if act_type:
-                convs.append(get_activation(act_type))
-            # pointwise conv
-            convs.append(get_conv2d(c1, c2, k=1, p=0, s=1, d=d, g=1, bias=add_bias))
-            if norm_type:
-                convs.append(get_norm(norm_type, c2))
-            if act_type:
-                convs.append(get_activation(act_type))
-
-        else:
-            convs.append(get_conv2d(c1, c2, k=k, p=p, s=s, d=d, g=1, bias=add_bias))
-            if norm_type:
-                convs.append(get_norm(norm_type, c2))
-            if act_type:
-                convs.append(get_activation(act_type))
-            
-        self.convs = nn.Sequential(*convs)
-
-
-    def forward(self, x):
-        return self.convs(x)
-
-class Bottleneck(nn.Module):
-    def __init__(self,
-                 in_dim       :int,
-                 out_dim      :int,
-                 expand_ratio :float = 0.5,
-                 kernel_sizes :List = [3, 3],
-                 shortcut     :bool = True,
-                 act_type     :str  = 'silu',
-                 norm_type    :str  = 'BN',
-                 depthwise    :bool = False,):
-        super(Bottleneck, self).__init__()
-        inter_dim = int(out_dim * expand_ratio)  # hidden channels            
-        self.cv1 = Conv(in_dim, inter_dim, k=kernel_sizes[0], p=kernel_sizes[0]//2, norm_type=norm_type, act_type=act_type, depthwise=depthwise)
-        self.cv2 = Conv(inter_dim, out_dim, k=kernel_sizes[1], p=kernel_sizes[1]//2, norm_type=norm_type, act_type=act_type, depthwise=depthwise)
-        self.shortcut = shortcut and in_dim == out_dim
-
-    def forward(self, x):
-        h = self.cv2(self.cv1(x))
-
-        return x + h if self.shortcut else h
-
-class RTCBlock(nn.Module):
-    def __init__(self,
-                 in_dim     :int,
-                 out_dim    :int,
-                 num_blocks :int  = 1,
-                 shortcut   :bool = False,
-                 act_type   :str  = 'silu',
-                 norm_type  :str  = 'BN',
-                 depthwise  :bool = False,):
-        super(RTCBlock, self).__init__()
-        self.inter_dim = out_dim // 2
-        self.input_proj = Conv(in_dim, out_dim, k=1, act_type=act_type, norm_type=norm_type)
-        self.m = nn.Sequential(*(
-            Bottleneck(self.inter_dim, self.inter_dim, 1.0, [3, 3], shortcut, act_type, norm_type, depthwise)
-            for _ in range(num_blocks)))
-        self.output_proj = Conv((2 + num_blocks) * self.inter_dim, out_dim, k=1, act_type=act_type, norm_type=norm_type)
-
-    def forward(self, x):
-        # Input proj
-        x1, x2 = torch.chunk(self.input_proj(x), 2, dim=1)
-        out = [x1, x2]
-
-        # Bottleneck
-        out.extend(m(out[-1]) for m in self.m)
-
-        # Output proj
-        out = self.output_proj(torch.cat(out, dim=1))
-
-        return out
-
-
-# ----------------- Transformer modules -----------------

+ 0 - 0
models/detectors/rtrdet/rtrdet_decoder.py


+ 0 - 0
models/detectors/rtrdet/rtrdet_encoder.py


+ 0 - 0
models/detectors/rtrdet/rtrdet_head.py