
add RandomExpand into RT-DETR's augmentation

yjh0410 1 year ago
parent
commit
a716a66687

+ 181 - 0
config/model_config/rtdetr_config.py

@@ -64,4 +64,185 @@ rtdetr_cfg = {
         'trainer_type': 'rtdetr',
     },
 
+    'rtdetr_r50':{
+        # ---------------- Model config ----------------
+        ## Model scale
+        'width': 1.0,
+        'depth': 1.0,
+        ## Image Encoder - Backbone
+        'backbone': 'resnet50',
+        'backbone_norm': 'FrozeBN',
+        'res5_dilation': False,
+        'pretrained': True,
+        'pretrained_weight': 'imagenet1k_v1',
+        'freeze_at': 0,
+        'freeze_stem_only': False,
+        'out_stride': [8, 16, 32],
+        'max_stride': 32,
+        ## Image Encoder - FPN
+        'fpn': 'hybrid_encoder',
+        'fpn_act': 'silu',
+        'fpn_norm': 'BN',
+        'fpn_depthwise': False,
+        'hidden_dim': 256,
+        'en_num_heads': 8,
+        'en_num_layers': 1,
+        'en_mlp_ratio': 4.0,
+        'en_dropout': 0.0,
+        'pe_temperature': 10000.,
+        'en_act': 'gelu',
+        # Transformer Decoder
+        'transformer': 'rtdetr_transformer',
+        'hidden_dim': 256,
+        'de_num_heads': 8,
+        'de_num_layers': 6,
+        'de_mlp_ratio': 4.0,
+        'de_dropout': 0.0,
+        'de_act': 'relu',
+        'de_num_points': 4,
+        'num_queries': 300,
+        'learnt_init_query': False,
+        'pe_temperature': 10000.,
+        'dn_num_denoising': 100,
+        'dn_label_noise_ratio': 0.5,
+        'dn_box_noise_scale': 1,
+        # Head
+        'det_head': 'dino_head',
+        # ---------------- Assignment config ----------------
+        'matcher_hpy': {'cost_class': 2.0,
+                        'cost_bbox': 5.0,
+                        'cost_giou': 2.0,},
+        # ---------------- Loss config ----------------
+        'use_vfl': True,
+        'loss_coeff': {'class': 1,
+                       'bbox': 5,
+                       'giou': 2,},
+        # ---------------- Train config ----------------
+        ## input
+        'multi_scale': [0.5, 1.25],   # 320 -> 800
+        'trans_type': 'rtdetr_base',
+        # ---------------- Train config ----------------
+        'trainer_type': 'rtdetr',
+    },
+
+    'rtdetr_r101':{
+        # ---------------- Model config ----------------
+        ## Model scale
+        'width': 1.0,
+        'depth': 1.0,
+        ## Image Encoder - Backbone
+        'backbone': 'resnet101',
+        'backbone_norm': 'FrozeBN',
+        'res5_dilation': False,
+        'pretrained': True,
+        'pretrained_weight': 'imagenet1k_v1',
+        'freeze_at': 0,
+        'freeze_stem_only': False,
+        'out_stride': [8, 16, 32],
+        'max_stride': 32,
+        ## Image Encoder - FPN
+        'fpn': 'hybrid_encoder',
+        'fpn_act': 'silu',
+        'fpn_norm': 'BN',
+        'fpn_depthwise': False,
+        'hidden_dim': 256,
+        'en_num_heads': 8,
+        'en_num_layers': 1,
+        'en_mlp_ratio': 4.0,
+        'en_dropout': 0.0,
+        'pe_temperature': 10000.,
+        'en_act': 'gelu',
+        # Transformer Decoder
+        'transformer': 'rtdetr_transformer',
+        'hidden_dim': 256,
+        'de_num_heads': 8,
+        'de_num_layers': 6,
+        'de_mlp_ratio': 4.0,
+        'de_dropout': 0.0,
+        'de_act': 'relu',
+        'de_num_points': 4,
+        'num_queries': 300,
+        'learnt_init_query': False,
+        'pe_temperature': 10000.,
+        'dn_num_denoising': 100,
+        'dn_label_noise_ratio': 0.5,
+        'dn_box_noise_scale': 1,
+        # Head
+        'det_head': 'dino_head',
+        # ---------------- Assignment config ----------------
+        'matcher_hpy': {'cost_class': 2.0,
+                        'cost_bbox': 5.0,
+                        'cost_giou': 2.0,},
+        # ---------------- Loss config ----------------
+        'use_vfl': True,
+        'loss_coeff': {'class': 1,
+                       'bbox': 5,
+                       'giou': 2,},
+        # ---------------- Train config ----------------
+        ## input
+        'multi_scale': [0.5, 1.25],   # 320 -> 800
+        'trans_type': 'rtdetr_base',
+        # ---------------- Train config ----------------
+        'trainer_type': 'rtdetr',
+    },
+
+    # Note: the RT-DETR configs below are not yet complete
+    'rtdetr_l':{
+        # ---------------- Model config ----------------
+        ## Model scale
+        'width': 1.0,
+        'depth': 1.0,
+        ## Image Encoder - Backbone
+        'backbone': 'rtcnet_l',
+        'pretrained': True,
+        'freeze_at': 0,
+        'freeze_stem_only': False,
+        'out_stride': [8, 16, 32],
+        'max_stride': 32,
+        ## Image Encoder - FPN
+        'fpn': 'hybrid_encoder',
+        'fpn_act': 'silu',
+        'fpn_norm': 'BN',
+        'fpn_depthwise': False,
+        'hidden_dim': 256,
+        'en_num_heads': 8,
+        'en_num_layers': 1,
+        'en_mlp_ratio': 4.0,
+        'en_dropout': 0.0,
+        'pe_temperature': 10000.,
+        'en_act': 'gelu',
+        # Transformer Decoder
+        'transformer': 'rtdetr_transformer',
+        'hidden_dim': 256,
+        'de_num_heads': 8,
+        'de_num_layers': 6,
+        'de_mlp_ratio': 4.0,
+        'de_dropout': 0.0,
+        'de_act': 'relu',
+        'de_num_points': 4,
+        'num_queries': 300,
+        'learnt_init_query': False,
+        'pe_temperature': 10000.,
+        'dn_num_denoising': 100,
+        'dn_label_noise_ratio': 0.5,
+        'dn_box_noise_scale': 1,
+        # Head
+        'det_head': 'dino_head',
+        # ---------------- Assignment config ----------------
+        'matcher_hpy': {'cost_class': 2.0,
+                        'cost_bbox': 5.0,
+                        'cost_giou': 2.0,},
+        # ---------------- Loss config ----------------
+        'use_vfl': True,
+        'loss_coeff': {'class': 1,
+                       'bbox': 5,
+                       'giou': 2,},
+        # ---------------- Train config ----------------
+        ## input
+        'multi_scale': [0.5, 1.25],   # 320 -> 800
+        'trans_type': 'rtdetr_base',
+        # ---------------- Train config ----------------
+        'trainer_type': 'rtdetr',
+    },
+
 }
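Each entry above is a plain Python dict keyed by model name. A minimal sketch of how a config is consumed, assuming the import path matches the file shown in this diff and borrowing the width-scaling line that rtdetr.py gains later in this commit:

    from config.model_config.rtdetr_config import rtdetr_cfg

    cfg = rtdetr_cfg['rtdetr_r50']
    # rtdetr.py scales the hidden size by the width factor before building the model
    cfg['hidden_dim'] = round(cfg['hidden_dim'] * cfg['width'])   # 256 * 1.0 -> 256
    print(cfg['backbone'], cfg['hidden_dim'])                     # resnet50 256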

+ 39 - 1
dataset/data_augment/rtdetr_augment.py

@@ -113,6 +113,34 @@ class RandomPhotometricDistort(object):
             return scale
         return 1 / scale
 
+## Random Expand (paste the image onto a larger mean-filled canvas)
+class RandomExpand(object):
+    def __init__(self, fill_value) -> None:
+        self.fill_value = fill_value
+
+    def __call__(self, image, target=None):
+        if random.randint(2):
+            return image, target
+
+        height, width, channels = image.shape
+        ratio = random.uniform(1, 4)
+        left = random.uniform(0, width*ratio - width)
+        top = random.uniform(0, height*ratio - height)
+
+        expand_image = np.ones(
+            (int(height*ratio), int(width*ratio), channels),
+            dtype=image.dtype) * self.fill_value
+        expand_image[int(top):int(top + height),
+                     int(left):int(left + width)] = image
+        image = expand_image
+
+        if target is not None:
+            # shift the absolute xyxy boxes by the paste offset
+            boxes = target['boxes'].copy()
+            boxes[:, :2] += (int(left), int(top))
+            boxes[:, 2:] += (int(left), int(top))
+            target['boxes'] = boxes
+
+        return image, target
+
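A minimal sketch of the new op in isolation, assuming the module's `from numpy import random` import, HWC images, and absolute xyxy boxes; the fill value here is illustrative (the pipeline passes the BGR pixel mean):

    import numpy as np

    image = np.zeros((100, 200, 3), dtype=np.uint8)        # H=100, W=200
    target = {'boxes': np.array([[10., 20., 50., 60.]])}   # absolute xyxy

    expand = RandomExpand(fill_value=(104, 117, 124))      # ~BGR ImageNet mean
    out_image, out_target = expand(image, target)
    # Half the time the input is returned unchanged; otherwise the image is
    # pasted at a random offset onto a 1x-4x larger canvas filled with
    # fill_value, and the boxes are shifted by the same offset.
    print(out_image.shape, out_target['boxes'][0])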
 ## Random IoU based Sample Crop
 class RandomSampleCrop(object):
     def __init__(self):
@@ -321,9 +349,10 @@ class RTDetrAugmentation(object):
                 ToTensor()
             ])
         else:
-            # For no-mosaic setting, we use RandomSampleCrop processor.
+            # For the no-mosaic setting, use the RandomExpand & RandomSampleCrop processors.
             self.augment = Compose([
                 RandomPhotometricDistort(hue=0.5, saturation=1.5, exposure=1.5),
+                RandomExpand(self.pixel_mean[::-1]),
                 RandomSampleCrop(),
                 RandomHorizontalFlip(p=0.5),
                 Resize(img_size=self.img_size),
@@ -332,6 +361,15 @@ class RTDetrAugmentation(object):
                 ToTensor()
             ])
 
+    def set_weak_augment(self):
+        self.augment = Compose([
+            RandomHorizontalFlip(p=0.5),
+            Resize(img_size=self.img_size),
+            ConvertColorFormat(self.color_format),
+            Normalize(self.pixel_mean, self.pixel_std),
+            ToTensor()
+        ])
+
     def __call__(self, image, target, mosaic=False):
         orig_h, orig_w = image.shape[:2]
         ratio = [self.img_size / orig_w, self.img_size / orig_h]

+ 2 - 2
dataset/voc.py

@@ -273,8 +273,8 @@ if __name__ == "__main__":
 
     trans_config = {
         'aug_type': args.aug_type,    # optional: ssd, yolov5
-        'pixel_mean': [0., 0., 0.],
-        'pixel_std':  [255., 255., 255.],
+        'pixel_mean': [123.675, 116.28, 103.53],   # ImageNet mean (0-255 scale)
+        'pixel_std':  [58.395, 57.12, 57.375],     # ImageNet std  (0-255 scale)
         # Basic Augment
         'degrees': 0.0,
         'translate': 0.2,
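The old values only rescaled pixels to [0, 1]; the new ones are the standard ImageNet statistics on the 0-255 scale, matching the pretrained ResNet backbones. A quick check of what the per-channel result looks like, assuming Normalize applies the usual (x - mean) / std:

    import numpy as np

    pixel_mean = np.array([123.675, 116.28, 103.53])
    pixel_std  = np.array([58.395, 57.12, 57.375])
    rgb = np.array([128., 128., 128.])            # a mid-gray RGB pixel
    print((rgb - pixel_mean) / pixel_std)         # ~[0.074 0.205 0.427]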

+ 2 - 0
engine.py

@@ -1479,6 +1479,8 @@ class RTRTrainer(object):
         print(' - Rebuild transforms ...')
         self.train_transform, self.trans_cfg = build_transform(
             args=self.args, trans_config=self.trans_cfg, max_stride=self.model_cfg['max_stride'], is_train=True)
+        
+        self.train_transform.set_weak_augment()
         self.train_loader.dataset.transform = self.train_transform
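The added call is the new RTDetrAugmentation.set_weak_augment() from this commit: when the trainer rebuilds its transforms late in training, the strong pipeline (photometric distort, expand, crop) is dropped and only flip, resize, and normalization remain. The pattern, sketched with illustrative variable names around the real build_transform/set_weak_augment calls:

    transform, trans_cfg = build_transform(args=args, trans_config=trans_cfg,
                                           max_stride=32, is_train=True)
    transform.set_weak_augment()               # strong ops removed in place
    train_loader.dataset.transform = transform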
         
 

+ 104 - 0
models/detectors/rtdetr/basic_modules/basic.py

@@ -170,6 +170,7 @@ class PointwiseConv(nn.Module):
         return self.act(self.norm(self.conv(x)))
 
 
+
 # ----------------- CNN Modules -----------------
 class Bottleneck(nn.Module):
     def __init__(self,
@@ -231,3 +232,106 @@ class RTCBlock(nn.Module):
         out = self.output_proj(torch.cat(out, dim=1))
 
         return out
+
+class RepVggBlock(nn.Module):
+    def __init__(self, in_dim, out_dim, act_type='relu', norm_type='BN', alpha=False):
+        super(RepVggBlock, self).__init__()
+        self.in_dim = in_dim
+        self.out_dim = out_dim
+        self.conv1 = BasicConv(in_dim, out_dim, kernel_size=3, padding=1, act_type=None, norm_type=norm_type)
+        # RepVGG-style 1x1 branch; get_equivalent_kernel_bias() pads it to 3x3 at fuse time
+        self.conv2 = BasicConv(in_dim, out_dim, kernel_size=1, act_type=None, norm_type=norm_type)
+        self.act = get_activation(act_type)
+
+        if alpha:
+            self.alpha = nn.Parameter(torch.as_tensor([1.0]).float())
+        else:
+            self.alpha = None
+
+    def forward(self, x):
+        if hasattr(self, 'conv'):
+            y = self.conv(x)
+        else:
+            if self.alpha is not None:
+                y = self.conv1(x) + self.alpha * self.conv2(x)
+            else:
+                y = self.conv1(x) + self.conv2(x)
+        y = self.act(y)
+        return y
+
+    def convert_to_deploy(self):
+        if not hasattr(self, 'conv'):
+            self.conv = nn.Conv2d(
+                self.in_dim,
+                self.out_dim,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                groups=1)
+        kernel, bias = self.get_equivalent_kernel_bias()
+        # copy the fused parameters into the single deploy-time conv
+        self.conv.weight.data = kernel
+        self.conv.bias.data = bias
+        self.__delattr__('conv1')
+        self.__delattr__('conv2')
+
+    def get_equivalent_kernel_bias(self):
+        kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1)
+        kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2)
+        if self.alpha is not None:
+            return kernel3x3 + self.alpha * self._pad_1x1_to_3x3_tensor(
+                kernel1x1), bias3x3 + self.alpha * bias1x1
+        else:
+            return kernel3x3 + self._pad_1x1_to_3x3_tensor(
+                kernel1x1), bias3x3 + bias1x1
+
+    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
+        if kernel1x1 is None:
+            return 0
+        else:
+            return nn.functional.pad(kernel1x1, [1, 1, 1, 1])
+
+    def _fuse_bn_tensor(self, branch):
+        # Fold a Conv+BN branch into an equivalent kernel and bias.
+        # Assumes BasicConv exposes its layers as `.conv` and `.bn`; the
+        # buffers below are torch.nn.BatchNorm2d names (the Paddle
+        # `_mean`/`_variance`/`_epsilon` attributes do not exist in PyTorch).
+        if branch is None:
+            return 0, 0
+        kernel = branch.conv.weight
+        running_mean = branch.bn.running_mean
+        running_var = branch.bn.running_var
+        gamma = branch.bn.weight
+        beta = branch.bn.bias
+        eps = branch.bn.eps
+        std = (running_var + eps).sqrt()
+        t = (gamma / std).reshape((-1, 1, 1, 1))
+
+        return kernel * t, beta - running_mean * gamma / std
+
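A self-contained sanity check of the BN-folding identity used above, written with plain torch layers since BasicConv's internals are not shown in this diff:

    import torch
    import torch.nn as nn

    conv = nn.Conv2d(8, 8, kernel_size=3, padding=1, bias=False)
    bn = nn.BatchNorm2d(8).eval()
    bn.running_mean.uniform_(-1., 1.)
    bn.running_var.uniform_(0.5, 2.)

    std = (bn.running_var + bn.eps).sqrt()
    t = (bn.weight / std).reshape(-1, 1, 1, 1)
    fused_w = conv.weight * t                               # kernel * t
    fused_b = bn.bias - bn.running_mean * bn.weight / std   # beta - mean * gamma / std

    x = torch.randn(1, 8, 16, 16)
    ref = bn(conv(x))
    out = nn.functional.conv2d(x, fused_w, fused_b, padding=1)
    print(torch.allclose(ref, out, atol=1e-5))              # True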
+class CSPRepLayer(nn.Module):
+    def __init__(self,
+                 in_dim     :int,
+                 out_dim    :int,
+                 num_blocks :int   = 3,
+                 expansion  :float = 1.0,
+                 act_type   :str   ="silu",
+                 norm_type  :str   = 'BN'):
+        super(CSPRepLayer, self).__init__()
+        hidden_dim = int(out_dim * expansion)
+        self.conv1 = BasicConv(
+            in_dim, hidden_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
+        self.conv2 = BasicConv(
+            in_dim, hidden_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
+        self.bottlenecks = nn.Sequential(*[
+            RepVggBlock(
+                hidden_dim, hidden_dim, act_type=act_type, norm_type=norm_type)
+            for _ in range(num_blocks)
+        ])
+        if hidden_dim != out_dim:
+            self.conv3 = BasicConv(hidden_dim, out_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
+        else:
+            self.conv3 = nn.Identity()
+
+    def forward(self, x):
+        x_1 = self.conv1(x)
+        x_1 = self.bottlenecks(x_1)
+        x_2 = self.conv2(x)
+
+        return self.conv3(x_1 + x_2)
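A quick shape check for CSPRepLayer (a sketch; assumes torch is imported and BasicConv/RepVggBlock come from this module). With expansion=1.0 the hidden width equals out_dim, so conv3 is an Identity:

    layer = CSPRepLayer(in_dim=512, out_dim=256, num_blocks=3, expansion=1.0)
    y = layer(torch.randn(2, 512, 20, 20))
    print(y.shape)   # torch.Size([2, 256, 20, 20])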

+ 146 - 7
models/detectors/rtdetr/basic_modules/fpn.py

@@ -4,10 +4,10 @@ import torch.nn.functional as F
 from typing import List
 
 try:
-    from .basic import BasicConv, RTCBlock
+    from .basic import BasicConv, RTCBlock, CSPRepLayer
     from .transformer import TransformerEncoder
 except:
-    from  basic import BasicConv, RTCBlock
+    from  basic import BasicConv, RTCBlock, CSPRepLayer
     from  transformer import TransformerEncoder
 
 
@@ -16,7 +16,6 @@ def build_fpn(cfg, in_dims, out_dim):
     if cfg['fpn'] == 'hybrid_encoder':
         return HybridEncoder(in_dims     = in_dims,
                              out_dim     = out_dim,
-                             width       = cfg['width'],
                              depth       = cfg['depth'],
                              act_type    = cfg['fpn_act'],
                              norm_type   = cfg['fpn_norm'],
@@ -28,6 +27,21 @@ def build_fpn(cfg, in_dims, out_dim):
                              pe_temperature = cfg['pe_temperature'],
                              en_act_type    = cfg['en_act'],
                              )
+    elif cfg['fpn'] == 'pp_hybrid_encoder':
+        return PPHybridEncoder(in_dims     = in_dims,
+                               out_dim     = out_dim,
+                               depth       = cfg['depth'],
+                               expansion   = cfg['expansion'],
+                               act_type    = cfg['fpn_act'],
+                               norm_type   = cfg['fpn_norm'],
+                               depthwise   = cfg['fpn_depthwise'],
+                               num_heads   = cfg['en_num_heads'],
+                               num_layers  = cfg['en_num_layers'],
+                               mlp_ratio   = cfg['en_mlp_ratio'],
+                               dropout     = cfg['en_dropout'],
+                               pe_temperature = cfg['pe_temperature'],
+                               en_act_type    = cfg['en_act'],
+                               )
     else:
         raise NotImplementedError("Unknown PaFPN: <{}>".format(cfg['fpn']))
 
@@ -38,7 +52,6 @@ class HybridEncoder(nn.Module):
     def __init__(self, 
                  in_dims     :List  = [256, 512, 1024],
                  out_dim     :int   = 256,
-                 width       :float = 1.0,
                  depth       :float = 1.0,
                  act_type    :str   = 'silu',
                  norm_type   :str   = 'BN',
@@ -56,9 +69,8 @@ class HybridEncoder(nn.Module):
         print('FPN: {}'.format("RTC-PaFPN"))
         # ---------------- Basic parameters ----------------
         self.in_dims = in_dims
-        self.out_dim = round(out_dim * width)
+        self.out_dim = out_dim
         self.out_dims = [self.out_dim] * len(in_dims)
-        self.width = width
         self.depth = depth
         self.num_heads = num_heads
         self.num_layers = num_layers
@@ -164,6 +176,132 @@ class HybridEncoder(nn.Module):
         return out_feats
 
 
+## PaddlePaddle Hybrid Encoder (Transformer encoder + Convolutional PaFPN)
+class PPHybridEncoder(nn.Module):
+    def __init__(self, 
+                 in_dims     :List  = [256, 512, 1024],
+                 out_dim     :int   = 256,
+                 depth       :float = 1.0,
+                 expansion   :float = 1.0,
+                 act_type    :str   = 'silu',
+                 norm_type   :str   = 'BN',
+                 depthwise   :bool  = False,
+                 # Transformer's parameters
+                 num_heads      :int   = 8,
+                 num_layers     :int   = 1,
+                 mlp_ratio      :float = 4.0,
+                 dropout        :float = 0.1,
+                 pe_temperature :float = 10000.,
+                 en_act_type    :str   = 'gelu'
+                 ) -> None:
+        super(PPHybridEncoder, self).__init__()
+        print('==============================')
+        print('FPN: {}'.format("RTC-PaFPN"))
+        # ---------------- Basic parameters ----------------
+        self.in_dims = in_dims
+        self.out_dim = out_dim
+        self.out_dims = [self.out_dim] * len(in_dims)
+        self.depth = depth
+        self.num_heads = num_heads
+        self.num_layers = num_layers
+        self.mlp_ratio = mlp_ratio
+        c3, c4, c5 = in_dims
+
+        # ---------------- Input projs ----------------
+        self.reduce_layer_1 = BasicConv(c5, self.out_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
+        self.reduce_layer_2 = BasicConv(c4, self.out_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
+        self.reduce_layer_3 = BasicConv(c3, self.out_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
+
+        # ---------------- Downsample ----------------
+        self.downsample_layer_1 = BasicConv(self.out_dim, self.out_dim, kernel_size=3, padding=1, stride=2, act_type=act_type, norm_type=norm_type)
+        self.downsample_layer_2 = BasicConv(self.out_dim, self.out_dim, kernel_size=3, padding=1, stride=2, act_type=act_type, norm_type=norm_type)
+
+        # ---------------- Transformer Encoder ----------------
+        self.transformer_encoder = TransformerEncoder(d_model        = self.out_dim,
+                                                      num_heads      = num_heads,
+                                                      num_layers     = num_layers,
+                                                      mlp_ratio      = mlp_ratio,
+                                                      pe_temperature = pe_temperature,
+                                                      dropout        = dropout,
+                                                      act_type       = en_act_type
+                                                      )
+
+        # ---------------- Top down FPN ----------------
+        ## P5 -> P4
+        self.top_down_layer_1 = CSPRepLayer(in_dim       = self.out_dim * 2,
+                                            out_dim      = self.out_dim,
+                                            num_blocks   = round(3*depth),
+                                            expansion    = expansion,
+                                            act_type     = act_type,
+                                            norm_type    = norm_type,
+                                            )
+        ## P4 -> P3
+        self.top_down_layer_2 = CSPRepLayer(in_dim       = self.out_dim * 2,
+                                            out_dim      = self.out_dim,
+                                            num_blocks   = round(3*depth),
+                                            expansion    = expansion,
+                                            act_type     = act_type,
+                                            norm_type    = norm_type,
+                                            )
+        
+        # ---------------- Bottom up PAN ----------------
+        ## P3 -> P4
+        self.bottom_up_layer_1 = CSPRepLayer(in_dim       = self.out_dim * 2,
+                                             out_dim      = self.out_dim,
+                                             num_blocks   = round(3*depth),
+                                             expansion    = expansion,
+                                             act_type     = act_type,
+                                             norm_type    = norm_type,
+                                             )
+        ## P4 -> P5
+        self.bottom_up_layer_2 = CSPRepLayer(in_dim       = self.out_dim * 2,
+                                             out_dim      = self.out_dim,
+                                             num_blocks   = round(3*depth),
+                                             expansion    = expansion,
+                                             act_type     = act_type,
+                                             norm_type    = norm_type,
+                                             )
+
+        self.init_weights()
+  
+    def init_weights(self):
+        """Initialize the parameters."""
+        for m in self.modules():
+            if isinstance(m, torch.nn.Conv2d):
+                # In order to be consistent with the source code,
+                # reset the Conv2d initialization parameters
+                m.reset_parameters()
+
+    def forward(self, features):
+        c3, c4, c5 = features
+
+        # -------- Input projs --------
+        p5 = self.reduce_layer_1(c5)
+        p4 = self.reduce_layer_2(c4)
+        p3 = self.reduce_layer_3(c3)
+
+        # -------- Transformer encoder --------
+        p5 = self.transformer_encoder(p5)
+
+        # -------- Top down FPN --------
+        p5_up = F.interpolate(p5, scale_factor=2.0)
+        p4 = self.top_down_layer_1(torch.cat([p4, p5_up], dim=1))
+
+        p4_up = F.interpolate(p4, scale_factor=2.0)
+        p3 = self.top_down_layer_2(torch.cat([p3, p4_up], dim=1))
+
+        # -------- Bottom up PAN --------
+        p3_ds = self.downsample_layer_1(p3)
+        p4 = self.bottom_up_layer_1(torch.cat([p4, p3_ds], dim=1))
+
+        p4_ds = self.downsample_layer_2(p4)
+        p5 = self.bottom_up_layer_2(torch.cat([p5, p4_ds], dim=1))
+
+        out_feats = [p3, p4, p5]
+        
+        return out_feats
+
+
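A sketch of driving the new encoder through build_fpn with the cfg keys this diff expects ('fpn': 'pp_hybrid_encoder' plus the new 'expansion'); assumes torch is imported, and the feature shapes correspond to ResNet-50 C3-C5 at a 640x640 input:

    cfg = {'fpn': 'pp_hybrid_encoder', 'depth': 1.0, 'expansion': 1.0,
           'fpn_act': 'silu', 'fpn_norm': 'BN', 'fpn_depthwise': False,
           'en_num_heads': 8, 'en_num_layers': 1, 'en_mlp_ratio': 4.0,
           'en_dropout': 0.0, 'pe_temperature': 10000., 'en_act': 'gelu'}
    fpn = build_fpn(cfg, in_dims=[512, 1024, 2048], out_dim=256)
    feats = fpn([torch.randn(1, 512, 80, 80),
                 torch.randn(1, 1024, 40, 40),
                 torch.randn(1, 2048, 20, 20)])
    print([f.shape for f in feats])   # all 256-channel, strides 8/16/32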
 if __name__ == '__main__':
     import time
     from thop import profile
@@ -174,10 +312,11 @@ if __name__ == '__main__':
         'fpn_act': 'silu',
         'fpn_norm': 'BN',
         'fpn_depthwise': False,
+        'expansion': 1.0,
         'en_num_heads': 8,
         'en_num_layers': 1,
         'en_mlp_ratio': 4.0,
-        'en_dropout': 0.1,
+        'en_dropout': 0.0,
         'pe_temperature': 10000.,
         'en_act': 'gelu',
     }

+ 10 - 4
models/detectors/rtdetr/rtdetr.py

@@ -26,6 +26,8 @@ class RT_DETR(nn.Module):
         self.conf_thresh = conf_thresh
         self.no_multi_labels = no_multi_labels
         self.deploy = deploy
+        # scale hidden channels by the model's width factor
+        cfg['hidden_dim'] = round(cfg['hidden_dim'] * cfg['width'])
 
         # ----------- Network setting -----------
         ## Image encoder
@@ -137,6 +139,10 @@ if __name__ == '__main__':
         'res5_dilation': False,
         'pretrained': True,
         'pretrained_weight': 'imagenet1k_v1',
+        'freeze_at': 0,
+        'freeze_stem_only': False,
+        'out_stride': [8, 16, 32],
+        'max_stride': 32,
         # Image Encoder - FPN
         'fpn': 'hybrid_encoder',
         'fpn_act': 'silu',
@@ -146,14 +152,14 @@ if __name__ == '__main__':
         'en_num_heads': 8,
         'en_num_layers': 1,
         'en_mlp_ratio': 4.0,
-        'en_dropout': 0.1,
+        'en_dropout': 0.0,
         'pe_temperature': 10000.,
         'en_act': 'gelu',
         # Transformer Decoder
         'transformer': 'rtdetr_transformer',
         'hidden_dim': 256,
         'de_num_heads': 8,
-        'de_num_layers': 6,
+        'de_num_layers': 3,
         'de_mlp_ratio': 4.0,
         'de_dropout': 0.0,
         'de_act': 'gelu',
@@ -186,11 +192,11 @@ if __name__ == '__main__':
     }] * bs
 
     # Create model
-    model = RT_DETR(cfg, num_classes=80)
+    model = RT_DETR(cfg, num_classes=20)
     model.train()
 
     # Create criterion
-    criterion = build_criterion(cfg, num_classes=80)
+    criterion = build_criterion(cfg, num_classes=20)
 
     # Model inference
     t0 = time.time()