yjh0410 1 year ago
parent
commit
b4813721db

+ 63 - 10
config/model_config/rtdetr_config.py

@@ -5,9 +5,6 @@
 rtdetr_cfg = {
     'rtdetr_r18':{
         # ---------------- Model config ----------------
-        ## Model scale
-        'width': 1.0,
-        'depth': 1.0,
         ## Image Encoder - Backbone
         'backbone': 'resnet18',
         'backbone_norm': 'FrozeBN',
@@ -19,13 +16,14 @@ rtdetr_cfg = {
         'max_stride': 32,
         ## Image Encoder - FPN
         'fpn': 'hybrid_encoder',
+        'fpn_num_blocks': 3,
         'fpn_act': 'silu',
         'fpn_norm': 'BN',
         'fpn_depthwise': False,
         'hidden_dim': 256,
         'en_num_heads': 8,
         'en_num_layers': 1,
-        'en_mlp_ratio': 4.0,
+        'en_ffn_dim': 1024,
         'en_dropout': 0.0,
         'pe_temperature': 10000.,
         'en_act': 'gelu',
@@ -33,7 +31,7 @@ rtdetr_cfg = {
         'transformer': 'rtdetr_transformer',
         'de_num_heads': 8,
         'de_num_layers': 3,
-        'de_mlp_ratio': 4.0,
+        'de_ffn_dim': 1024,
         'de_dropout': 0.0,
         'de_act': 'relu',
         'de_num_points': 4,
@@ -62,9 +60,6 @@ rtdetr_cfg = {
 
     'rtdetr_r50':{
         # ---------------- Model config ----------------
-        ## Model scale
-        'width': 1.0,
-        'depth': 1.0,
         ## Image Encoder - Backbone
         'backbone': 'resnet50',
         'backbone_norm': 'FrozeBN',
@@ -76,13 +71,14 @@ rtdetr_cfg = {
         'max_stride': 32,
         ## Image Encoder - FPN
         'fpn': 'hybrid_encoder',
+        'fpn_num_blocks': 3,
         'fpn_act': 'silu',
         'fpn_norm': 'BN',
         'fpn_depthwise': False,
         'hidden_dim': 256,
         'en_num_heads': 8,
         'en_num_layers': 1,
-        'en_mlp_ratio': 4.0,
+        'en_ffn_dim': 1024,
         'en_dropout': 0.0,
         'pe_temperature': 10000.,
         'en_act': 'gelu',
@@ -90,7 +86,64 @@ rtdetr_cfg = {
         'transformer': 'rtdetr_transformer',
         'de_num_heads': 8,
         'de_num_layers': 6,
-        'de_mlp_ratio': 4.0,
+        'de_ffn_dim': 1024,
+        'de_dropout': 0.0,
+        'de_act': 'relu',
+        'de_num_points': 4,
+        'num_queries': 300,
+        'learnt_init_query': False,
+        'pe_temperature': 10000.,
+        'dn_num_denoising': 100,
+        'dn_label_noise_ratio': 0.5,
+        'dn_box_noise_scale': 1,
+        # Head
+        'det_head': 'dino_head',
+        # ---------------- Assignment config ----------------
+        'matcher_hpy': {'cost_class': 2.0,
+                        'cost_bbox': 5.0,
+                        'cost_giou': 2.0,},
+        # ---------------- Loss config ----------------
+        'use_vfl': True,
+        'loss_coeff': {'class': 1,
+                       'bbox': 5,
+                       'giou': 2,},
+        # ---------------- Train config ----------------
+        ## input
+        'multi_scale': [0.5, 1.25],   # 320 -> 800
+        'trans_type': 'rtdetr_base',
+        # ---------------- Train config ----------------
+        'trainer_type': 'rtdetr',
+    },
+
+    'rtdetr_r101':{
+        # ---------------- Model config ----------------
+        ## Image Encoder - Backbone
+        'backbone': 'resnet101',
+        'backbone_norm': 'FrozeBN',
+        'pretrained': True,
+        'pretrained_weight': 'imagenet1k_v2',
+        'freeze_at': 0,
+        'freeze_stem_only': False,
+        'out_stride': [8, 16, 32],
+        'max_stride': 32,
+        ## Image Encoder - FPN
+        'fpn': 'hybrid_encoder',
+        'fpn_num_blocks': 4,
+        'fpn_act': 'silu',
+        'fpn_norm': 'BN',
+        'fpn_depthwise': False,
+        'hidden_dim': 384,
+        'en_num_heads': 8,
+        'en_num_layers': 1,
+        'en_ffn_dim': 2048,
+        'en_dropout': 0.0,
+        'pe_temperature': 10000.,
+        'en_act': 'gelu',
+        # Transformer Decoder
+        'transformer': 'rtdetr_transformer',
+        'de_num_heads': 8,
+        'de_num_layers': 6,
+        'de_ffn_dim': 2048,
         'de_dropout': 0.0,
         'de_act': 'relu',
         'de_num_points': 4,
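
A quick sanity check (illustrative only, not part of the commit): for the 256-dim configs, the new explicit en_ffn_dim / de_ffn_dim values reproduce what the removed mlp_ratio settings used to compute, while the new rtdetr_r101 entry states its widths (hidden_dim 384, FFN 2048) directly rather than via a ratio.

    # Old behaviour: the FFN width was derived as round(d_model * mlp_ratio).
    hidden_dim, old_mlp_ratio = 256, 4.0
    assert round(hidden_dim * old_mlp_ratio) == 1024   # == the new 'en_ffn_dim' / 'de_ffn_dim'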

+ 2 - 2
config/model_config/rtpdetr_config.py

@@ -19,14 +19,14 @@ rtpdetr_cfg = {
         'hidden_dim': 256,
         'en_num_heads': 8,
         'en_num_layers': 6,
-        'en_mlp_ratio': 4.0,
+        'en_ffn_dim': 1024,
         'en_dropout': 0.0,
         'en_act': 'gelu',
         # Transformer Decoder
         'transformer': 'plain_detr_transformer',
         'de_num_heads': 8,
         'de_num_layers': 6,
-        'de_mlp_ratio': 4.0,
+        'de_ffn_dim': 1024,
         'de_dropout': 0.0,
         'de_act': 'gelu',
         'de_pre_norm': True,

+ 4 - 4
models/detectors/rtdetr/basic_modules/basic.py

@@ -88,13 +88,13 @@ class MLP(nn.Module):
         return x
 
 class FFN(nn.Module):
-    def __init__(self, d_model=256, mlp_ratio=4.0, dropout=0., act_type='relu'):
+    def __init__(self, d_model=256, ffn_dim=1024, dropout=0., act_type='relu'):
         super().__init__()
-        self.fpn_dim = round(d_model * mlp_ratio)
-        self.linear1 = nn.Linear(d_model, self.fpn_dim)
+        self.ffn_dim = ffn_dim
+        self.linear1 = nn.Linear(d_model, self.ffn_dim)
         self.activation = get_activation(act_type)
         self.dropout2 = nn.Dropout(dropout)
-        self.linear2 = nn.Linear(self.fpn_dim, d_model)
+        self.linear2 = nn.Linear(self.ffn_dim, d_model)
         self.dropout3 = nn.Dropout(dropout)
         self.norm = nn.LayerNorm(d_model)
 
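
For context, a self-contained sketch of the reworked FFN block. The forward pass is not part of this hunk, so the residual / post-norm wiring below is an assumption, and get_activation is replaced by a plain stand-in:

    import torch
    import torch.nn as nn

    class FFN(nn.Module):
        # Hidden width is now an explicit ffn_dim instead of round(d_model * mlp_ratio).
        def __init__(self, d_model=256, ffn_dim=1024, dropout=0., act_type='relu'):
            super().__init__()
            self.ffn_dim = ffn_dim
            self.linear1 = nn.Linear(d_model, self.ffn_dim)
            self.activation = nn.GELU() if act_type == 'gelu' else nn.ReLU()  # stand-in for get_activation
            self.dropout2 = nn.Dropout(dropout)
            self.linear2 = nn.Linear(self.ffn_dim, d_model)
            self.dropout3 = nn.Dropout(dropout)
            self.norm = nn.LayerNorm(d_model)

        def forward(self, x):
            # Assumed DETR-style post-norm residual FFN (not shown in this hunk).
            out = self.linear2(self.dropout2(self.activation(self.linear1(x))))
            return self.norm(x + self.dropout3(out))

    x = torch.randn(2, 300, 256)
    print(FFN()(x).shape)   # torch.Size([2, 300, 256])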

+ 14 - 17
models/detectors/rtdetr/basic_modules/fpn.py

@@ -4,10 +4,10 @@ import torch.nn.functional as F
 from typing import List
 
 try:
-    from .basic import BasicConv, RTCBlock, CSPRepLayer
+    from .basic import BasicConv, RTCBlock
     from .transformer import TransformerEncoder
 except:
-    from  basic import BasicConv, RTCBlock, CSPRepLayer
+    from  basic import BasicConv, RTCBlock
     from  transformer import TransformerEncoder
 
 
@@ -16,13 +16,13 @@ def build_fpn(cfg, in_dims, out_dim):
     if cfg['fpn'] == 'hybrid_encoder':
         return HybridEncoder(in_dims     = in_dims,
                              out_dim     = out_dim,
-                             depth       = cfg['depth'],
+                             num_blocks  = cfg['fpn_num_blocks'],
                              act_type    = cfg['fpn_act'],
                              norm_type   = cfg['fpn_norm'],
                              depthwise   = cfg['fpn_depthwise'],
                              num_heads   = cfg['en_num_heads'],
                              num_layers  = cfg['en_num_layers'],
-                             mlp_ratio   = cfg['en_mlp_ratio'],
+                             ffn_dim     = cfg['en_ffn_dim'],
                              dropout     = cfg['en_dropout'],
                              pe_temperature = cfg['pe_temperature'],
                              en_act_type    = cfg['en_act'],
@@ -37,14 +37,14 @@ class HybridEncoder(nn.Module):
     def __init__(self, 
                  in_dims     :List  = [256, 512, 1024],
                  out_dim     :int   = 256,
-                 depth       :float = 1.0,
+                 num_blocks  :int   = 3,
                  act_type    :str   = 'silu',
                  norm_type   :str   = 'BN',
                  depthwise   :bool  = False,
                  # Transformer's parameters
                  num_heads      :int   = 8,
                  num_layers     :int   = 1,
-                 mlp_ratio      :float = 4.0,
+                 ffn_dim        :int   = 1024,
                  dropout        :float = 0.1,
                  pe_temperature :float = 10000.,
                  en_act_type    :str   = 'gelu'
@@ -56,10 +56,9 @@ class HybridEncoder(nn.Module):
         self.in_dims = in_dims
         self.out_dim = out_dim
         self.out_dims = [self.out_dim] * len(in_dims)
-        self.depth = depth
         self.num_heads = num_heads
         self.num_layers = num_layers
-        self.mlp_ratio = mlp_ratio
+        self.ffn_dim = ffn_dim
         c3, c4, c5 = in_dims
 
         # ---------------- Input projs ----------------
@@ -75,7 +74,7 @@ class HybridEncoder(nn.Module):
         self.transformer_encoder = TransformerEncoder(d_model        = self.out_dim,
                                                       num_heads      = num_heads,
                                                       num_layers     = num_layers,
-                                                      mlp_ratio      = mlp_ratio,
+                                                      ffn_dim        = ffn_dim,
                                                       pe_temperature = pe_temperature,
                                                       dropout        = dropout,
                                                       act_type       = en_act_type
@@ -85,7 +84,7 @@ class HybridEncoder(nn.Module):
         ## P5 -> P4
         self.top_down_layer_1 = RTCBlock(in_dim       = self.out_dim * 2,
                                          out_dim      = self.out_dim,
-                                         num_blocks   = round(3*depth),
+                                         num_blocks   = num_blocks,
                                          shortcut     = False,
                                          act_type     = act_type,
                                          norm_type    = norm_type,
@@ -94,7 +93,7 @@ class HybridEncoder(nn.Module):
         ## P4 -> P3
         self.top_down_layer_2 = RTCBlock(in_dim       = self.out_dim * 2,
                                          out_dim      = self.out_dim,
-                                         num_blocks   = round(3*depth),
+                                         num_blocks   = num_blocks,
                                          shortcut     = False,
                                          act_type     = act_type,
                                          norm_type    = norm_type,
@@ -105,7 +104,7 @@ class HybridEncoder(nn.Module):
         ## P3 -> P4
         self.bottom_up_layer_1 = RTCBlock(in_dim       = self.out_dim * 2,
                                           out_dim      = self.out_dim,
-                                          num_blocks   = round(3*depth),
+                                          num_blocks   = num_blocks,
                                           shortcut     = False,
                                           act_type     = act_type,
                                           norm_type    = norm_type,
@@ -114,7 +113,7 @@ class HybridEncoder(nn.Module):
         ## P4 -> P5
         self.bottom_up_layer_2 = RTCBlock(in_dim       = self.out_dim * 2,
                                           out_dim      = self.out_dim,
-                                          num_blocks   = round(3*depth),
+                                          num_blocks   = num_blocks,
                                           shortcut     = False,
                                           act_type     = act_type,
                                           norm_type    = norm_type,
@@ -165,16 +164,14 @@ if __name__ == '__main__':
     import time
     from thop import profile
     cfg = {
-        'width': 1.0,
-        'depth': 1.0,
         'fpn': 'hybrid_encoder',
         'fpn_act': 'silu',
         'fpn_norm': 'BN',
         'fpn_depthwise': False,
-        'expansion': 1.0,
+        'fpn_num_blocks': 3,
         'en_num_heads': 8,
         'en_num_layers': 1,
-        'en_mlp_ratio': 4.0,
+        'en_ffn_dim': 1024,
         'en_dropout': 0.0,
         'pe_temperature': 10000.,
         'en_act': 'gelu',
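
A small check (illustrative, not from the commit) that the explicit fpn_num_blocks reproduces the block count the removed depth multiplier used to yield for the four RTCBlock stages:

    # Old: block count derived from a global depth multiplier.
    depth = 1.0
    old_num_blocks = round(3 * depth)        # -> 3

    # New: read directly from the config (4 for the wider rtdetr_r101 setup).
    cfg = {'fpn_num_blocks': 3}
    assert cfg['fpn_num_blocks'] == old_num_blocks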

+ 12 - 12
models/detectors/rtdetr/basic_modules/transformer.py

@@ -210,7 +210,7 @@ class TransformerEncoderLayer(nn.Module):
     def __init__(self,
                  d_model         :int   = 256,
                  num_heads       :int   = 8,
-                 mlp_ratio       :float = 4.0,
+                 ffn_dim         :int   = 1024,
                  dropout         :float = 0.1,
                  act_type        :str   = "relu",
                  ):
@@ -218,7 +218,7 @@ class TransformerEncoderLayer(nn.Module):
         # ----------- Basic parameters -----------
         self.d_model = d_model
         self.num_heads = num_heads
-        self.mlp_ratio = mlp_ratio
+        self.ffn_dim = ffn_dim
         self.dropout = dropout
         self.act_type = act_type
         # ----------- Basic parameters -----------
@@ -228,7 +228,7 @@ class TransformerEncoderLayer(nn.Module):
         self.norm = nn.LayerNorm(d_model)
 
         # Feedforward Network
-        self.ffn = FFN(d_model, mlp_ratio, dropout, act_type)
+        self.ffn = FFN(d_model, ffn_dim, dropout, act_type)
 
     def with_pos_embed(self, tensor, pos):
         return tensor if pos is None else tensor + pos
@@ -259,7 +259,7 @@ class TransformerEncoder(nn.Module):
                  d_model        :int   = 256,
                  num_heads      :int   = 8,
                  num_layers     :int   = 1,
-                 mlp_ratio      :float = 4.0,
+                 ffn_dim        :int   = 1024,
                  pe_temperature : float = 10000.,
                  dropout        :float = 0.1,
                  act_type       :str   = "relu",
@@ -269,14 +269,14 @@ class TransformerEncoder(nn.Module):
         self.d_model = d_model
         self.num_heads = num_heads
         self.num_layers = num_layers
-        self.mlp_ratio = mlp_ratio
+        self.ffn_dim = ffn_dim
         self.dropout = dropout
         self.act_type = act_type
         self.pe_temperature = pe_temperature
         self.pos_embed = None
         # ----------- Basic parameters -----------
         self.encoder_layers = get_clones(
-            TransformerEncoderLayer(d_model, num_heads, mlp_ratio, dropout, act_type), num_layers)
+            TransformerEncoderLayer(d_model, num_heads, ffn_dim, dropout, act_type), num_layers)
 
     def build_2d_sincos_position_embedding(self, device, w, h, embed_dim=256, temperature=10000.):
         assert embed_dim % 4 == 0, \
@@ -339,7 +339,7 @@ class DeformableTransformerDecoderLayer(nn.Module):
                  num_heads   :int   = 8,
                  num_levels  :int   = 3,
                  num_points  :int   = 4,
-                 mlp_ratio   :float = 4.0,
+                 ffn_dim     :int   = 1024,
                  dropout     :float = 0.1,
                  act_type    :str   = "relu",
                  ):
@@ -349,7 +349,7 @@ class DeformableTransformerDecoderLayer(nn.Module):
         self.num_heads = num_heads
         self.num_levels = num_levels
         self.num_points = num_points
-        self.mlp_ratio = mlp_ratio
+        self.ffn_dim = ffn_dim
         self.dropout = dropout
         self.act_type = act_type
         # ---------------- Network parameters ----------------
@@ -362,7 +362,7 @@ class DeformableTransformerDecoderLayer(nn.Module):
         self.dropout2 = nn.Dropout(dropout)
         self.norm2 = nn.LayerNorm(d_model)
         ## FFN
-        self.ffn = FFN(d_model, mlp_ratio, dropout, act_type)
+        self.ffn = FFN(d_model, ffn_dim, dropout, act_type)
 
     def with_pos_embed(self, tensor, pos):
         return tensor if pos is None else tensor + pos
@@ -403,7 +403,7 @@ class DeformableTransformerDecoder(nn.Module):
                  num_layers     :int   = 1,
                  num_levels     :int   = 3,
                  num_points     :int   = 4,
-                 mlp_ratio      :float = 4.0,
+                 ffn_dim        :int   = 1024,
                  dropout        :float = 0.1,
                  act_type       :str   = "relu",
                  return_intermediate :bool = False,
@@ -413,13 +413,13 @@ class DeformableTransformerDecoder(nn.Module):
         self.d_model = d_model
         self.num_heads = num_heads
         self.num_layers = num_layers
-        self.mlp_ratio = mlp_ratio
+        self.ffn_dim = ffn_dim
         self.dropout = dropout
         self.act_type = act_type
         self.pos_embed = None
         # ----------- Network parameters -----------
         self.decoder_layers = get_clones(
-            DeformableTransformerDecoderLayer(d_model, num_heads, num_levels, num_points, mlp_ratio, dropout, act_type), num_layers)
+            DeformableTransformerDecoderLayer(d_model, num_heads, num_levels, num_points, ffn_dim, dropout, act_type), num_layers)
         self.num_layers = num_layers
         self.return_intermediate = return_intermediate
 
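
For readers more used to the stock PyTorch layers, the renamed argument plays the role of dim_feedforward there; this is only a comparison, since RT-DETR keeps its own encoder/decoder layer implementations:

    import torch.nn as nn

    # torch's built-in encoder layer calls the same quantity dim_feedforward,
    # i.e. what this commit now passes as ffn_dim instead of deriving it
    # from d_model * mlp_ratio.
    layer = nn.TransformerEncoderLayer(d_model=256, nhead=8,
                                       dim_feedforward=1024,   # == 'en_ffn_dim'
                                       dropout=0.0, activation='gelu',
                                       batch_first=True)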

+ 6 - 11
models/detectors/rtdetr/rtdetr.py

@@ -29,8 +29,6 @@ class RT_DETR(nn.Module):
         self.num_classes = num_classes
         self.num_topk = topk
         self.deploy = deploy
-        # scale hidden channels by width_factor
-        cfg['hidden_dim'] = round(cfg['hidden_dim'] * cfg['width'])
         ## Post-process parameters
         self.use_nms = use_nms
         self.nms_thresh = nms_thresh
@@ -145,14 +143,11 @@ if __name__ == '__main__':
 
     # Model config
     cfg = {
-        'width': 1.0,
-        'depth': 1.0,
-        'out_stride': [8, 16, 32],
         # Image Encoder - Backbone
-        'backbone': 'resnet50',
+        'backbone': 'resnet101',
         'backbone_norm': 'BN',
         'res5_dilation': False,
-        'pretrained': True,
+        'pretrained': False,
         'pretrained_weight': 'imagenet1k_v1',
         'freeze_at': 0,
         'freeze_stem_only': False,
@@ -160,22 +155,22 @@ if __name__ == '__main__':
         'max_stride': 32,
         # Image Encoder - FPN
         'fpn': 'hybrid_encoder',
+        'fpn_num_blocks': 4,
         'fpn_act': 'silu',
         'fpn_norm': 'BN',
         'fpn_depthwise': False,
-        'hidden_dim': 256,
+        'hidden_dim': 384,
         'en_num_heads': 8,
         'en_num_layers': 1,
-        'en_mlp_ratio': 4.0,
+        'en_ffn_dim': 2048,
         'en_dropout': 0.0,
         'pe_temperature': 10000.,
         'en_act': 'gelu',
         # Transformer Decoder
         'transformer': 'rtdetr_transformer',
-        'hidden_dim': 256,
         'de_num_heads': 8,
         'de_num_layers': 6,
-        'de_mlp_ratio': 4.0,
+        'de_ffn_dim': 2048,
         'de_dropout': 0.0,
         'de_act': 'gelu',
         'de_num_points': 4,
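
Note that the removed line in RT_DETR.__init__ used to rescale the hidden width at construction time (cfg['hidden_dim'] = round(cfg['hidden_dim'] * cfg['width'])); with 'width' gone, the config now carries the final value directly, e.g. 256 for rtdetr_r18/r50 and 384 for the new rtdetr_r101 entry. A trivial check of the equivalence at the old default scale:

    # Old: width-scaled at runtime (width was 1.0 for every released config).
    assert round(256 * 1.0) == 256   # identical to the explicit 'hidden_dim' kept in the config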

+ 5 - 5
models/detectors/rtdetr/rtdetr_decoder.py

@@ -27,7 +27,7 @@ def build_transformer(cfg, in_dims, num_classes, return_intermediate=False):
                                  num_layers          = cfg['de_num_layers'],
                                  num_levels          = len(cfg['out_stride']),
                                  num_points          = cfg['de_num_points'],
-                                 mlp_ratio           = cfg['de_mlp_ratio'],
+                                 ffn_dim             = cfg['de_ffn_dim'],
                                  dropout             = cfg['de_dropout'],
                                  act_type            = cfg['de_act'],
                                  return_intermediate = return_intermediate,
@@ -54,7 +54,7 @@ class RTDETRTransformer(nn.Module):
                  num_layers     :int   = 1,
                  num_levels     :int   = 3,
                  num_points     :int   = 4,
-                 mlp_ratio      :float = 4.0,
+                 ffn_dim        :int   = 1024,
                  dropout        :float = 0.1,
                  act_type       :str   = "relu",
                  return_intermediate :bool = False,
@@ -80,7 +80,7 @@ class RTDETRTransformer(nn.Module):
         self.num_layers = num_layers
         self.num_levels = num_levels
         self.num_points = num_points
-        self.mlp_ratio  = mlp_ratio
+        self.ffn_dim    = ffn_dim
         self.dropout    = dropout
         self.act_type   = act_type
         self.return_intermediate = return_intermediate
@@ -104,7 +104,7 @@ class RTDETRTransformer(nn.Module):
                                     num_layers = num_layers,
                                     num_levels = num_levels,
                                     num_points = num_points,
-                                    mlp_ratio  = mlp_ratio,
+                                    ffn_dim    = ffn_dim,
                                     dropout    = dropout,
                                     act_type   = act_type,
                                     return_intermediate = return_intermediate
@@ -335,7 +335,7 @@ if __name__ == '__main__':
         'hidden_dim': 256,
         'de_num_heads': 8,
         'de_num_layers': 6,
-        'de_mlp_ratio': 4.0,
+        'de_ffn_dim': 1024,
         'de_dropout': 0.1,
         'de_act': 'gelu',
         'de_num_points': 4,