yjh0410 1 year ago
parent
commit
447338bed9

+ 0 - 121
config/model_config/rtdetr_config.py

@@ -31,7 +31,6 @@ rtdetr_cfg = {
         'en_act': 'gelu',
         # Transformer Decoder
         'transformer': 'rtdetr_transformer',
-        'hidden_dim': 256,
         'de_num_heads': 8,
         'de_num_layers': 3,
         'de_mlp_ratio': 4.0,
@@ -89,126 +88,6 @@ rtdetr_cfg = {
         'en_act': 'gelu',
         # Transformer Decoder
         'transformer': 'rtdetr_transformer',
-        'hidden_dim': 256,
-        'de_num_heads': 8,
-        'de_num_layers': 6,
-        'de_mlp_ratio': 4.0,
-        'de_dropout': 0.0,
-        'de_act': 'relu',
-        'de_num_points': 4,
-        'num_queries': 300,
-        'learnt_init_query': False,
-        'pe_temperature': 10000.,
-        'dn_num_denoising': 100,
-        'dn_label_noise_ratio': 0.5,
-        'dn_box_noise_scale': 1,
-        # Head
-        'det_head': 'dino_head',
-        # ---------------- Assignment config ----------------
-        'matcher_hpy': {'cost_class': 2.0,
-                        'cost_bbox': 5.0,
-                        'cost_giou': 2.0,},
-        # ---------------- Loss config ----------------
-        'use_vfl': True,
-        'loss_coeff': {'class': 1,
-                       'bbox': 5,
-                       'giou': 2,},
-        # ---------------- Train config ----------------
-        ## input
-        'multi_scale': [0.5, 1.25],   # 320 -> 800
-        'trans_type': 'rtdetr_base',
-        # ---------------- Train config ----------------
-        'trainer_type': 'rtdetr',
-    },
-
-    'rtdetr_r101':{
-        # ---------------- Model config ----------------
-        ## Model scale
-        'width': 1.0,
-        'depth': 1.0,
-        ## Image Encoder - Backbone
-        'backbone': 'resnet101',
-        'backbone_norm': 'FrozeBN',
-        'pretrained': True,
-        'pretrained_weight': 'imagenet1k_v1',
-        'freeze_at': 0,
-        'freeze_stem_only': False,
-        'out_stride': [8, 16, 32],
-        'max_stride': 32,
-        ## Image Encoder - FPN
-        'fpn': 'hybrid_encoder',
-        'fpn_act': 'silu',
-        'fpn_norm': 'BN',
-        'fpn_depthwise': False,
-        'hidden_dim': 256,
-        'en_num_heads': 8,
-        'en_num_layers': 1,
-        'en_mlp_ratio': 4.0,
-        'en_dropout': 0.0,
-        'pe_temperature': 10000.,
-        'en_act': 'gelu',
-        # Transformer Decoder
-        'transformer': 'rtdetr_transformer',
-        'hidden_dim': 256,
-        'de_num_heads': 8,
-        'de_num_layers': 6,
-        'de_mlp_ratio': 4.0,
-        'de_dropout': 0.0,
-        'de_act': 'relu',
-        'de_num_points': 4,
-        'num_queries': 300,
-        'learnt_init_query': False,
-        'pe_temperature': 10000.,
-        'dn_num_denoising': 100,
-        'dn_label_noise_ratio': 0.5,
-        'dn_box_noise_scale': 1,
-        # Head
-        'det_head': 'dino_head',
-        # ---------------- Assignment config ----------------
-        'matcher_hpy': {'cost_class': 2.0,
-                        'cost_bbox': 5.0,
-                        'cost_giou': 2.0,},
-        # ---------------- Loss config ----------------
-        'use_vfl': True,
-        'loss_coeff': {'class': 1,
-                       'bbox': 5,
-                       'giou': 2,},
-        # ---------------- Train config ----------------
-        ## input
-        'multi_scale': [0.5, 1.25],   # 320 -> 800
-        'trans_type': 'rtdetr_base',
-        # ---------------- Train config ----------------
-        'trainer_type': 'rtdetr',
-    },
-
-    # Below RT-DETR is not complete
-    'rtdetr_l':{
-        # ---------------- Model config ----------------
-        ## Model scale
-        'width': 1.0,
-        'depth': 1.0,
-        ## Image Encoder - Backbone
-        'backbone': 'rtcnet_l',
-        'pretrained': True,
-        'freeze_at': 0,
-        'freeze_stem_only': False,
-        'out_stride': [8, 16, 32],
-        'max_stride': 32,
-        ## Image Encoder - FPN
-        'fpn': 'hybrid_encoder',
-        'fpn_act': 'silu',
-        'fpn_norm': 'BN',
-        'fpn_depthwise': False,
-        'hidden_dim': 256,
-        'en_num_heads': 8,
-        'en_num_layers': 1,
-        'en_mlp_ratio': 4.0,
-        'en_dropout': 0.0,
-        'pe_temperature': 10000.,
-        'en_act': 'gelu',
-        # Transformer Decoder
-        'transformer': 'rtdetr_transformer',
-        'hidden_dim': 256,
         'de_num_heads': 8,
         'de_num_layers': 6,
         'de_mlp_ratio': 4.0,

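Note on the config deletions above: besides dropping the unfinished rtdetr_r101 and rtdetr_l entries, the commit removes a 'hidden_dim': 256 key that each config dict carried twice — once in the FPN section and once again in the decoder section (the removed rtdetr_r101 block above still shows both occurrences). Python dict literals accept duplicate keys and silently keep the last one, so the duplicate was harmless at runtime but misleading to read. A minimal standalone sketch (not repo code) of that behavior:

# A repeated key in a dict literal is silently collapsed to its last occurrence.
cfg = {
    'hidden_dim': 256,   # FPN section
    'de_num_heads': 8,
    'hidden_dim': 256,   # decoder section -- shadows the entry above
}
print(cfg)        # {'hidden_dim': 256, 'de_num_heads': 8}
print(len(cfg))   # 2, not 3

Since both occurrences carried the same value here, dropping one changes nothing at runtime; it only removes the ambiguity.
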
+ 1 - 104
models/detectors/rtdetr/basic_modules/basic.py

@@ -1,6 +1,7 @@
 import numpy as np
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 
 
 # ---------------------------- NMS ----------------------------
@@ -241,7 +242,6 @@ class PointwiseConv(nn.Module):
         return self.act(self.norm(self.conv(x)))
 
 
-
 # ----------------- CNN Modules -----------------
 class Bottleneck(nn.Module):
     def __init__(self,
@@ -303,106 +303,3 @@ class RTCBlock(nn.Module):
         out = self.output_proj(torch.cat(out, dim=1))
 
         return out
-
-class RepVggBlock(nn.Module):
-    def __init__(self, in_dim, out_dim, act_type='relu', norm_type='BN', alpha=False):
-        super(RepVggBlock, self).__init__()
-        self.in_dim = in_dim
-        self.out_dim = out_dim
-        self.conv1 = BasicConv(in_dim, out_dim, kernel_size=3, padding=1, act_type=None, norm_type=norm_type)
-        self.conv2 = BasicConv(in_dim, out_dim, kernel_size=3, padding=1, act_type=None, norm_type=norm_type)
-        self.act = get_activation(act_type)
-
-        if alpha:
-            self.alpha = nn.Parameter(torch.as_tensor([1.0]).float())
-        else:
-            self.alpha = None
-
-    def forward(self, x):
-        if hasattr(self, 'conv'):
-            y = self.conv(x)
-        else:
-            if self.alpha:
-                y = self.conv1(x) + self.alpha * self.conv2(x)
-            else:
-                y = self.conv1(x) + self.conv2(x)
-        y = self.act(y)
-        return y
-
-    def convert_to_deploy(self):
-        if not hasattr(self, 'conv'):
-            self.conv = nn.Conv2d(
-                self.in_dim,
-                self.out_dim,
-                kernel_size=3,
-                stride=1,
-                padding=1,
-                groups=1)
-        kernel, bias = self.get_equivalent_kernel_bias()
-        # self.conv.weight.set_value(kernel)
-        # self.conv.bias.set_value(bias)
-        self.conv.weight.data = kernel
-        self.conv.bias.data = bias
-        self.__delattr__('conv1')
-        self.__delattr__('conv2')
-
-    def get_equivalent_kernel_bias(self):
-        kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1)
-        kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2)
-        if self.alpha:
-            return kernel3x3 + self.alpha * self._pad_1x1_to_3x3_tensor(
-                kernel1x1), bias3x3 + self.alpha * bias1x1
-        else:
-            return kernel3x3 + self._pad_1x1_to_3x3_tensor(
-                kernel1x1), bias3x3 + bias1x1
-
-    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
-        if kernel1x1 is None:
-            return 0
-        else:
-            return nn.functional.pad(kernel1x1, [1, 1, 1, 1])
-
-    def _fuse_bn_tensor(self, branch):
-        if branch is None:
-            return 0, 0
-        kernel = branch.conv.weight
-        running_mean = branch.bn._mean
-        running_var = branch.bn._variance
-        gamma = branch.bn.weight
-        beta = branch.bn.bias
-        eps = branch.bn._epsilon
-        std = (running_var + eps).sqrt()
-        t = (gamma / std).reshape((-1, 1, 1, 1))
-
-        return kernel * t, beta - running_mean * gamma / std
-
-class CSPRepLayer(nn.Module):
-    def __init__(self,
-                 in_dim     :int,
-                 out_dim    :int,
-                 num_blocks :int   = 3,
-                 expansion  :float = 1.0,
-                 act_type   :str   ="silu",
-                 norm_type  :str   = 'BN'):
-        super(CSPRepLayer, self).__init__()
-        hidden_dim = int(out_dim * expansion)
-        self.conv1 = BasicConv(
-            in_dim, hidden_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
-        self.conv2 = BasicConv(
-            in_dim, hidden_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
-        self.bottlenecks = nn.Sequential(*[
-            RepVggBlock(
-                hidden_dim, hidden_dim, act_type=act_type, norm_type=norm_type)
-            for _ in range(num_blocks)
-        ])
-        if hidden_dim != out_dim:
-            self.conv3 = BasicConv(hidden_dim, out_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
-        else:
-            self.conv3 = nn.Identity()
-
-    def forward(self, x):
-        x_1 = self.conv1(x)
-        x_1 = self.bottlenecks(x_1)
-        x_2 = self.conv2(x)
-
-        return self.conv3(x_1 + x_2)

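On the basic.py deletions: RepVggBlock and CSPRepLayer read like a direct port from the PaddlePaddle RT-DETR code, and the port looks incomplete. _fuse_bn_tensor accesses branch.bn._mean, branch.bn._variance, and branch.bn._epsilon — Paddle attribute names; PyTorch's nn.BatchNorm2d exposes running_mean, running_var, and eps instead, so convert_to_deploy would raise AttributeError here (the commented-out set_value() calls are another Paddle leftover). The deploy path had a second problem: both branches are built as 3x3 convs, yet the second kernel is padded as if it were 1x1, which would produce a 5x5 tensor and a shape mismatch. The underlying Conv+BN fusion math is standard, though; a minimal, hedged PyTorch sketch assuming frozen (eval-mode) BatchNorm:

import torch
import torch.nn as nn

@torch.no_grad()
def fuse_conv_bn(conv: nn.Conv2d, bn: nn.BatchNorm2d):
    # y = gamma * (conv(x) - mean) / sqrt(var + eps) + beta collapses into a
    # single conv with W' = W * gamma/std and b' = beta - mean * gamma/std,
    # where std = sqrt(var + eps).
    std = (bn.running_var + bn.eps).sqrt()
    scale = bn.weight / std
    kernel = conv.weight * scale.reshape(-1, 1, 1, 1)
    bias = bn.bias - bn.running_mean * scale
    if conv.bias is not None:  # fold an existing conv bias through BN too
        bias = bias + conv.bias * scale
    return kernel, bias

# Quick equivalence check.
conv = nn.Conv2d(8, 8, kernel_size=3, padding=1, bias=False)
bn = nn.BatchNorm2d(8).eval()
k, b = fuse_conv_bn(conv, bn)
fused = nn.Conv2d(8, 8, kernel_size=3, padding=1)
fused.weight.data.copy_(k)
fused.bias.data.copy_(b)
x = torch.randn(1, 8, 16, 16)
print(torch.allclose(bn(conv(x)), fused(x), atol=1e-6))  # True

In the RepVGG scheme proper, each branch is fused this way first, the 1x1 branch's kernel is zero-padded to 3x3, and the kernels and biases are summed into the single deploy conv.
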
+ 0 - 141
models/detectors/rtdetr/basic_modules/fpn.py

@@ -27,21 +27,6 @@ def build_fpn(cfg, in_dims, out_dim):
                              pe_temperature = cfg['pe_temperature'],
                              en_act_type    = cfg['en_act'],
                              )
-    elif cfg['fpn'] == 'pp_hybrid_encoder':
-        return PPHybridEncoder(in_dims     = in_dims,
-                               out_dim     = out_dim,
-                               depth       = cfg['depth'],
-                               expansion   = cfg['expansion'],
-                               act_type    = cfg['fpn_act'],
-                               norm_type   = cfg['fpn_norm'],
-                               depthwise   = cfg['fpn_depthwise'],
-                               num_heads   = cfg['en_num_heads'],
-                               num_layers  = cfg['en_num_layers'],
-                               mlp_ratio   = cfg['en_mlp_ratio'],
-                               dropout     = cfg['en_dropout'],
-                               pe_temperature = cfg['pe_temperature'],
-                               en_act_type    = cfg['en_act'],
-                               )
     else:
         raise NotImplementedError("Unknown PaFPN: <{}>".format(cfg['fpn']))
 
@@ -176,132 +161,6 @@ class HybridEncoder(nn.Module):
         return out_feats
 
 
-## PaddlePaddle Hybrid Encoder (Transformer encoder + Convolutional PaFPN)
-class PPHybridEncoder(nn.Module):
-    def __init__(self, 
-                 in_dims     :List  = [256, 512, 1024],
-                 out_dim     :int   = 256,
-                 depth       :float = 1.0,
-                 expansion   :float = 1.0,
-                 act_type    :str   = 'silu',
-                 norm_type   :str   = 'BN',
-                 depthwise   :bool  = False,
-                 # Transformer's parameters
-                 num_heads      :int   = 8,
-                 num_layers     :int   = 1,
-                 mlp_ratio      :float = 4.0,
-                 dropout        :float = 0.1,
-                 pe_temperature :float = 10000.,
-                 en_act_type    :str   = 'gelu'
-                 ) -> None:
-        super(PPHybridEncoder, self).__init__()
-        print('==============================')
-        print('FPN: {}'.format("RTC-PaFPN"))
-        # ---------------- Basic parameters ----------------
-        self.in_dims = in_dims
-        self.out_dim = out_dim
-        self.out_dims = [self.out_dim] * len(in_dims)
-        self.depth = depth
-        self.num_heads = num_heads
-        self.num_layers = num_layers
-        self.mlp_ratio = mlp_ratio
-        c3, c4, c5 = in_dims
-
-        # ---------------- Input projs ----------------
-        self.reduce_layer_1 = BasicConv(c5, self.out_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
-        self.reduce_layer_2 = BasicConv(c4, self.out_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
-        self.reduce_layer_3 = BasicConv(c3, self.out_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
-
-        # ---------------- Downsample ----------------
-        self.dowmsample_layer_1 = BasicConv(self.out_dim, self.out_dim, kernel_size=3, padding=1, stride=2, act_type=act_type, norm_type=norm_type)
-        self.dowmsample_layer_2 = BasicConv(self.out_dim, self.out_dim, kernel_size=3, padding=1, stride=2, act_type=act_type, norm_type=norm_type)
-
-        # ---------------- Transformer Encoder ----------------
-        self.transformer_encoder = TransformerEncoder(d_model        = self.out_dim,
-                                                      num_heads      = num_heads,
-                                                      num_layers     = num_layers,
-                                                      mlp_ratio      = mlp_ratio,
-                                                      pe_temperature = pe_temperature,
-                                                      dropout        = dropout,
-                                                      act_type       = en_act_type
-                                                      )
-
-        # ---------------- Top down FPN ----------------
-        ## P5 -> P4
-        self.top_down_layer_1 = CSPRepLayer(in_dim       = self.out_dim * 2,
-                                            out_dim      = self.out_dim,
-                                            num_blocks   = round(3*depth),
-                                            expansion    = expansion,
-                                            act_type     = act_type,
-                                            norm_type    = norm_type,
-                                            )
-        ## P4 -> P3
-        self.top_down_layer_2 = CSPRepLayer(in_dim       = self.out_dim * 2,
-                                            out_dim      = self.out_dim,
-                                            num_blocks   = round(3*depth),
-                                            expansion    = expansion,
-                                            act_type     = act_type,
-                                            norm_type    = norm_type,
-                                            )
-        
-        # ---------------- Bottom up PAN----------------
-        ## P3 -> P4
-        self.bottom_up_layer_1 = CSPRepLayer(in_dim       = self.out_dim * 2,
-                                             out_dim      = self.out_dim,
-                                             num_blocks   = round(3*depth),
-                                             expansion    = expansion,
-                                             act_type     = act_type,
-                                             norm_type    = norm_type,
-                                             )
-        ## P4 -> P5
-        self.bottom_up_layer_2 = CSPRepLayer(in_dim       = self.out_dim * 2,
-                                             out_dim      = self.out_dim,
-                                             num_blocks   = round(3*depth),
-                                             expansion    = expansion,
-                                             act_type     = act_type,
-                                             norm_type    = norm_type,
-                                             )
-
-        self.init_weights()
-  
-    def init_weights(self):
-        """Initialize the parameters."""
-        for m in self.modules():
-            if isinstance(m, torch.nn.Conv2d):
-                # In order to be consistent with the source code,
-                # reset the Conv2d initialization parameters
-                m.reset_parameters()
-
-    def forward(self, features):
-        c3, c4, c5 = features
-
-        # -------- Input projs --------
-        p5 = self.reduce_layer_1(c5)
-        p4 = self.reduce_layer_2(c4)
-        p3 = self.reduce_layer_3(c3)
-
-        # -------- Transformer encoder --------
-        p5 = self.transformer_encoder(p5)
-
-        # -------- Top down FPN --------
-        p5_up = F.interpolate(p5, scale_factor=2.0)
-        p4 = self.top_down_layer_1(torch.cat([p4, p5_up], dim=1))
-
-        p4_up = F.interpolate(p4, scale_factor=2.0)
-        p3 = self.top_down_layer_2(torch.cat([p3, p4_up], dim=1))
-
-        # -------- Bottom up PAN --------
-        p3_ds = self.dowmsample_layer_1(p3)
-        p4 = self.bottom_up_layer_1(torch.cat([p4, p3_ds], dim=1))
-
-        p4_ds = self.dowmsample_layer_2(p4)
-        p5 = self.bottom_up_layer_2(torch.cat([p5, p4_ds], dim=1))
-
-        out_feats = [p3, p4, p5]
-        
-        return out_feats
-
-
 if __name__ == '__main__':
     import time
     from thop import profile

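With PPHybridEncoder removed, build_fpn now constructs only the single HybridEncoder. One detail worth noting when reading the deleted forward pass: F.interpolate(..., scale_factor=2.0) with no mode argument defaults to nearest-neighbor upsampling. The trailing context also shows the file's __main__ benchmark stub (time plus thop); for reference, thop.profile runs one dummy forward and returns (MACs, params). A minimal sketch of that pattern, with a stand-in module since the encoder's constructor arguments are not reproduced here:

import torch
import torch.nn as nn
from thop import profile  # the same helper the __main__ stub imports

# Stand-in module; in the repo this would be the HybridEncoder built
# from the config dict.
model = nn.Conv2d(3, 256, kernel_size=3, padding=1)
x = torch.randn(1, 3, 640, 640)

flops, params = profile(model, inputs=(x,))     # thop counts multiply-accumulates
print('FLOPs : {:.2f} G'.format(flops / 1e9))
print('Params: {:.2f} M'.format(params / 1e6))
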
+ 2 - 2
models/detectors/rtdetr/rtdetr.py

@@ -149,7 +149,7 @@ if __name__ == '__main__':
         'depth': 1.0,
         'out_stride': [8, 16, 32],
         # Image Encoder - Backbone
-        'backbone': 'resnet18',
+        'backbone': 'resnet50',
         'backbone_norm': 'BN',
         'res5_dilation': False,
         'pretrained': True,
@@ -174,7 +174,7 @@ if __name__ == '__main__':
         'transformer': 'rtdetr_transformer',
         'hidden_dim': 256,
         'de_num_heads': 8,
-        'de_num_layers': 3,
+        'de_num_layers': 6,
         'de_mlp_ratio': 4.0,
         'de_dropout': 0.0,
         'de_act': 'gelu',
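These two edits retarget the file's __main__ smoke test from the R18-style setup (resnet18 backbone, 3 decoder layers) to the R50 one (resnet50, 6 decoder layers), matching the rtdetr_r50 entry that survives in rtdetr_config.py. For intuition about what the decoder knobs control, here is an illustrative mapping onto PyTorch's vanilla nn.TransformerDecoder — RT-DETR's real decoder uses deformable attention and denoising queries, so this sketch only mirrors the shapes and hyperparameters:

import torch
import torch.nn as nn

layer = nn.TransformerDecoderLayer(
    d_model=256,           # 'hidden_dim'
    nhead=8,               # 'de_num_heads'
    dim_feedforward=1024,  # 'hidden_dim' * 'de_mlp_ratio' = 256 * 4.0
    dropout=0.0,           # 'de_dropout'
    activation='gelu',     # 'de_act'
    batch_first=True,
)
decoder = nn.TransformerDecoder(layer, num_layers=6)  # 'de_num_layers'

queries = torch.randn(1, 300, 256)  # 'num_queries' object queries
memory = torch.randn(1, 400, 256)   # flattened encoder features, e.g. a 20x20 map
print(decoder(queries, memory).shape)  # torch.Size([1, 300, 256])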