yjh0410 2 years ago
parent
commit
8b21555292

+ 0 - 5
config/__init__.py

@@ -106,8 +106,6 @@ from .model_config.yolov5_config import yolov5_cfg
 from .model_config.yolov7_config import yolov7_cfg
 from .model_config.yolovx_config import yolovx_cfg
 from .model_config.yolox_config import yolox_cfg
-## Real-Time DETR
-from .model_config.rtdetr_config import rtdetr_cfg
 
 
 def build_model_config(args):
@@ -137,9 +135,6 @@ def build_model_config(args):
     # YOLOvx
     elif args.model in ['yolovx_n', 'yolovx_t', 'yolovx_s', 'yolovx_m', 'yolovx_l', 'yolovx_x']:
         cfg = yolovx_cfg[args.model]
-    # RT-DETR
-    elif args.model in ['rtdetr_n', 'rtdetr_s', 'rtdetr_m', 'rtdetr_l', 'rtdetr_x']:
-        cfg = rtdetr_cfg[args.model]
 
     return cfg
 

+ 0 - 69
config/model_config/rtdetr_config.py

@@ -1,69 +0,0 @@
-# RT-DETR config
-
-
-rtdetr_cfg = {
-    # P5
-    'rtdetr_n': {
-        # ---------------- Model config ----------------
-        ## ------- Image Encoder -------
-        ### CNN-Backbone
-        'backbone': 'elannet',
-        'pretrained': False,
-        'bk_act': 'silu',
-        'bk_norm': 'BN',
-        'bk_dpw': False,
-        'width': 0.25,
-        'depth': 0.34,
-        'stride': [8, 16, 32],  # P3, P4, P5
-        'max_stride': 32,
-        ### CNN-Neck
-        'neck': 'sppf',
-        'neck_expand_ratio': 0.5,
-        'pooling_size': 5,
-        'neck_act': 'silu',
-        'neck_norm': 'BN',
-        'neck_depthwise': False,
-        ### CNN-CSFM
-        'fpn': 'yolovx_pafpn',
-        'fpn_reduce_layer': 'conv',
-        'fpn_downsample_layer': 'conv',
-        'fpn_core_block': 'elanblock',
-        'fpn_act': 'silu',
-        'fpn_norm': 'BN',
-        'fpn_depthwise': False,
-        ## ------- Memory Decoder -------
-        'num_compressed': 300,
-        'com_dim_feedforward': 1024,
-        'com_num_heads': 8,
-        'com_dropout': 0.1,
-        'com_act': 'silu',
-        ## ------- Transformer Decoder -------
-        'd_model': 256,
-        'attn_type': 'mhsa',
-        'num_decoder_layers': 6,
-        'num_queries': 300,
-        'de_dim_feedforward': 1024,
-        'de_num_heads': 8,
-        'de_dropout': 0.1,
-        'de_act': 'silu',
-        'de_norm': 'LN',
-        # ---------------- Train config ----------------
-        ## input
-        'multi_scale': [0.5, 1.0],   # 320 -> 640
-        'trans_type': 'yolov5_nano',
-        # ---------------- Assignment config ----------------
-        ## matcher
-        'set_cost_class': 2.0,
-        'set_cost_bbox': 5.0,
-        'set_cost_giou': 2.0,
-        # ---------------- Loss config ----------------
-        ## loss weight
-        'focal_alpha': 0.25,
-        'loss_cls_weight': 1.0,
-        'loss_box_weight': 5.0,
-        'loss_giou_weight': 2.0,
-        # ---------------- Train config ----------------
-        'trainer_type': 'detr',
-        },
-
-}

+ 0 - 6
models/detectors/__init__.py

@@ -12,8 +12,6 @@ from .yolov7.build import build_yolov7
 from .yolovx.build import build_yolovx
 # My custom YOLO
 from .yolox.build import build_yolox
-# Real-time DETR
-from .rtdetr.build import build_rtdetr
 
 
 # build object detector
@@ -55,10 +53,6 @@ def build_model(args,
     elif args.model in ['yolox_n', 'yolox_s', 'yolox_m', 'yolox_l', 'yolox_x']:
         model, criterion = build_yolox(
             args, model_cfg, device, num_classes, trainable, deploy)
-    # RT-DETR
-    elif args.model in ['rtdetr_n', 'rtdetr_s', 'rtdetr_m', 'rtdetr_l', 'rtdetr_x']:
-        model, criterion = build_rtdetr(
-            args, model_cfg, device, num_classes, trainable, deploy)
 
 
     if trainable:

+ 0 - 8
models/detectors/rtdetr/README.md

@@ -1,8 +0,0 @@
-# Redesigned RT-DETR:
-
-| Model     | Scale | Batch | AP<sup>val<br>0.5:0.95 | AP<sup>val<br>0.5 | FLOPs<br><sup>(G) | Params<br><sup>(M) | Weight |
-|-----------|-------|-------|------------------------|-------------------|-------------------|--------------------|--------|
-| RT-DETR-N |  640  |       |                        |                   |                   |                    |  |
-| RT-DETR-S |  640  |       |                        |                   |                   |                    |  |
-| RT-DETR-M |  640  |       |                        |                   |                   |                    |  |
-| RT-DETR-L |  640  |       |                        |                   |                   |                    |  |

+ 0 - 33
models/detectors/rtdetr/build.py

@@ -1,33 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding:utf-8 -*-
-
-from .loss import build_criterion
-from .rtdetr import RTDETR
-
-
-# build object detector
-def build_rtdetr(args, cfg, device, num_classes=80, trainable=False, deploy=False):
-    print('==============================')
-    print('Build {} ...'.format(args.model.upper()))
-    
-    print('==============================')
-    print('Model Configuration: \n', cfg)
-    
-    # -------------- Build rtdetr --------------
-    model = RTDETR(
-        cfg=cfg,
-        device=device, 
-        num_classes=num_classes,
-        trainable=trainable,
-        aux_loss=trainable,
-        with_box_refine=True,
-        deploy=deploy
-        )
-
-    # -------------- Build criterion --------------
-    criterion = None
-    if trainable:
-        # build criterion for training
-        criterion = build_criterion(cfg, num_classes, aux_loss=True)
-        
-    return model, criterion

+ 0 - 154
models/detectors/rtdetr/image_encoder/cnn_backbone.py

@@ -1,154 +0,0 @@
-import torch
-import torch.nn as nn
-try:
-    from .cnn_basic import Conv, ELANBlock, DownSample
-except:
-    from cnn_basic import Conv, ELANBlock, DownSample
-
-
-
-model_urls = {
-    'elannet_pico': "https://github.com/yjh0410/image_classification_pytorch/releases/download/weight/elannet_pico.pth",
-    'elannet_nano': "https://github.com/yjh0410/image_classification_pytorch/releases/download/weight/elannet_nano.pth",
-    'elannet_small': "https://github.com/yjh0410/image_classification_pytorch/releases/download/weight/elannet_small.pth",
-    'elannet_medium': "https://github.com/yjh0410/image_classification_pytorch/releases/download/weight/elannet_medium.pth",
-    'elannet_large': "https://github.com/yjh0410/image_classification_pytorch/releases/download/weight/elannet_large.pth",
-    'elannet_huge': "https://github.com/yjh0410/image_classification_pytorch/releases/download/weight/elannet_huge.pth",
-}
-
-
-# ---------------------------- Backbones ----------------------------
-# ELANNet-P5
-class ELANNet(nn.Module):
-    def __init__(self, width=1.0, depth=1.0, act_type='silu', norm_type='BN', depthwise=False):
-        super(ELANNet, self).__init__()
-        self.feat_dims = [int(512 * width), int(1024 * width), int(1024 * width)]
-        
-        # P1/2
-        self.layer_1 = nn.Sequential(
-            Conv(3, int(64*width), k=3, p=1, s=2, act_type=act_type, norm_type=norm_type),
-            Conv(int(64*width), int(64*width), k=3, p=1, act_type=act_type, norm_type=norm_type, depthwise=depthwise)
-        )
-        # P2/4
-        self.layer_2 = nn.Sequential(   
-            Conv(int(64*width), int(128*width), k=3, p=1, s=2, act_type=act_type, norm_type=norm_type, depthwise=depthwise),             
-            ELANBlock(in_dim=int(128*width), out_dim=int(256*width), expand_ratio=0.5, depth=depth,
-                      act_type=act_type, norm_type=norm_type, depthwise=depthwise)
-        )
-        # P3/8
-        self.layer_3 = nn.Sequential(
-            DownSample(in_dim=int(256*width), out_dim=int(256*width), act_type=act_type, norm_type=norm_type),             
-            ELANBlock(in_dim=int(256*width), out_dim=int(512*width), expand_ratio=0.5, depth=depth,
-                      act_type=act_type, norm_type=norm_type, depthwise=depthwise)
-        )
-        # P4/16
-        self.layer_4 = nn.Sequential(
-            DownSample(in_dim=int(512*width), out_dim=int(512*width), act_type=act_type, norm_type=norm_type),             
-            ELANBlock(in_dim=int(512*width), out_dim=int(1024*width), expand_ratio=0.5, depth=depth,
-                      act_type=act_type, norm_type=norm_type, depthwise=depthwise)
-        )
-        # P5/32
-        self.layer_5 = nn.Sequential(
-            DownSample(in_dim=int(1024*width), out_dim=int(1024*width), act_type=act_type, norm_type=norm_type),             
-            ELANBlock(in_dim=int(1024*width), out_dim=int(1024*width), expand_ratio=0.25, depth=depth,
-                    act_type=act_type, norm_type=norm_type, depthwise=depthwise)
-        )
-
-
-    def forward(self, x):
-        c1 = self.layer_1(x)
-        c2 = self.layer_2(c1)
-        c3 = self.layer_3(c2)
-        c4 = self.layer_4(c3)
-        c5 = self.layer_5(c4)
-
-        outputs = [c3, c4, c5]
-
-        return outputs
-
-
-# ---------------------------- Functions ----------------------------
-## load pretrained weight
-def load_weight(model, model_name):
-    # load weight
-    print('Loading pretrained weight ...')
-    url = model_urls[model_name]
-    if url is not None:
-        checkpoint = torch.hub.load_state_dict_from_url(
-            url=url, map_location="cpu", check_hash=True)
-        # checkpoint state dict
-        checkpoint_state_dict = checkpoint.pop("model")
-        # model state dict
-        model_state_dict = model.state_dict()
-        # check
-        for k in list(checkpoint_state_dict.keys()):
-            if k in model_state_dict:
-                shape_model = tuple(model_state_dict[k].shape)
-                shape_checkpoint = tuple(checkpoint_state_dict[k].shape)
-                if shape_model != shape_checkpoint:
-                    checkpoint_state_dict.pop(k)
-            else:
-                checkpoint_state_dict.pop(k)
-                print(k)
-
-        model.load_state_dict(checkpoint_state_dict)
-    else:
-        print('No pretrained for {}'.format(model_name))
-
-    return model
-
-
-## build ELAN-Net
-def build_backbone(cfg, pretrained=False): 
-    # model
-    backbone = ELANNet(
-        width=cfg['width'],
-        depth=cfg['depth'],
-        act_type=cfg['bk_act'],
-        norm_type=cfg['bk_norm'],
-        depthwise=cfg['bk_dpw']
-        )
-    # check whether to load imagenet pretrained weight
-    if pretrained:
-        if cfg['width'] == 0.25 and cfg['depth'] == 0.34 and cfg['bk_dpw']:
-            backbone = load_weight(backbone, model_name='elannet_pico')
-        elif cfg['width'] == 0.25 and cfg['depth'] == 0.34:
-            backbone = load_weight(backbone, model_name='elannet_nano')
-        elif cfg['width'] == 0.5 and cfg['depth'] == 0.34:
-            backbone = load_weight(backbone, model_name='elannet_small')
-        elif cfg['width'] == 0.75 and cfg['depth'] == 0.67:
-            backbone = load_weight(backbone, model_name='elannet_medium')
-        elif cfg['width'] == 1.0 and cfg['depth'] == 1.0:
-            backbone = load_weight(backbone, model_name='elannet_large')
-        elif cfg['width'] == 1.25 and cfg['depth'] == 1.34:
-            backbone = load_weight(backbone, model_name='elannet_huge')
-    feat_dims = backbone.feat_dims
-
-    return backbone, feat_dims
-
-
-if __name__ == '__main__':
-    import time
-    from thop import profile
-    cfg = {
-        'pretrained': True,
-        'bk_act': 'silu',
-        'bk_norm': 'BN',
-        'bk_dpw': True,
-        'width': 0.25,
-        'depth': 0.34,
-    }
-    model, feats = build_backbone(cfg)
-    x = torch.randn(1, 3, 640, 640)
-    t0 = time.time()
-    outputs = model(x)
-    t1 = time.time()
-    print('Time: ', t1 - t0)
-    for out in outputs:
-        print(out.shape)
-
-    print('==============================')
-    flops, params = profile(model, inputs=(x, ), verbose=False)
-    print('==============================')
-    print('GFLOPs : {:.2f}'.format(flops / 1e9 * 2))
-    print('Params : {:.2f} M'.format(params / 1e6))
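
For reference, a minimal sketch (not part of the removed file) of how the width multiplier scales the P3/P4/P5 output channels reported in ELANNet.feat_dims; the width values are the ones matched against in build_backbone.

# Illustrative only: channel widths produced by the int(512 * width) /
# int(1024 * width) scaling in ELANNet.feat_dims for common width settings.
for width in (0.25, 0.5, 0.75, 1.0, 1.25):
    feat_dims = [int(512 * width), int(1024 * width), int(1024 * width)]
    print(f"width={width}: P3/P4/P5 dims = {feat_dims}")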

+ 0 - 174
models/detectors/rtdetr/image_encoder/cnn_basic.py

@@ -1,174 +0,0 @@
-import torch
-import torch.nn as nn
-
-
-# ------------------------------- Basic Modules -------------------------------
-class SiLU(nn.Module):
-    """export-friendly version of nn.SiLU()"""
-
-    @staticmethod
-    def forward(x):
-        return x * torch.sigmoid(x)
-
-
-def get_conv2d(c1, c2, k, p, s, d, g, bias=False):
-    conv = nn.Conv2d(c1, c2, k, stride=s, padding=p, dilation=d, groups=g, bias=bias)
-
-    return conv
-
-
-def get_activation(act_type=None):
-    if act_type == 'relu':
-        return nn.ReLU(inplace=True)
-    elif act_type == 'gelu':
-        return nn.GELU()
-    elif act_type == 'lrelu':
-        return nn.LeakyReLU(0.1, inplace=True)
-    elif act_type == 'mish':
-        return nn.Mish(inplace=True)
-    elif act_type == 'silu':
-        return nn.SiLU(inplace=True)
-
-
-def get_norm(norm_type, dim):
-    if norm_type == 'BN':
-        return nn.BatchNorm2d(dim)
-    elif norm_type == 'GN':
-        return nn.GroupNorm(num_groups=32, num_channels=dim)
-    elif norm_type == 'LN':
-        return nn.LayerNorm(dim)
-
-
-# ------------------------------- Conv -------------------------------
-class Conv(nn.Module):
-    def __init__(self, 
-                 c1,                   # in channels
-                 c2,                   # out channels 
-                 k=1,                  # kernel size 
-                 p=0,                  # padding
-                 s=1,                  # stride
-                 d=1,                  # dilation
-                 act_type='relu',      # activation
-                 norm_type='BN',       # normalization
-                 depthwise=False):
-        super(Conv, self).__init__()
-        convs = []
-        add_bias = False if norm_type else True
-        if depthwise:
-            # depthwise conv
-            convs.append(get_conv2d(c1, c1, k=k, p=p, s=s, d=d, g=c1, bias=add_bias))
-            if norm_type:
-                convs.append(get_norm(norm_type, c1))
-            if act_type:
-                convs.append(get_activation(act_type))
-            # pointwise conv
-            convs.append(get_conv2d(c1, c2, k=1, p=0, s=1, d=d, g=1, bias=add_bias))
-            if norm_type:
-                convs.append(get_norm(norm_type, c2))
-            if act_type:
-                convs.append(get_activation(act_type))
-        else:
-            convs.append(get_conv2d(c1, c2, k=k, p=p, s=s, d=d, g=1, bias=add_bias))
-            if norm_type:
-                convs.append(get_norm(norm_type, c2))
-            if act_type:
-                convs.append(get_activation(act_type))
-            
-        self.convs = nn.Sequential(*convs)
-
-
-    def forward(self, x):
-        return self.convs(x)
-
-
-# ---------------------------- Modified YOLOv7's Modules ----------------------------
-## ELANBlock
-class ELANBlock(nn.Module):
-    def __init__(self, in_dim, out_dim, expand_ratio=0.5, depth=1.0, act_type='silu', norm_type='BN', depthwise=False):
-        super(ELANBlock, self).__init__()
-        if isinstance(expand_ratio, float):
-            inter_dim = int(in_dim * expand_ratio)
-            inter_dim2 = inter_dim
-        elif isinstance(expand_ratio, list):
-            assert len(expand_ratio) == 2
-            e1, e2 = expand_ratio
-            inter_dim = int(in_dim * e1)
-            inter_dim2 = int(inter_dim * e2)
-        # branch-1
-        self.cv1 = Conv(in_dim, inter_dim, k=1, act_type=act_type, norm_type=norm_type)
-        # branch-2
-        self.cv2 = Conv(in_dim, inter_dim, k=1, act_type=act_type, norm_type=norm_type)
-        # branch-3
-        for idx in range(round(3*depth)):
-            if idx == 0:
-                cv3 = [Conv(inter_dim, inter_dim2, k=3, p=1, act_type=act_type, norm_type=norm_type, depthwise=depthwise)]
-            else:
-                cv3.append(Conv(inter_dim2, inter_dim2, k=3, p=1, act_type=act_type, norm_type=norm_type, depthwise=depthwise))
-        self.cv3 = nn.Sequential(*cv3)
-        # branch-4
-        self.cv4 = nn.Sequential(*[
-            Conv(inter_dim2, inter_dim2, k=3, p=1, act_type=act_type, norm_type=norm_type, depthwise=depthwise)
-            for _ in range(round(3*depth))
-        ])
-        # output
-        self.out = Conv(inter_dim*2 + inter_dim2*2, out_dim, k=1, act_type=act_type, norm_type=norm_type)
-
-
-    def forward(self, x):
-        x1 = self.cv1(x)
-        x2 = self.cv2(x)
-        x3 = self.cv3(x2)
-        x4 = self.cv4(x3)
-
-        out = self.out(torch.cat([x1, x2, x3, x4], dim=1))
-
-        return out
-
-## DownSample
-class DownSample(nn.Module):
-    def __init__(self, in_dim, out_dim, act_type='silu', norm_type='BN', depthwise=False):
-        super().__init__()
-        inter_dim = out_dim // 2
-        self.mp = nn.MaxPool2d((2, 2), 2)
-        self.cv1 = Conv(in_dim, inter_dim, k=1, act_type=act_type, norm_type=norm_type)
-        self.cv2 = nn.Sequential(
-            Conv(in_dim, inter_dim, k=1, act_type=act_type, norm_type=norm_type),
-            Conv(inter_dim, inter_dim, k=3, p=1, s=2, act_type=act_type, norm_type=norm_type, depthwise=depthwise)
-        )
-
-    def forward(self, x):
-        x1 = self.cv1(self.mp(x))
-        x2 = self.cv2(x)
-        out = torch.cat([x1, x2], dim=1)
-
-        return out
-
-
-## build core block for CSFM
-def build_fpn_block(cfg, in_dim, out_dim):
-    if cfg['fpn_core_block'] == 'elanblock':
-        layer = ELANBlock(in_dim=in_dim,
-                          out_dim=out_dim,
-                          expand_ratio=[0.5, 0.5],
-                          depth=cfg['depth'],
-                          act_type=cfg['fpn_act'],
-                          norm_type=cfg['fpn_norm'],
-                          depthwise=cfg['fpn_depthwise']
-                          )
-        
-    return layer
-
-## build reduce layer for CSFM
-def build_reduce_layer(cfg, in_dim, out_dim):
-    layer = Conv(in_dim, out_dim, k=1,
-                 act_type=cfg['fpn_act'], norm_type=cfg['fpn_norm'])
-        
-    return layer
-
-## build downsample layer for CSFM
-def build_downsample_layer(cfg, in_dim, out_dim):
-    if cfg['fpn_downsample_layer'] == 'conv':
-        layer = Conv(in_dim, out_dim, k=3, s=2, p=1,
-                     act_type=cfg['fpn_act'], norm_type=cfg['fpn_norm'])
-        
-    return layer
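
As a rough illustration of the depthwise option in the Conv block above, this sketch compares parameter counts of a dense 3x3 convolution against a depthwise 3x3 + pointwise 1x1 pair; the channel sizes are arbitrary examples, not values from the removed config.

# Illustrative parameter-count comparison for the depthwise=True path in Conv.
import torch.nn as nn

c1, c2, k = 128, 256, 3
dense = nn.Conv2d(c1, c2, k, padding=1, bias=False)
separable = nn.Sequential(
    nn.Conv2d(c1, c1, k, padding=1, groups=c1, bias=False),  # depthwise 3x3
    nn.Conv2d(c1, c2, 1, bias=False),                        # pointwise 1x1
)
count = lambda m: sum(p.numel() for p in m.parameters())
print(count(dense), count(separable))  # 294912 vs. 33920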

+ 0 - 70
models/detectors/rtdetr/image_encoder/cnn_neck.py

@@ -1,70 +0,0 @@
-import torch
-import torch.nn as nn
-from .cnn_basic import Conv
-
-
-# Spatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher
-class SPPF(nn.Module):
-    """
-        This code is adapted from https://github.com/ultralytics/yolov5
-    """
-    def __init__(self, cfg, in_dim, out_dim, expand_ratio=0.5):
-        super().__init__()
-        inter_dim = int(in_dim * expand_ratio)
-        self.out_dim = out_dim
-        self.cv1 = Conv(in_dim, inter_dim, k=1, act_type=cfg['neck_act'], norm_type=cfg['neck_norm'])
-        self.cv2 = Conv(inter_dim * 4, out_dim, k=1, act_type=cfg['neck_act'], norm_type=cfg['neck_norm'])
-        self.m = nn.MaxPool2d(kernel_size=cfg['pooling_size'], stride=1, padding=cfg['pooling_size'] // 2)
-
-    def forward(self, x):
-        x = self.cv1(x)
-        y1 = self.m(x)
-        y2 = self.m(y1)
-
-        return self.cv2(torch.cat((x, y1, y2, self.m(y2)), 1))
-
-
-# SPPF block with CSP module
-class SPPFBlockCSP(nn.Module):
-    """
-        CSP Spatial Pyramid Pooling Block
-    """
-    def __init__(self, cfg, in_dim, out_dim, expand_ratio):
-        super(SPPFBlockCSP, self).__init__()
-        inter_dim = int(in_dim * expand_ratio)
-        self.out_dim = out_dim
-        self.cv1 = Conv(in_dim, inter_dim, k=1, act_type=cfg['neck_act'], norm_type=cfg['neck_norm'])
-        self.cv2 = Conv(in_dim, inter_dim, k=1, act_type=cfg['neck_act'], norm_type=cfg['neck_norm'])
-        self.m = nn.Sequential(
-            Conv(inter_dim, inter_dim, k=3, p=1, 
-                 act_type=cfg['neck_act'], norm_type=cfg['neck_norm'], 
-                 depthwise=cfg['neck_depthwise']),
-            SPPF(cfg, inter_dim, inter_dim, expand_ratio=1.0),
-            Conv(inter_dim, inter_dim, k=3, p=1, 
-                 act_type=cfg['neck_act'], norm_type=cfg['neck_norm'], 
-                 depthwise=cfg['neck_depthwise'])
-        )
-        self.cv3 = Conv(inter_dim * 2, self.out_dim, k=1, act_type=cfg['neck_act'], norm_type=cfg['neck_norm'])
-
-        
-    def forward(self, x):
-        x1 = self.cv1(x)
-        x2 = self.cv2(x)
-        x3 = self.m(x2)
-        y = self.cv3(torch.cat([x1, x3], dim=1))
-
-        return y
-
-
-def build_neck(cfg, in_dim, out_dim):
-    model = cfg['neck']
-    print('==============================')
-    print('Neck: {}'.format(model))
-    # build neck
-    if model == 'sppf':
-        neck = SPPF(cfg, in_dim, out_dim, cfg['neck_expand_ratio'])
-    elif model == 'csp_sppf':
-        neck = SPPFBlockCSP(cfg, in_dim, out_dim, cfg['neck_expand_ratio'])
-
-    return neck
-        
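
A small shape check (illustrative dimensions) of the SPPF forward pass above: because the max-pool uses stride 1 and padding pooling_size // 2, the three chained poolings preserve spatial size and the concatenation only grows the channel dimension.

# Sketch of the pooling chain in SPPF.forward; dims are placeholders.
import torch
import torch.nn as nn

pooling_size = 5
m = nn.MaxPool2d(kernel_size=pooling_size, stride=1, padding=pooling_size // 2)
x = torch.randn(1, 64, 20, 20)   # stand-in for cv1(x)
y1 = m(x)
y2 = m(y1)
out = torch.cat((x, y1, y2, m(y2)), 1)
print(out.shape)  # torch.Size([1, 256, 20, 20]) -> fed to cv2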

+ 0 - 98
models/detectors/rtdetr/image_encoder/cnn_pafpn.py

@@ -1,98 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-from .cnn_basic import (Conv, build_reduce_layer, build_downsample_layer, build_fpn_block)
-
-
-# YOLO-Style PaFPN
-class YolovxPaFPN(nn.Module):
-    def __init__(self, cfg, in_dims=[512, 1024, 1024], out_dim=None, input_proj=False):
-        super(YolovxPaFPN, self).__init__()
-        # --------------------------- Basic Parameters ---------------------------
-        self.in_dims = in_dims
-        if input_proj:
-            self.fpn_dims = [round(256*cfg['width']), round(512*cfg['width']), round(1024*cfg['width'])]
-        else:
-            self.fpn_dims = in_dims
-
-        # --------------------------- Input proj ---------------------------
-        self.input_projs = nn.ModuleList([nn.Conv2d(in_dim, fpn_dim, kernel_size=1)
-                                          for in_dim, fpn_dim in zip(in_dims, self.fpn_dims)])
-        
-        # --------------------------- Top-down FPN---------------------------
-        ## P5 -> P4
-        self.reduce_layer_1 = build_reduce_layer(cfg, self.fpn_dims[2], self.fpn_dims[2]//2)
-        self.top_down_layer_1 = build_fpn_block(cfg, self.fpn_dims[1] + self.fpn_dims[2]//2, self.fpn_dims[1])
-
-        ## P4 -> P3
-        self.reduce_layer_2 = build_reduce_layer(cfg, self.fpn_dims[1], self.fpn_dims[1]//2)
-        self.top_down_layer_2 = build_fpn_block(cfg, self.fpn_dims[0] + self.fpn_dims[1]//2, self.fpn_dims[0])
-
-        # --------------------------- Bottom-up FPN ---------------------------
-        ## P3 -> P4
-        self.downsample_layer_1 = build_downsample_layer(cfg, self.fpn_dims[0], self.fpn_dims[0])
-        self.bottom_up_layer_1 = build_fpn_block(cfg, self.fpn_dims[0] + self.fpn_dims[1]//2, self.fpn_dims[1])
-
-        ## P4 -> P5
-        self.downsample_layer_2 = build_downsample_layer(cfg, self.fpn_dims[1], self.fpn_dims[1])
-        self.bottom_up_layer_2 = build_fpn_block(cfg, self.fpn_dims[1] + self.fpn_dims[2]//2, self.fpn_dims[2])
-                
-        # --------------------------- Output proj ---------------------------
-        if out_dim is not None:
-            self.out_layers = nn.ModuleList([
-                Conv(in_dim, out_dim, k=1,
-                     act_type=cfg['fpn_act'], norm_type=cfg['fpn_norm'])
-                     for in_dim in self.fpn_dims
-                     ])
-            self.out_dim = [out_dim] * 3
-        else:
-            self.out_layers = None
-            self.out_dim = self.fpn_dims
-
-
-    def forward(self, features):
-        fpn_feats = [layer(feat) for feat, layer in zip(features, self.input_projs)]
-        c3, c4, c5 = fpn_feats
-
-        # Top down
-        ## P5 -> P4
-        c6 = self.reduce_layer_1(c5)
-        c7 = F.interpolate(c6, scale_factor=2.0)
-        c8 = torch.cat([c7, c4], dim=1)
-        c9 = self.top_down_layer_1(c8)
-        ## P4 -> P3
-        c10 = self.reduce_layer_2(c9)
-        c11 = F.interpolate(c10, scale_factor=2.0)
-        c12 = torch.cat([c11, c3], dim=1)
-        c13 = self.top_down_layer_2(c12)
-
-        # Bottom up
-        ## p3 -> P4
-        c14 = self.downsample_layer_1(c13)
-        c15 = torch.cat([c14, c10], dim=1)
-        c16 = self.bottom_up_layer_1(c15)
-        ## P4 -> P5
-        c17 = self.downsample_layer_2(c16)
-        c18 = torch.cat([c17, c6], dim=1)
-        c19 = self.bottom_up_layer_2(c18)
-
-        out_feats = [c13, c16, c19] # [P3, P4, P5]
-        
-        # output proj layers
-        if self.out_layers is not None:
-            out_feats_proj = []
-            for feat, layer in zip(out_feats, self.out_layers):
-                out_feats_proj.append(layer(feat))
-            return out_feats_proj
-
-        return out_feats
-
-
-def build_fpn(cfg, in_dims, out_dim=None, input_proj=False):
-    model = cfg['fpn']
-    # build pafpn
-    if model == 'yolovx_pafpn':
-        fpn_net = YolovxPaFPN(cfg, in_dims, out_dim, input_proj)
-
-    return fpn_net
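
Below is a shape walk-through (illustrative channel counts only) of one top-down step in YolovxPaFPN.forward: P5 is reduced, upsampled by 2x, and concatenated with P4 before entering the top_down_layer.

# Sketch of the P5 -> P4 top-down step; channel counts are placeholders.
import torch
import torch.nn.functional as F

c5 = torch.randn(1, 256, 20, 20)              # P5 feature
c4 = torch.randn(1, 128, 40, 40)              # P4 feature
c6 = torch.randn(1, 128, 20, 20)              # stand-in for reduce_layer_1(c5)
c7 = F.interpolate(c6, scale_factor=2.0)      # [1, 128, 40, 40]
c8 = torch.cat([c7, c4], dim=1)               # [1, 256, 40, 40] -> top_down_layer_1
print(c7.shape, c8.shape)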

+ 0 - 39
models/detectors/rtdetr/image_encoder/img_encoder.py

@@ -1,39 +0,0 @@
-import torch
-import torch.nn as nn
-
-from .cnn_backbone import build_backbone
-from .cnn_neck import build_neck
-from .cnn_pafpn import build_fpn
-
-
-# ------------------------ Image Encoder ------------------------
-class ImageEncoder(nn.Module):
-    def __init__(self, cfg, trainable=False) -> None:
-        super().__init__()
-        ## Backbone
-        self.backbone, feats_dim = build_backbone(cfg, cfg['pretrained']*trainable)
-
-        ## Encoder
-        self.encoder = build_neck(cfg, feats_dim[-1], feats_dim[-1])
-
-        ## CSFM
-        self.csfm = build_fpn(cfg=cfg, in_dims=feats_dim, out_dim=round(cfg['d_model']*cfg['width']), input_proj=True)
-
-
-    def forward(self, x):
-        # Backbone
-        pyramid_feats = self.backbone(x)
-
-        # Encoder
-        pyramid_feats[-1] = self.encoder(pyramid_feats[-1])
-
-        # CSFM
-        pyramid_feats = self.csfm(pyramid_feats)
-
-        return pyramid_feats
-
-
-# build img-encoder
-def build_img_encoder(cfg, trainable):
-    return ImageEncoder(cfg, trainable)
-

+ 0 - 171
models/detectors/rtdetr/loss.py

@@ -1,171 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import copy
-
-from .matcher import build_matcher
-from utils.misc import sigmoid_focal_loss
-from utils.box_ops import box_cxcywh_to_xyxy, generalized_box_iou
-from utils.distributed_utils import is_dist_avail_and_initialized, get_world_size
-
-
-class Criterion(nn.Module):
-    """ This class computes the loss for DETR.
-    The process happens in two steps:
-        1) we compute hungarian assignment between ground truth boxes and the outputs of the model
-        2) we supervise each pair of matched ground-truth / prediction (supervise class and box)
-    """
-    def __init__(self, num_classes, matcher, weight_dict, losses, focal_alpha=0.25):
-        """ Create the criterion.
-        Parameters:
-            num_classes: number of object categories, omitting the special no-object category
-            matcher: module able to compute a matching between targets and proposals
-            weight_dict: dict containing as key the names of the losses and as values their relative weight.
-            focal_alpha: alpha parameter of the sigmoid focal loss used for classification
-            losses: list of all the losses to be applied. See get_loss for list of available losses.
-        """
-        super().__init__()
-        self.num_classes = num_classes
-        self.matcher = matcher
-        self.weight_dict = weight_dict
-        self.losses = losses
-        self.focal_alpha = focal_alpha
-
-
-    def _get_src_permutation_idx(self, indices):
-        # permute predictions following indices
-        batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
-        src_idx = torch.cat([src for (src, _) in indices])
-        return batch_idx, src_idx
-
-
-    def _get_tgt_permutation_idx(self, indices):
-        # permute targets following indices
-        batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
-        tgt_idx = torch.cat([tgt for (_, tgt) in indices])
-        return batch_idx, tgt_idx
-
-
-    def loss_labels(self, outputs, targets, indices, num_boxes):
-        """Classification loss (sigmoid focal loss)
-        targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes]
-        """
-        assert 'pred_logits' in outputs
-        src_logits = outputs['pred_logits']
-
-        idx = self._get_src_permutation_idx(indices)
-        target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]).to(src_logits.device)
-        target_classes = torch.full(src_logits.shape[:2], self.num_classes,
-                                    dtype=torch.int64, device=src_logits.device)
-        target_classes[idx] = target_classes_o
-
-        target_classes_onehot = torch.zeros([src_logits.shape[0], src_logits.shape[1], src_logits.shape[2] + 1],
-                                            dtype=src_logits.dtype, layout=src_logits.layout, device=src_logits.device)
-        target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1)
-
-        target_classes_onehot = target_classes_onehot[:, :, :-1]
-        loss_cls = sigmoid_focal_loss(src_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) * \
-                  src_logits.shape[1]
-        losses = {'loss_cls': loss_cls}
-
-        return losses
-
-
-    def loss_boxes(self, outputs, targets, indices, num_boxes):
-        """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss
-           targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
-           The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
-        """
-        assert 'pred_boxes' in outputs
-        idx = self._get_src_permutation_idx(indices)
-        src_boxes = outputs['pred_boxes'][idx]
-        target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0).to(src_boxes.device)
-
-        loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none')
-
-        losses = {}
-        losses['loss_bbox'] = loss_bbox.sum() / num_boxes
-
-        loss_giou = 1 - torch.diag(generalized_box_iou(
-            box_cxcywh_to_xyxy(src_boxes),
-            box_cxcywh_to_xyxy(target_boxes)))
-        losses['loss_giou'] = loss_giou.sum() / num_boxes
-        return losses
-
-
-    def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs):
-        loss_map = {
-            'labels': self.loss_labels,
-            'boxes': self.loss_boxes,
-        }
-        assert loss in loss_map, f'do you really want to compute {loss} loss?'
-        return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs)
-
-
-    def forward(self, outputs, targets, epoch=0):
-        """ This performs the loss computation.
-        Parameters:
-             outputs: dict of tensors, see the output specification of the model for the format
-             targets: list of dicts, such that len(targets) == batch_size.
-                      The expected keys in each dict depends on the losses applied, see each loss' doc
-        """
-        outputs_without_aux = {k: v for k, v in outputs.items() if k != 'aux_outputs'}
-
-        # Retrieve the matching between the outputs of the last layer and the targets
-        indices = self.matcher(outputs_without_aux, targets)
-
-        # Compute the average number of target boxes across all nodes, for normalization purposes
-        num_boxes = sum(len(t["labels"]) for t in targets)
-        num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
-        if is_dist_avail_and_initialized():
-            torch.distributed.all_reduce(num_boxes)
-        num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item()
-
-        # Compute all the requested losses
-        losses = {}
-        for loss in self.losses:
-            losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes))
-
-        # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
-        if 'aux_outputs' in outputs:
-            for i, aux_outputs in enumerate(outputs['aux_outputs']):
-                indices = self.matcher(aux_outputs, targets)
-                for loss in self.losses:
-                    kwargs = {}
-                    l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_boxes, **kwargs)
-                    l_dict = {k + f'_{i}': v for k, v in l_dict.items()}
-                    losses.update(l_dict)
-
-        weight_dict = self.weight_dict
-        total_loss = sum(losses[k] * weight_dict[k] for k in losses.keys() if k in weight_dict)
-        losses['losses'] = total_loss
-
-        return losses
-
-
-# build criterion
-def build_criterion(cfg, num_classes, aux_loss=False):
-    matcher = build_matcher(cfg)
-    
-    weight_dict = {'loss_cls': cfg['loss_cls_weight'],
-                  'loss_bbox': cfg['loss_box_weight'],
-                  'loss_giou': cfg['loss_giou_weight']}
-
-    # TODO this is a hack
-    if aux_loss:
-        aux_weight_dict = {}
-        for i in range(cfg['num_decoder_layers'] - 1):
-            aux_weight_dict.update({k + f'_{i}': v for k, v in weight_dict.items()})
-        weight_dict.update(aux_weight_dict)
-
-    losses = ['labels', 'boxes']
-    
-    criterion = Criterion(
-        num_classes=num_classes,
-        matcher=matcher,
-        weight_dict=weight_dict,
-        losses=losses,
-        focal_alpha=cfg['focal_alpha'])
-
-    return criterion
-    
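
To make the aux-loss "hack" in build_criterion concrete, the sketch below shows the key expansion it produces; the layer count and weights here are example values, not the removed config's.

# Illustrative expansion of weight_dict for auxiliary decoder layers.
weight_dict = {'loss_cls': 1.0, 'loss_bbox': 5.0, 'loss_giou': 2.0}
num_decoder_layers = 3  # example value
aux_weight_dict = {}
for i in range(num_decoder_layers - 1):
    aux_weight_dict.update({k + f'_{i}': v for k, v in weight_dict.items()})
weight_dict.update(aux_weight_dict)
print(sorted(weight_dict))
# ['loss_bbox', 'loss_bbox_0', 'loss_bbox_1', 'loss_cls', 'loss_cls_0',
#  'loss_cls_1', 'loss_giou', 'loss_giou_0', 'loss_giou_1']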

+ 0 - 102
models/detectors/rtdetr/matcher.py

@@ -1,102 +0,0 @@
-import torch
-import torch.nn as nn
-from scipy.optimize import linear_sum_assignment
-from utils.box_ops import box_cxcywh_to_xyxy, generalized_box_iou
-
-
-class HungarianMatcher(nn.Module):
-    """This class computes an assignment between the targets and the predictions of the network
-    For efficiency reasons, the targets don't include the no_object. Because of this, in general,
-    there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
-    while the others are un-matched (and thus treated as non-objects).
-    """
-
-    def __init__(self, cost_class: float = 1, cost_bbox: float = 1, cost_giou: float = 1):
-        """Creates the matcher
-        Params:
-            cost_class: This is the relative weight of the classification error in the matching cost
-            cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost
-            cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost
-        """
-        super().__init__()
-        self.cost_class = cost_class
-        self.cost_bbox = cost_bbox
-        self.cost_giou = cost_giou
-        assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs can't be 0"
-
-
-    @torch.no_grad()
-    def forward(self, outputs, targets):
-        """ Performs the matching
-        Params:
-            outputs: This is a dict that contains at least these entries:
-                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
-                 "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates
-            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
-                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
-                           objects in the target) containing the class labels
-                 "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates
-        Returns:
-            A list of size batch_size, containing tuples of (index_i, index_j) where:
-                - index_i is the indices of the selected predictions (in order)
-                - index_j is the indices of the corresponding selected targets (in order)
-            For each batch element, it holds:
-                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
-        """
-        bs, num_queries = outputs["pred_logits"].shape[:2]
-
-        # We flatten to compute the cost matrices in a batch
-        # [B * num_queries, C] = [N, C], where N is B * num_queries
-        out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid()
-        # [B * num_queries, 4] = [N, 4]
-        out_bbox = outputs["pred_boxes"].flatten(0, 1)
-
-        # Also concat the target labels and boxes
-        # [M,] where M is number of all targets in this batch
-        tgt_ids = torch.cat([v["labels"] for v in targets])
-        # [M, 4] where M is number of all targets in this batch
-        tgt_bbox = torch.cat([v["boxes"] for v in targets])
-
-        # Compute the classification cost.
-        alpha = 0.25
-        gamma = 2.0
-        neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log())
-        pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
-        cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids]
-
-        # Compute the L1 cost between boxes
-        # [N, M]
-        cost_bbox = torch.cdist(out_bbox, tgt_bbox.to(out_bbox.device), p=1)
-
-        # Compute the giou cost betwen boxes
-        # [N, M]
-        cost_giou = -generalized_box_iou(
-            box_cxcywh_to_xyxy(out_bbox),
-            box_cxcywh_to_xyxy(tgt_bbox.to(out_bbox.device)))
-
-        # Final cost matrix: [N, M]
-        C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
-        # [N, M] -> [B, num_queries, M]
-        C = C.view(bs, num_queries, -1).cpu()
-
-        # The number of boxes in each image
-        sizes = [len(v["boxes"]) for v in targets]
-        # Split the last dimension of C by the per-image target counts: each chunk c is
-        # [B, num_queries, M_i] with sum(M_i) = M, so c[i] is the [num_queries, M_i] cost
-        # matrix between the i-th image's predictions and its own targets.
-        indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
-        # For each (i, j) in indices, i holds the selected prediction (row) indices and
-        # j holds the matched target (column) indices of that image's cost matrix;
-        # len(i) == len(j) and len(indices) == batch_size.
-        return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
-
-
-def build_matcher(cfg):
-    return HungarianMatcher(
-        cost_class=cfg['set_cost_class'],
-        cost_bbox=cfg['set_cost_bbox'],
-        cost_giou=cfg['set_cost_giou']
-        )
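
For clarity, here is a minimal, self-contained sketch of the per-image split-and-assign step at the end of HungarianMatcher.forward; the cost values and sizes are random placeholders.

# Illustrative only: split the cost matrix by per-image target counts,
# then run Hungarian matching on each image's block.
import torch
from scipy.optimize import linear_sum_assignment

bs, num_queries = 2, 4
sizes = [1, 2]                                  # GT boxes per image
C = torch.rand(bs, num_queries, sum(sizes))     # [B, num_queries, M]
indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
for i, (src, tgt) in enumerate(indices):
    print(f"image {i}: query idx {src} -> target idx {tgt}")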

+ 0 - 163
models/detectors/rtdetr/rtdetr.py

@@ -1,163 +0,0 @@
-import torch
-import torch.nn as nn
-
-from .rtdetr_encoder import build_encoder
-from .rtdetr_compressor import build_compressor
-from .rtdetr_decoder import build_decoder
-from .rtdetr_dethead import build_dethead
-
-
-# Real-time DETR
-class RTDETR(nn.Module):
-    def __init__(self, 
-                 cfg,
-                 device, 
-                 num_classes = 20, 
-                 trainable = False, 
-                 aux_loss = False,
-                 with_box_refine = False,
-                 deploy = False):
-        super(RTDETR, self).__init__()
-        # --------- Basic Parameters ----------
-        self.cfg = cfg
-        self.device = device
-        self.num_classes = num_classes
-        self.trainable = trainable
-        self.max_stride = max(cfg['stride'])
-        self.d_model = round(cfg['d_model'] * self.cfg['width'])
-        self.aux_loss = aux_loss
-        self.with_box_refine = with_box_refine
-        self.deploy = deploy
-        
-        # --------- Network Parameters ----------
-        ## Encoder
-        self.encoder = build_encoder(cfg, trainable, 'img_encoder')
-
-        ## Compressor
-        self.compressor = build_compressor(cfg, self.d_model)
-
-        ## Decoder
-        self.decoder = build_decoder(cfg, self.d_model, return_intermediate=aux_loss)
-
-        ## DetHead
-        self.dethead = build_dethead(cfg, self.d_model, num_classes, with_box_refine)
-            
-        # share the detection heads with the Transformer decoder (used for iterative box refinement)
-        self.decoder.class_embed = self.dethead.class_embed
-        self.decoder.bbox_embed = self.dethead.bbox_embed
-
-
-    # ---------------------- Basic Functions ----------------------
-    def position_embedding(self, x, temperature=10000):
-        hs, ws = x.shape[-2:]
-        device = x.device
-        num_pos_feats = x.shape[1] // 2       
-        scale = 2 * 3.141592653589793
-
-        # generate xy coord mat
-        y_embed, x_embed = torch.meshgrid(
-            [torch.arange(1, hs+1, dtype=torch.float32),
-             torch.arange(1, ws+1, dtype=torch.float32)])
-        y_embed = y_embed / (hs + 1e-6) * scale
-        x_embed = x_embed / (ws + 1e-6) * scale
-    
-        # [H, W] -> [1, H, W]
-        y_embed = y_embed[None, :, :].to(device)
-        x_embed = x_embed[None, :, :].to(device)
-
-        dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=device)
-        dim_t_ = torch.div(dim_t, 2, rounding_mode='floor') / num_pos_feats
-        dim_t = temperature ** (2 * dim_t_)
-
-        pos_x = torch.div(x_embed[:, :, :, None], dim_t)
-        pos_y = torch.div(y_embed[:, :, :, None], dim_t)
-        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
-        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
-
-        # [B, C, H, W]
-        pos_embed = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
-        
-        return pos_embed
-        
-
-    @torch.jit.unused
-    def set_aux_loss(self, outputs_class, outputs_coord):
-        # this is a workaround to make torchscript happy, as torchscript
-        # doesn't support dictionary with non-homogeneous values, such
-        # as a dict having both a Tensor and a list.
-        return [{'pred_logits': a, 'pred_boxes': b}
-                for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]
-
-
-    # ---------------------- Main Process for Inference ----------------------
-    @torch.no_grad()
-    def inference_single_image(self, x):
-        # -------------------- Encoder --------------------
-        pyramid_feats = self.encoder(x)
-
-        # -------------------- Pos Embed --------------------
-        memory = torch.cat([feat.flatten(2) for feat in pyramid_feats], dim=-1)
-        memory_pos = torch.cat([self.position_embedding(feat).flatten(2) for feat in pyramid_feats], dim=-1)
-        memory = memory.permute(0, 2, 1).contiguous()
-        memory_pos = memory_pos.permute(0, 2, 1).contiguous()
-
-        # -------------------- Compressor --------------------
-        compressed_memory = self.compressor(memory, memory_pos)
-
-        # -------------------- Decoder --------------------
-        hs, reference = self.decoder(compressed_memory, None)
-
-        # -------------------- DetHead --------------------
-        out_logits, out_bbox = self.dethead(hs, reference, False)
-        cls_pred, box_pred = out_logits[0], out_bbox[0]
-
-        # -------------------- Top-k --------------------
-        cls_pred = cls_pred.flatten().sigmoid_()
-        num_topk = 100
-        predicted_prob, topk_idxs = cls_pred.sort(descending=True)
-        topk_idxs = topk_idxs[:num_topk]
-        topk_box_idxs = torch.div(topk_idxs, self.num_classes, rounding_mode='floor')
-        topk_scores = predicted_prob[:num_topk]
-        topk_labels = topk_idxs % self.num_classes
-        topk_bboxes = box_pred[topk_box_idxs]
-
-        # denormalize bbox
-        img_h, img_w = x.shape[-2:]
-        topk_bboxes[..., 0::2] *= img_w
-        topk_bboxes[..., 1::2] *= img_h
-
-        if self.deploy:
-            return topk_bboxes, topk_scores, topk_labels
-        else:
-            return topk_bboxes.cpu().numpy(), topk_scores.cpu().numpy(), topk_labels.cpu().numpy()
-        
-
-    # ---------------------- Main Process for Training ----------------------
-    def forward(self, x):
-        if not self.trainable:
-            return self.inference_single_image(x)
-        else:
-            # -------------------- Encoder --------------------
-            pyramid_feats = self.encoder(x)
-
-            # -------------------- Pos Embed --------------------
-            memory = torch.cat([feat.flatten(2) for feat in pyramid_feats], dim=-1)
-            memory_pos = torch.cat([self.position_embedding(feat).flatten(2) for feat in pyramid_feats], dim=-1)
-            memory = memory.permute(0, 2, 1).contiguous()
-            memory_pos = memory_pos.permute(0, 2, 1).contiguous()
-            
-            # -------------------- Compressor --------------------
-            compressed_memory = self.compressor(memory, memory_pos)
-
-            # -------------------- Decoder --------------------
-            hs, reference = self.decoder(compressed_memory, None)
-
-            # -------------------- DetHead --------------------
-            outputs_class, outputs_coords = self.dethead(hs, reference, True)
-
-            outputs = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coords[-1]}
-            if self.aux_loss:
-                outputs['aux_outputs'] = self.set_aux_loss(outputs_class, outputs_coords)
-            
-            return outputs
-    
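
The sketch below (illustrative sizes, random scores) isolates the top-k decode used in inference_single_image: class scores are flattened over queries and classes, and each flat index is split back into a query index (integer division) and a class label (modulo).

# Minimal sketch of the top-k decode step; values are placeholders.
import torch

num_queries, num_classes, num_topk = 6, 3, 4
cls_pred = torch.rand(num_queries, num_classes).flatten().sigmoid()
box_pred = torch.rand(num_queries, 4)

predicted_prob, topk_idxs = cls_pred.sort(descending=True)
topk_idxs = topk_idxs[:num_topk]
topk_box_idxs = torch.div(topk_idxs, num_classes, rounding_mode='floor')  # which query
topk_labels = topk_idxs % num_classes                                     # which class
topk_scores = predicted_prob[:num_topk]
topk_bboxes = box_pred[topk_box_idxs]
print(topk_box_idxs.tolist(), topk_labels.tolist(), topk_bboxes.shape)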

+ 0 - 221
models/detectors/rtdetr/rtdetr_basic.py

@@ -1,221 +0,0 @@
-import copy
-from typing import Optional
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from torch import Tensor
-
-
-# ------------------------------- Basic Modules -------------------------------
-def get_activation(act_type=None):
-    if act_type == 'relu':
-        return nn.ReLU(inplace=True)
-    elif act_type == 'gelu':
-        return nn.GELU()
-    elif act_type == 'lrelu':
-        return nn.LeakyReLU(0.1, inplace=True)
-    elif act_type == 'mish':
-        return nn.Mish(inplace=True)
-    elif act_type == 'silu':
-        return nn.SiLU(inplace=True)
-
-
-def get_norm(norm_type, dim):
-    if norm_type == 'BN':
-        return nn.BatchNorm2d(dim)
-    elif norm_type == 'GN':
-        return nn.GroupNorm(num_groups=32, num_channels=dim)
-    elif norm_type == 'LN':
-        return nn.LayerNorm(dim)
-
-
-def get_clones(module, N):
-    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
-    
-
-def build_multi_head_attention(d_model, num_heads, dropout, attn_type='mhsa'):
-    if attn_type == 'mhsa':
-        attn_layer = MultiHeadAttention(d_model, num_heads, dropout)
-    elif attn_type == 's_mhsa':
-        attn_layer = None
-
-    return attn_layer
-
-
-# ------------------------------- MLP -------------------------------
-class MLP(nn.Module):
-    """ Very simple multi-layer perceptron (also called FFN)"""
-
-    def __init__(self, in_dim, hidden_dim, out_dim, num_layers):
-        super().__init__()
-        self.num_layers = num_layers
-        h = [hidden_dim] * (num_layers - 1)
-        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([in_dim] + h, h + [out_dim]))
-
-    def forward(self, x):
-        for i, layer in enumerate(self.layers):
-            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
-        return x
-
-
-# ------------------------------- Transformer Modules -------------------------------
-## Vanilla Multi-Head Attention
-class MultiHeadAttention(nn.Module):
-    def __init__(self, d_model, num_heads, dropout=0.) -> None:
-        super().__init__()
-        # --------------- Basic parameters ---------------
-        self.d_model = d_model
-        self.num_heads = num_heads
-        self.dropout = dropout
-        self.scale = (d_model // num_heads) ** -0.5
-
-        # --------------- Network parameters ---------------
-        self.q_proj = nn.Linear(d_model, d_model, bias = False) # W_q
-        self.k_proj = nn.Linear(d_model, d_model, bias = False) # W_k
-        self.v_proj = nn.Linear(d_model, d_model, bias = False) # W_v
-
-        self.out_proj = nn.Linear(d_model, d_model)
-        self.dropout = nn.Dropout(dropout)
-
-
-    def forward(self, query, key, value):
-        """
-        Inputs:
-            query : (Tensor) -> [B, Nq, C]
-            key   : (Tensor) -> [B, Nk, C]
-            value : (Tensor) -> [B, Nk, C]
-        """
-        bs = query.shape[0]
-        Nq = query.shape[1]
-        Nk = key.shape[1]
-
-        # ----------------- Input proj -----------------
-        query = self.q_proj(query)
-        key   = self.k_proj(key)
-        value = self.v_proj(value)
-
-        # ----------------- Multi-head Attn -----------------
-        ## [B, N, C] -> [B, N, H, C_h] -> [B, H, N, C_h]
-        query = query.view(bs, Nq, self.num_heads, self.d_model // self.num_heads)
-        query = query.permute(0, 2, 1, 3).contiguous()
-        key   = key.view(bs, Nk, self.num_heads, self.d_model // self.num_heads)
-        key   = key.permute(0, 2, 1, 3).contiguous()
-        value = value.view(bs, Nk, self.num_heads, self.d_model // self.num_heads)
-        value = value.permute(0, 2, 1, 3).contiguous()
-        # Attention
-        ## [B, H, Nq, C_h] X [B, H, C_h, Nk] = [B, H, Nq, Nk]
-        sim_matrix = torch.matmul(query, key.transpose(-1, -2)) * self.scale
-        sim_matrix = torch.softmax(sim_matrix, dim=-1)
-
-        # ----------------- Output -----------------
-        out = torch.matmul(sim_matrix, value)  # [B, H, Nq, C_h]
-        out = out.permute(0, 2, 1, 3).contiguous().view(bs, Nq, -1)
-        out = self.out_proj(out)
-
-        return out
-        
-## Transformer Encoder layer
-class TREncoderLayer(nn.Module):
-    def __init__(self,
-                 d_model,
-                 num_heads,
-                 dim_feedforward=2048,
-                 dropout=0.1,
-                 act_type="relu",
-                 attn_type='mhsa'
-                 ):
-        super().__init__()
-        # Multi-head Self-Attn
-        self.self_attn = build_multi_head_attention(d_model, num_heads, dropout, attn_type)
-
-        # Feedforward Network
-        self.linear1 = nn.Linear(d_model, dim_feedforward)
-        self.dropout = nn.Dropout(dropout)
-        self.linear2 = nn.Linear(dim_feedforward, d_model)
-
-        self.norm1 = nn.LayerNorm(d_model)
-        self.norm2 = nn.LayerNorm(d_model)
-        self.dropout1 = nn.Dropout(dropout)
-        self.dropout2 = nn.Dropout(dropout)
-
-        self.activation = get_activation(act_type)
-
-
-    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
-        return tensor if pos is None else tensor + pos
-
-
-    def forward(self, src, pos):
-        """
-        Input:
-            src: [torch.Tensor] -> [B, N, C]
-            pos: [torch.Tensor] -> [B, N, C]
-        Output:
-            src: [torch.Tensor] -> [B, N, C]
-        """
-        q = k = self.with_pos_embed(src, pos)
-
-        # self-attn
-        src2 = self.self_attn(q, k, value=src)
-
-        # residual connection + layer norm
-        src = src + self.dropout1(src2)
-        src = self.norm1(src)
-
-        # ffn
-        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
-        src = src + self.dropout2(src2)
-        src = self.norm2(src)
-        
-        return src
-
-## Transformer Decoder layer
-class TRDecoderLayer(nn.Module):
-    def __init__(self, d_model, num_heads, dim_feedforward=2048, dropout=0.1, act_type="relu", attn_type='mhsa'):
-        super().__init__()
-        # Multi-head Self-Attn
-        self.self_attn = build_multi_head_attention(d_model, num_heads, dropout, attn_type)
-        self.cross_attn = build_multi_head_attention(d_model, num_heads, dropout)
-        # Feedforward Network
-        self.linear1 = nn.Linear(d_model, dim_feedforward)
-        self.dropout = nn.Dropout(dropout)
-        self.linear2 = nn.Linear(dim_feedforward, d_model)
-
-        self.norm1 = nn.LayerNorm(d_model)
-        self.norm2 = nn.LayerNorm(d_model)
-        self.norm3 = nn.LayerNorm(d_model)
-        self.dropout1 = nn.Dropout(dropout)
-        self.dropout2 = nn.Dropout(dropout)
-        self.dropout3 = nn.Dropout(dropout)
-
-        self.activation = get_activation(act_type)
-
-
-    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
-        return tensor if pos is None else tensor + pos
-
-
-    def forward(self, tgt, tgt_query_pos, memory, memory_pos):
-        # self attention
-        tgt2 = self.self_attn(
-            query=self.with_pos_embed(tgt, tgt_query_pos),
-            key=self.with_pos_embed(tgt, tgt_query_pos),
-            value=tgt)  # this MultiHeadAttention returns a tensor, not an (output, weights) tuple
-        tgt = tgt + self.dropout1(tgt2)
-        tgt = self.norm1(tgt)
-
-        # cross attention
-        tgt2 = self.cross_attn(
-            query=self.with_pos_embed(tgt, tgt_query_pos),
-            key=self.with_pos_embed(memory, memory_pos),
-            value=memory)
-        tgt = tgt + self.dropout2(tgt2)
-        tgt = self.norm2(tgt)
-
-        # ffn
-        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
-        tgt = tgt + self.dropout3(tgt2)
-        tgt = self.norm3(tgt)
-        
-        return tgt
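
A shape walk-through (made-up dimensions) of the head split used by MultiHeadAttention above: queries/keys/values go from [B, N, C] to [B, H, N, C/H], the attention map is [B, H, Nq, Nk], and the output is merged back to [B, Nq, C].

# Illustrative shape check of the multi-head reshape/attention/merge pattern.
import torch

B, Nq, Nk, C, H = 2, 300, 400, 256, 8
q = torch.randn(B, Nq, C).view(B, Nq, H, C // H).permute(0, 2, 1, 3)
k = torch.randn(B, Nk, C).view(B, Nk, H, C // H).permute(0, 2, 1, 3)
v = torch.randn(B, Nk, C).view(B, Nk, H, C // H).permute(0, 2, 1, 3)
attn = torch.softmax(q @ k.transpose(-1, -2) * (C // H) ** -0.5, dim=-1)
out = (attn @ v).permute(0, 2, 1, 3).reshape(B, Nq, C)
print(attn.shape, out.shape)  # [2, 8, 300, 400] and [2, 300, 256]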

+ 0 - 34
models/detectors/rtdetr/rtdetr_compressor.py

@@ -1,34 +0,0 @@
-import torch
-import torch.nn as nn
-
-from .rtdetr_basic import TRDecoderLayer
-
-
-# Transformer Decoder Module
-class MemoryCompressor(nn.Module):
-    def __init__(self, cfg, in_dim):
-        super().__init__()
-        # -------------------- Basic Parameters ---------------------
-        self.d_model = in_dim
-        self.ffn_dim = round(cfg['com_dim_feedforward']*cfg['width'])
-        self.compressed_vector = nn.Embedding(cfg['num_compressed'], in_dim)
-        # -------------------- Network Parameters ---------------------
-        self.compress_layer = TRDecoderLayer(
-            d_model=in_dim,
-            dim_feedforward=self.ffn_dim,
-            num_heads=cfg['com_num_heads'],
-            dropout=cfg['com_dropout'],
-            act_type=cfg['com_act']
-        )
-
-
-    def forward(self, memory, memory_pos):
-        bs = memory.size(0)
-        output = self.compressed_vector.weight[None].repeat(bs, 1, 1)
-        output = self.compress_layer(output, None, memory, memory_pos)
-
-        return output
-
-
-def build_compressor(cfg, in_dim):
-    return MemoryCompressor(cfg, in_dim)

+ 0 - 115
models/detectors/rtdetr/rtdetr_decoder.py

@@ -1,115 +0,0 @@
-import torch
-import torch.nn as nn
-
-from .rtdetr_basic import get_clones, TRDecoderLayer, MLP
-
-
-# Transformer Decoder Module
-class TransformerDecoder(nn.Module):
-    def __init__(self, cfg, in_dim, return_intermediate=False):
-        super().__init__()
-        # -------------------- Basic Parameters ---------------------
-        self.d_model = in_dim
-        self.query_dim = 4  # For RefPoint head
-        self.scale = 2 * 3.141592653589793
-        self.num_queries = cfg['num_queries']
-        self.num_decoder_layers = cfg['num_decoder_layers']
-        self.return_intermediate = return_intermediate
-        self.ffn_dim = round(cfg['de_dim_feedforward']*cfg['width'])
-
-        # -------------------- Network Parameters ---------------------
-        ## Decoder
-        decoder_layer = TRDecoderLayer(
-            d_model=in_dim,
-            dim_feedforward=self.ffn_dim,
-            num_heads=cfg['de_num_heads'],
-            dropout=cfg['de_dropout'],
-            act_type=cfg['de_act']
-        )
-        self.decoder_layers = get_clones(decoder_layer, cfg['num_decoder_layers'])
-        ## RefPoint Embed
-        self.refpoint_embed = nn.Embedding(cfg['num_queries'], 4)
-        self.ref_point_head = MLP(self.query_dim // 2 * in_dim, in_dim, in_dim, 2)
-        ## Object Query Embed
-        self.object_query = nn.Embedding(cfg['num_queries'], in_dim)
-        nn.init.normal_(self.object_query.weight.data)
-        ## TODO: Group queries
-
-        self.bbox_embed = None
-        self.class_embed = None
-
-
-    def inverse_sigmoid(self, x):
-        x = x.clamp(min=0, max=1)
-        return torch.log(x.clamp(min=1e-5)/(1 - x).clamp(min=1e-5))
-
-
-    def query_sine_embed(self, num_feats, reference_points):
-        dim_t = torch.arange(num_feats, dtype=torch.float32, device=reference_points.device)
-        dim_t_ = torch.div(dim_t, 2, rounding_mode='floor') / num_feats
-        dim_t = 10000 ** (2 * dim_t_)
-
-        x_embed = reference_points[:, :, 0] * self.scale
-        y_embed = reference_points[:, :, 1] * self.scale
-        pos_x = x_embed[:, :, None] / dim_t
-        pos_y = y_embed[:, :, None] / dim_t
-        pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2)
-        pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2)
-        w_embed = reference_points[:, :, 2] * self.scale
-        pos_w = w_embed[:, :, None] / dim_t
-        pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2)
-
-        h_embed = reference_points[:, :, 3] * self.scale
-        pos_h = h_embed[:, :, None] / dim_t
-        pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2)
-        query_sine_embed = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2)
-
-        return query_sine_embed
-    
-
-    def forward(self, memory, memory_pos):
-        bs, _, channels = memory.size()
-        num_feats = channels // 2
-
-        # prepare tgt & refpoint
-        tgt = self.object_query.weight[None].repeat(bs, 1, 1)
-        refpoint_embed = self.refpoint_embed.weight[None].repeat(bs, 1, 1)
-
-        intermediate = []
-        reference_points = refpoint_embed.sigmoid()
-        ref_points = [reference_points]
-
-        # main process
-        output = tgt
-        for layer_id, layer in enumerate(self.decoder_layers):
-            # Conditional query
-            query_sine_embed = self.query_sine_embed(num_feats, reference_points)
-            query_pos = self.ref_point_head(query_sine_embed) # [B, N, C]
-            # Decoder
-            output = layer(
-                    # input for decoder
-                    tgt = output,
-                    tgt_query_pos = query_pos,
-                    # input from encoder
-                    memory = memory,
-                    memory_pos = memory_pos,
-                )
-            # Iter update
-            if self.bbox_embed is not None:
-                delta_unsig = self.bbox_embed[layer_id](output)
-                outputs_unsig = delta_unsig + self.inverse_sigmoid(reference_points)
-                new_reference_points = outputs_unsig.sigmoid()
-
-                reference_points = new_reference_points.detach()
-                ref_points.append(new_reference_points)
-
-            intermediate.append(output)
-
-        return torch.stack(intermediate), torch.stack(ref_points)
-
-
-# build transformer decoder
-def build_decoder(cfg, in_dim, return_intermediate=False):
-    decoder = TransformerDecoder(cfg, in_dim, return_intermediate=return_intermediate) 
-
-    return decoder
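
The densest piece of the deleted decoder is query_sine_embed: each reference box (cx, cy, w, h) in [0, 1] is scaled by 2π and expanded into a sinusoidal embedding of num_feats = d_model // 2 dimensions per coordinate, concatenated in (y, x, w, h) order into a 2*d_model vector, which ref_point_head then projects to the d_model positional query. A standalone sketch that mirrors the deleted formula (function and variable names here are illustrative):

import math
import torch

def box_sine_embed(ref_points, num_feats=128, temperature=10000.0):
    """ref_points: [B, N, 4] normalized (cx, cy, w, h) -> [B, N, 4*num_feats]."""
    scale = 2 * math.pi
    dim_t = torch.arange(num_feats, dtype=torch.float32, device=ref_points.device)
    dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode='floor') / num_feats)

    embeds = []
    for idx in (1, 0, 2, 3):                 # (y, x, w, h) order, as in the deleted code
        pos = ref_points[:, :, idx] * scale  # [B, N]
        pos = pos[:, :, None] / dim_t        # [B, N, num_feats]
        pos = torch.stack((pos[:, :, 0::2].sin(), pos[:, :, 1::2].cos()), dim=3).flatten(2)
        embeds.append(pos)
    return torch.cat(embeds, dim=2)

# With d_model = 256 (num_feats = 128), 300 queries give a [2, 300, 512] embedding.
refs = torch.rand(2, 300, 4)
print(box_sine_embed(refs).shape)  # torch.Size([2, 300, 512])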

+ 0 - 83
models/detectors/rtdetr/rtdetr_dethead.py

@@ -1,83 +0,0 @@
-import torch
-import torch.nn as nn
-
-from .rtdetr_basic import MLP
-
-
-class DetectHead(nn.Module):
-    def __init__(self, cfg, d_model, num_classes, with_box_refine=False):
-        super().__init__()
-        # --------- Basic Parameters ----------
-        self.cfg = cfg
-        self.num_classes = num_classes
-
-        # --------- Network Parameters ----------
-        self.class_embed = nn.ModuleList([nn.Linear(d_model, self.num_classes)])
-        self.bbox_embed = nn.ModuleList([MLP(d_model, d_model, 4, 3)])
-        if with_box_refine:
-            self.class_embed = nn.ModuleList([
-                self.class_embed[0] for _ in range(cfg['num_decoder_layers'])])
-            self.bbox_embed = nn.ModuleList([
-                self.bbox_embed[0] for _ in range(cfg['num_decoder_layers'])])
-
-        self.init_weight()
-
-
-    def init_weight(self):
-        init_prob = 0.01
-        bias_value = -torch.log(torch.tensor((1. - init_prob) / init_prob))
-
-        # cls pred
-        for class_embed in self.class_embed:
-            class_embed.bias.data = torch.ones(self.num_classes) * bias_value
-
-        # box pred
-        for bbox_embed in self.bbox_embed:
-            nn.init.constant_(bbox_embed.layers[-1].weight.data, 0)
-            nn.init.constant_(bbox_embed.layers[-1].bias.data, 0)
-        
-
-    def inverse_sigmoid(self, x):
-        x = x.clamp(min=0, max=1)
-        return torch.log(x.clamp(min=1e-5)/(1 - x).clamp(min=1e-5))
-
-
-    def decode_bbox(self, outputs_coords):
-        ## cxcywh -> xyxy
-        x1y1_pred = outputs_coords[..., :2] - outputs_coords[..., 2:] * 0.5
-        x2y2_pred = outputs_coords[..., :2] + outputs_coords[..., 2:] * 0.5
-        box_pred = torch.cat([x1y1_pred, x2y2_pred], dim=-1)
-        
-        return box_pred
-
-
-    def forward(self, hs, reference, multi_layer=False):
-        if multi_layer:
-            # class embed
-            outputs_class = torch.stack([
-                layer_cls_embed(layer_hs) for layer_cls_embed, layer_hs in zip(self.class_embed, hs)])
-            # bbox embed
-            outputs_coords = []
-            for dec_lid, (layer_ref_sig, layer_bbox_embed, layer_hs) in enumerate(zip(reference[:-1], self.bbox_embed, hs)):
-                layer_delta_unsig = layer_bbox_embed(layer_hs)
-                layer_outputs_unsig = layer_delta_unsig + self.inverse_sigmoid(layer_ref_sig)
-                layer_outputs_unsig = layer_outputs_unsig.sigmoid()
-                outputs_coords.append(layer_outputs_unsig)
-        else:
-            # class embed
-            outputs_class = self.class_embed[-1](hs[-1]) 
-            # bbox embed
-            delta_unsig = self.bbox_embed[-1](hs[-1])
-            ref_sig = reference[-2]
-            ref_sig = self.inverse_sigmoid(ref_sig)
-            outputs_unsig = delta_unsig + ref_sig
-            outputs_coords = outputs_unsig.sigmoid()
-            # decode bbox
-            outputs_coords = self.decode_bbox(outputs_coords)
-
-
-        return outputs_class, outputs_coords
-
-
-def build_dethead(cfg, d_model, num_classes, with_box_refine):
-    return DetectHead(cfg, d_model, num_classes, with_box_refine)
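
Two details of the removed head are worth spelling out: the classification bias is initialized to the focal-loss prior, so every class starts with a sigmoid score of about 0.01, and each predicted box is an offset in logit space added to the inverse-sigmoid of its reference point, re-sigmoided, then converted from (cx, cy, w, h) to (x1, y1, x2, y2). A quick standalone numerical check (constants mirror the deleted code; variable names are illustrative):

import torch

# 1) Focal-style bias prior: with init_prob = 0.01 every class logit starts at
#    -log((1 - 0.01) / 0.01) ≈ -4.595, i.e. an initial score of ~0.01 after sigmoid.
init_prob = 0.01
bias_value = -torch.log(torch.tensor((1. - init_prob) / init_prob))
print(bias_value.item(), torch.sigmoid(bias_value).item())  # ≈ -4.595, ≈ 0.010

# 2) Box refinement and decoding: delta + inverse_sigmoid(reference), sigmoid,
#    then cxcywh -> xyxy.
def inverse_sigmoid(x, eps=1e-5):
    x = x.clamp(0, 1)
    return torch.log(x.clamp(min=eps) / (1 - x).clamp(min=eps))

ref = torch.tensor([[0.50, 0.50, 0.20, 0.20]])     # reference box, normalized cxcywh
delta = torch.tensor([[0.10, 0.00, 0.00, 0.00]])   # predicted offset in logit space
box = torch.sigmoid(delta + inverse_sigmoid(ref))  # refined cxcywh ≈ [0.525, 0.5, 0.2, 0.2]
x1y1 = box[..., :2] - box[..., 2:] * 0.5
x2y2 = box[..., :2] + box[..., 2:] * 0.5
print(torch.cat([x1y1, x2y2], dim=-1))             # ≈ [[0.425, 0.400, 0.625, 0.600]]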

+ 0 - 10
models/detectors/rtdetr/rtdetr_encoder.py

@@ -1,10 +0,0 @@
-from .image_encoder.img_encoder import build_img_encoder
-
-
-# build encoder
-def build_encoder(cfg, trainable=False, en_type='img_encoder'):
-    if en_type == 'img_encoder':
-        return build_img_encoder(cfg, trainable)
-    elif en_type == 'text_encoder':
-        ## TODO: design text encoder
-        return None

+ 0 - 22
train.sh

@@ -16,25 +16,3 @@ python train.py \
         # --resume weights/coco/yolox_m/yolox_m_best.pth \
         # --pretrained weights/coco/yolo_free_medium/yolo_free_medium_39.46.pth \
         # --eval_first
-
-
-# # Train RT-DETR
-# python train.py \
-#         --cuda \
-#         -d voc \
-#         --root /mnt/share/ssd2/dataset/ \
-#         -m rtdetr_n \
-#         -bs 16 \
-#         -size 640 \
-#         --wp_epoch 1 \
-#         --max_epoch 150 \
-#         --eval_epoch 10 \
-#         --ema \
-#         --fp16 \
-#         --multi_scale \
-#         --mosaic 0 \
-#         --mixup 0
-#         # --resume weights/coco/yolox_s/yolox_s_best.pth \
-#         # --pretrained weights/coco/yolo_free_medium/yolo_free_medium_39.46.pth \
-#         # --eval_first
-

+ 1 - 1
train_ddp.sh

@@ -5,7 +5,7 @@ python -m torch.distributed.run --nproc_per_node=8 train.py \
                                                     -dist \
                                                     -d coco \
                                                     --root /data/datasets/ \
-                                                    -m yolovx_t \
+                                                    -m yolovx_n \
                                                     -bs 128 \
                                                     -size 640 \
                                                     --wp_epoch 3 \