2 年之前 · 8b21555292
--- a/config/__init__.py
+++ b/config/__init__.py
@@ -106,8 +106,6 @@ from .model_config.yolov5_config import yolov5_cfg
 
				 from .model_config.yolov7_config import yolov7_cfg
			
 
				 from .model_config.yolovx_config import yolovx_cfg
			
 
				 from .model_config.yolox_config import yolox_cfg
			
 
				-## Real-Time DETR
			
 
				-from .model_config.rtdetr_config import rtdetr_cfg
			
 
				 
			
 
				 
			
 
				 def build_model_config(args):
			
@@ -137,9 +135,6 @@ def build_model_config(args):
 
				     # YOLOvx
			
 
				     elif args.model in ['yolovx_n', 'yolovx_t', 'yolovx_s', 'yolovx_m', 'yolovx_l', 'yolovx_x']:
			
 
				         cfg = yolovx_cfg[args.model]
			
 
				-    # RT-DETR
			
 
				-    elif args.model in ['rtdetr_n', 'rtdetr_s', 'rtdetr_m', 'rtdetr_l', 'rtdetr_x']:
			
 
				-        cfg = rtdetr_cfg[args.model]
			
 
				 
			
 
				     return cfg
			
 
				 
			
--- a/config/model_config/rtdetr_config.py
+++ b/config/model_config/rtdetr_config.py
@@ -1,69 +0,0 @@
 
# RT-DETR model configurations, keyed by model name.
# Only the 'rtdetr_n' variant is defined in this table.


rtdetr_cfg = {
    # P5
    'rtdetr_n': {
        # ---------------- Model config ----------------
        ## ------- Image Encoder -------
        ### CNN-Backbone
        'backbone': 'elannet',
        'pretrained': False,
        'bk_act': 'silu',
        'bk_norm': 'BN',
        'bk_dpw': False,
        'width': 0.25,           # channel multiplier
        'depth': 0.34,           # block-count multiplier
        'stride': [8, 16, 32],  # P3, P4, P5
        'max_stride': 32,
        ### CNN-Neck
        'neck': 'sppf',
        'neck_expand_ratio': 0.5,
        'pooling_size': 5,
        'neck_act': 'silu',
        'neck_norm': 'BN',
        'neck_depthwise': False,
        ### CNN-CSFM
        'fpn': 'yolovx_pafpn',
        'fpn_reduce_layer': 'conv',
        'fpn_downsample_layer': 'conv',
        'fpn_core_block': 'elanblock',
        'fpn_act': 'silu',
        'fpn_norm': 'BN',
        'fpn_depthwise': False,
        ## ------- Memory Decoder -------
        'num_compressed': 300,
        'com_dim_feedforward': 1024,
        'com_num_heads': 8,
        'com_dropout': 0.1,
        'com_act': 'silu',
        ## ------- Transformer Decoder -------
        'd_model': 256,
        'attn_type': 'mhsa',
        'num_decoder_layers': 6,
        'num_queries': 300,
        'de_dim_feedforward': 1024,
        'de_num_heads': 8,
        'de_dropout': 0.1,
        'de_act': 'silu',
        'de_norm': 'LN',
        # ---------------- Train config ----------------
        ## input
        'multi_scale': [0.5, 1.0],   # 320 -> 640
        'trans_type': 'yolov5_nano',
        # ---------------- Assignment config ----------------
        ## matcher
        'set_cost_class': 2.0,
        'set_cost_bbox': 5.0,
        'set_cost_giou': 2.0,
        # ---------------- Loss config ----------------
        ## loss weight
        'focal_alpha': 0.25,
        'loss_cls_weight': 1.0,
        'loss_box_weight': 5.0,
        'loss_giou_weight': 2.0,
        # ---------------- Trainer config ----------------
        'trainer_type': 'detr',
        },

}
			
--- a/models/detectors/__init__.py
+++ b/models/detectors/__init__.py
@@ -12,8 +12,6 @@ from .yolov7.build import build_yolov7
 
				 from .yolovx.build import build_yolovx
			
 
				 # My custom YOLO
			
 
				 from .yolox.build import build_yolox
			
 
				-# Real-time DETR
			
 
				-from .rtdetr.build import build_rtdetr
			
 
				 
			
 
				 
			
 
				 # build object detector
			
@@ -55,10 +53,6 @@ def build_model(args,
 
				     elif args.model in ['yolox_n', 'yolox_s', 'yolox_m', 'yolox_l', 'yolox_x']:
			
 
				         model, criterion = build_yolox(
			
 
				             args, model_cfg, device, num_classes, trainable, deploy)
			
 
				-    # RT-DETR
			
 
				-    elif args.model in ['rtdetr_n', 'rtdetr_s', 'rtdetr_m', 'rtdetr_l', 'rtdetr_x']:
			
 
				-        model, criterion = build_rtdetr(
			
 
				-            args, model_cfg, device, num_classes, trainable, deploy)
			
 
				 
			
 
				 
			
 
				     if trainable:
			
--- a/models/detectors/rtdetr/README.md
+++ b/models/detectors/rtdetr/README.md
@@ -1,8 +0,0 @@
 
				-# Redesigned RT-DETR:
			
 
				-
			
 
				-| Model     | Scale | Batch | AP<sup>val<br>0.5:0.95 | AP<sup>val<br>0.5 | FLOPs<br><sup>(G) | Params<br><sup>(M) | Weight |
			
 
				-|-----------|-------|-------|------------------------|-------------------|-------------------|--------------------|--------|
			
 
				-| RT-DETR-N |  640  |       |                        |                   |                   |                    |  |
			
 
				-| RT-DETR-S |  640  |       |                        |                   |                   |                    |  |
			
 
				-| RT-DETR-M |  640  |       |                        |                   |                   |                    |  |
			
 
				-| RT-DETR-L |  640  |       |                        |                   |                   |                    |  |
			
--- a/models/detectors/rtdetr/build.py
+++ b/models/detectors/rtdetr/build.py
@@ -1,33 +0,0 @@
 
				-#!/usr/bin/env python3
			
 
				-# -*- coding:utf-8 -*-
			
 
				-
			
 
				-from .loss import build_criterion
			
 
				-from .rtdetr import RTDETR
			
 
				-
			
 
				-
			
 
				-# build object detector
			
 
				-def build_rtdetr(args, cfg, device, num_classes=80, trainable=False, deploy=False):
			
 
				-    print('==============================')
			
 
				-    print('Build {} ...'.format(args.model.upper()))
			
 
				-    
			
 
				-    print('==============================')
			
 
				-    print('Model Configuration: \n', cfg)
			
 
				-    
			
 
				-    # -------------- Build rtdetr --------------
			
 
				-    model = RTDETR(
			
 
				-        cfg=cfg,
			
 
				-        device=device, 
			
 
				-        num_classes=num_classes,
			
 
				-        trainable=trainable,
			
 
				-        aux_loss=trainable,
			
 
				-        with_box_refine=True,
			
 
				-        deploy=deploy
			
 
				-        )
			
 
				-
			
 
				-    # -------------- Build criterion --------------
			
 
				-    criterion = None
			
 
				-    if trainable:
			
 
				-        # build criterion for training
			
 
				-        criterion = build_criterion(cfg, num_classes, aux_loss=True)
			
 
				-        
			
 
				-    return model, criterion
			
--- a/models/detectors/rtdetr/image_encoder/cnn_backbone.py
+++ b/models/detectors/rtdetr/image_encoder/cnn_backbone.py
@@ -1,154 +0,0 @@
 
				-import torch
			
 
				-import torch.nn as nn
			
 
				-try:
			
 
				-    from .cnn_basic import Conv, ELANBlock, DownSample
			
 
				-except:
			
 
				-    from cnn_basic import Conv, ELANBlock, DownSample
			
 
				-
			
 
				-
			
 
				-
			
 
# Download URLs of the pretrained ELANNet checkpoints, keyed by the
# model name that load_weight() looks up.
model_urls = {
    'elannet_pico': "https://github.com/yjh0410/image_classification_pytorch/releases/download/weight/elannet_pico.pth",
    'elannet_nano': "https://github.com/yjh0410/image_classification_pytorch/releases/download/weight/elannet_nano.pth",
    'elannet_small': "https://github.com/yjh0410/image_classification_pytorch/releases/download/weight/elannet_small.pth",
    'elannet_medium': "https://github.com/yjh0410/image_classification_pytorch/releases/download/weight/elannet_medium.pth",
    'elannet_large': "https://github.com/yjh0410/image_classification_pytorch/releases/download/weight/elannet_large.pth",
    'elannet_huge': "https://github.com/yjh0410/image_classification_pytorch/releases/download/weight/elannet_huge.pth",
}
			
 
				-
			
 
				-
			
 
				-# ---------------------------- Backbones ----------------------------
			
 
				-# ELANNet-P5
			
 
				-class ELANNet(nn.Module):
			
 
				-    def __init__(self, width=1.0, depth=1.0, act_type='silu', norm_type='BN', depthwise=False):
			
 
				-        super(ELANNet, self).__init__()
			
 
				-        self.feat_dims = [int(512 * width), int(1024 * width), int(1024 * width)]
			
 
				-        
			
 
				-        # P1/2
			
 
				-        self.layer_1 = nn.Sequential(
			
 
				-            Conv(3, int(64*width), k=3, p=1, s=2, act_type=act_type, norm_type=norm_type),
			
 
				-            Conv(int(64*width), int(64*width), k=3, p=1, act_type=act_type, norm_type=norm_type, depthwise=depthwise)
			
 
				-        )
			
 
				-        # P2/4
			
 
				-        self.layer_2 = nn.Sequential(   
			
 
				-            Conv(int(64*width), int(128*width), k=3, p=1, s=2, act_type=act_type, norm_type=norm_type, depthwise=depthwise),             
			
 
				-            ELANBlock(in_dim=int(128*width), out_dim=int(256*width), expand_ratio=0.5, depth=depth,
			
 
				-                      act_type=act_type, norm_type=norm_type, depthwise=depthwise)
			
 
				-        )
			
 
				-        # P3/8
			
 
				-        self.layer_3 = nn.Sequential(
			
 
				-            DownSample(in_dim=int(256*width), out_dim=int(256*width), act_type=act_type, norm_type=norm_type),             
			
 
				-            ELANBlock(in_dim=int(256*width), out_dim=int(512*width), expand_ratio=0.5, depth=depth,
			
 
				-                      act_type=act_type, norm_type=norm_type, depthwise=depthwise)
			
 
				-        )
			
 
				-        # P4/16
			
 
				-        self.layer_4 = nn.Sequential(
			
 
				-            DownSample(in_dim=int(512*width), out_dim=int(512*width), act_type=act_type, norm_type=norm_type),             
			
 
				-            ELANBlock(in_dim=int(512*width), out_dim=int(1024*width), expand_ratio=0.5, depth=depth,
			
 
				-                      act_type=act_type, norm_type=norm_type, depthwise=depthwise)
			
 
				-        )
			
 
				-        # P5/32
			
 
				-        self.layer_5 = nn.Sequential(
			
 
				-            DownSample(in_dim=int(1024*width), out_dim=int(1024*width), act_type=act_type, norm_type=norm_type),             
			
 
				-            ELANBlock(in_dim=int(1024*width), out_dim=int(1024*width), expand_ratio=0.25, depth=depth,
			
 
				-                    act_type=act_type, norm_type=norm_type, depthwise=depthwise)
			
 
				-        )
			
 
				-
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        c1 = self.layer_1(x)
			
 
				-        c2 = self.layer_2(c1)
			
 
				-        c3 = self.layer_3(c2)
			
 
				-        c4 = self.layer_4(c3)
			
 
				-        c5 = self.layer_5(c4)
			
 
				-
			
 
				-        outputs = [c3, c4, c5]
			
 
				-
			
 
				-        return outputs
			
 
				-
			
 
				-
			
 
				-# ---------------------------- Functions ----------------------------
			
 
				-## load pretrained weight
			
 
				-def load_weight(model, model_name):
			
 
				-    # load weight
			
 
				-    print('Loading pretrained weight ...')
			
 
				-    url = model_urls[model_name]
			
 
				-    if url is not None:
			
 
				-        checkpoint = torch.hub.load_state_dict_from_url(
			
 
				-            url=url, map_location="cpu", check_hash=True)
			
 
				-        # checkpoint state dict
			
 
				-        checkpoint_state_dict = checkpoint.pop("model")
			
 
				-        # model state dict
			
 
				-        model_state_dict = model.state_dict()
			
 
				-        # check
			
 
				-        for k in list(checkpoint_state_dict.keys()):
			
 
				-            if k in model_state_dict:
			
 
				-                shape_model = tuple(model_state_dict[k].shape)
			
 
				-                shape_checkpoint = tuple(checkpoint_state_dict[k].shape)
			
 
				-                if shape_model != shape_checkpoint:
			
 
				-                    checkpoint_state_dict.pop(k)
			
 
				-            else:
			
 
				-                checkpoint_state_dict.pop(k)
			
 
				-                print(k)
			
 
				-
			
 
				-        model.load_state_dict(checkpoint_state_dict)
			
 
				-    else:
			
 
				-        print('No pretrained for {}'.format(model_name))
			
 
				-
			
 
				-    return model
			
 
				-
			
 
				-
			
 
				-## build ELAN-Net
			
 
				-def build_backbone(cfg, pretrained=False): 
			
 
				-    # model
			
 
				-    backbone = ELANNet(
			
 
				-        width=cfg['width'],
			
 
				-        depth=cfg['depth'],
			
 
				-        act_type=cfg['bk_act'],
			
 
				-        norm_type=cfg['bk_norm'],
			
 
				-        depthwise=cfg['bk_dpw']
			
 
				-        )
			
 
				-    # check whether to load imagenet pretrained weight
			
 
				-    if pretrained:
			
 
				-        if cfg['width'] == 0.25 and cfg['depth'] == 0.34 and cfg['bk_dpw']:
			
 
				-            backbone = load_weight(backbone, model_name='elannet_pico')
			
 
				-        elif cfg['width'] == 0.25 and cfg['depth'] == 0.34:
			
 
				-            backbone = load_weight(backbone, model_name='elannet_nano')
			
 
				-        elif cfg['width'] == 0.5 and cfg['depth'] == 0.34:
			
 
				-            backbone = load_weight(backbone, model_name='elannet_small')
			
 
				-        elif cfg['width'] == 0.75 and cfg['depth'] == 0.67:
			
 
				-            backbone = load_weight(backbone, model_name='elannet_medium')
			
 
				-        elif cfg['width'] == 1.0 and cfg['depth'] == 1.0:
			
 
				-            backbone = load_weight(backbone, model_name='elannet_large')
			
 
				-        elif cfg['width'] == 1.25 and cfg['depth'] == 1.34:
			
 
				-            backbone = load_weight(backbone, model_name='elannet_huge')
			
 
				-    feat_dims = backbone.feat_dims
			
 
				-
			
 
				-    return backbone, feat_dims
			
 
				-
			
 
				-
			
 
				-if __name__ == '__main__':
			
 
				-    import time
			
 
				-    from thop import profile
			
 
				-    cfg = {
			
 
				-        'pretrained': True,
			
 
				-        'bk_act': 'silu',
			
 
				-        'bk_norm': 'BN',
			
 
				-        'bk_dpw': True,
			
 
				-        'width': 0.25,
			
 
				-        'depth': 0.34,
			
 
				-    }
			
 
				-    model, feats = build_backbone(cfg)
			
 
				-    x = torch.randn(1, 3, 640, 640)
			
 
				-    t0 = time.time()
			
 
				-    outputs = model(x)
			
 
				-    t1 = time.time()
			
 
				-    print('Time: ', t1 - t0)
			
 
				-    for out in outputs:
			
 
				-        print(out.shape)
			
 
				-
			
 
				-    print('==============================')
			
 
				-    flops, params = profile(model, inputs=(x, ), verbose=False)
			
 
				-    print('==============================')
			
 
				-    print('GFLOPs : {:.2f}'.format(flops / 1e9 * 2))
			
 
				-    print('Params : {:.2f} M'.format(params / 1e6))
			
--- a/models/detectors/rtdetr/image_encoder/cnn_basic.py
+++ b/models/detectors/rtdetr/image_encoder/cnn_basic.py
@@ -1,174 +0,0 @@
 
				-import torch
			
 
				-import torch.nn as nn
			
 
				-
			
 
				-
			
 
				-# ------------------------------- Basic Modules -------------------------------
			
 
				-class SiLU(nn.Module):
			
 
				-    """export-friendly version of nn.SiLU()"""
			
 
				-
			
 
				-    @staticmethod
			
 
				-    def forward(x):
			
 
				-        return x * torch.sigmoid(x)
			
 
				-
			
 
				-
			
 
				-def get_conv2d(c1, c2, k, p, s, d, g, bias=False):
			
 
				-    conv = nn.Conv2d(c1, c2, k, stride=s, padding=p, dilation=d, groups=g, bias=bias)
			
 
				-
			
 
				-    return conv
			
 
				-
			
 
				-
			
 
				-def get_activation(act_type=None):
			
 
				-    if act_type == 'relu':
			
 
				-        return nn.ReLU(inplace=True)
			
 
				-    elif act_type == 'gelu':
			
 
				-        return nn.GELU()
			
 
				-    elif act_type == 'lrelu':
			
 
				-        return nn.LeakyReLU(0.1, inplace=True)
			
 
				-    elif act_type == 'mish':
			
 
				-        return nn.Mish(inplace=True)
			
 
				-    elif act_type == 'silu':
			
 
				-        return nn.SiLU(inplace=True)
			
 
				-
			
 
				-
			
 
				-def get_norm(norm_type, dim):
			
 
				-    if norm_type == 'BN':
			
 
				-        return nn.BatchNorm2d(dim)
			
 
				-    elif norm_type == 'GN':
			
 
				-        return nn.GroupNorm(num_groups=32, num_channels=dim)
			
 
				-    elif norm_type == 'LN':
			
 
				-        return nn.LayerNorm(dim)
			
 
				-
			
 
				-
			
 
				-# ------------------------------- Conv -------------------------------
			
 
				-class Conv(nn.Module):
			
 
				-    def __init__(self, 
			
 
				-                 c1,                   # in channels
			
 
				-                 c2,                   # out channels 
			
 
				-                 k=1,                  # kernel size 
			
 
				-                 p=0,                  # padding
			
 
				-                 s=1,                  # padding
			
 
				-                 d=1,                  # dilation
			
 
				-                 act_type='relu',      # activation
			
 
				-                 norm_type='BN',       # normalization
			
 
				-                 depthwise=False):
			
 
				-        super(Conv, self).__init__()
			
 
				-        convs = []
			
 
				-        add_bias = False if norm_type else True
			
 
				-        if depthwise:
			
 
				-            convs.append(get_conv2d(c1, c1, k=k, p=p, s=s, d=d, g=c1, bias=add_bias))
			
 
				-            # depthwise conv
			
 
				-            if norm_type:
			
 
				-                convs.append(get_norm(norm_type, c1))
			
 
				-            if act_type:
			
 
				-                convs.append(get_activation(act_type))
			
 
				-            # pointwise conv
			
 
				-            convs.append(get_conv2d(c1, c2, k=1, p=0, s=1, d=d, g=1, bias=add_bias))
			
 
				-            if norm_type:
			
 
				-                convs.append(get_norm(norm_type, c2))
			
 
				-            if act_type:
			
 
				-                convs.append(get_activation(act_type))
			
 
				-        else:
			
 
				-            convs.append(get_conv2d(c1, c2, k=k, p=p, s=s, d=d, g=1, bias=add_bias))
			
 
				-            if norm_type:
			
 
				-                convs.append(get_norm(norm_type, c2))
			
 
				-            if act_type:
			
 
				-                convs.append(get_activation(act_type))
			
 
				-            
			
 
				-        self.convs = nn.Sequential(*convs)
			
 
				-
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        return self.convs(x)
			
 
				-
			
 
				-
			
 
				-# ---------------------------- Modified YOLOv7's Modules ----------------------------
			
 
				-## ELANBlock
			
 
				-class ELANBlock(nn.Module):
			
 
				-    def __init__(self, in_dim, out_dim, expand_ratio=0.5, depth=1.0, act_type='silu', norm_type='BN', depthwise=False):
			
 
				-        super(ELANBlock, self).__init__()
			
 
				-        if isinstance(expand_ratio, float):
			
 
				-            inter_dim = int(in_dim * expand_ratio)
			
 
				-            inter_dim2 = inter_dim
			
 
				-        elif isinstance(expand_ratio, list):
			
 
				-            assert len(expand_ratio) == 2
			
 
				-            e1, e2 = expand_ratio
			
 
				-            inter_dim = int(in_dim * e1)
			
 
				-            inter_dim2 = int(inter_dim * e2)
			
 
				-        # branch-1
			
 
				-        self.cv1 = Conv(in_dim, inter_dim, k=1, act_type=act_type, norm_type=norm_type)
			
 
				-        # branch-2
			
 
				-        self.cv2 = Conv(in_dim, inter_dim, k=1, act_type=act_type, norm_type=norm_type)
			
 
				-        # branch-3
			
 
				-        for idx in range(round(3*depth)):
			
 
				-            if idx == 0:
			
 
				-                cv3 = [Conv(inter_dim, inter_dim2, k=3, p=1, act_type=act_type, norm_type=norm_type, depthwise=depthwise)]
			
 
				-            else:
			
 
				-                cv3.append(Conv(inter_dim2, inter_dim2, k=3, p=1, act_type=act_type, norm_type=norm_type, depthwise=depthwise))
			
 
				-        self.cv3 = nn.Sequential(*cv3)
			
 
				-        # branch-4
			
 
				-        self.cv4 = nn.Sequential(*[
			
 
				-            Conv(inter_dim2, inter_dim2, k=3, p=1, act_type=act_type, norm_type=norm_type, depthwise=depthwise)
			
 
				-            for _ in range(round(3*depth))
			
 
				-        ])
			
 
				-        # output
			
 
				-        self.out = Conv(inter_dim*2 + inter_dim2*2, out_dim, k=1, act_type=act_type, norm_type=norm_type)
			
 
				-
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        x1 = self.cv1(x)
			
 
				-        x2 = self.cv2(x)
			
 
				-        x3 = self.cv3(x2)
			
 
				-        x4 = self.cv4(x3)
			
 
				-
			
 
				-        out = self.out(torch.cat([x1, x2, x3, x4], dim=1))
			
 
				-
			
 
				-        return out
			
 
				-
			
 
				-## DownSample
			
 
				-class DownSample(nn.Module):
			
 
				-    def __init__(self, in_dim, out_dim, act_type='silu', norm_type='BN', depthwise=False):
			
 
				-        super().__init__()
			
 
				-        inter_dim = out_dim // 2
			
 
				-        self.mp = nn.MaxPool2d((2, 2), 2)
			
 
				-        self.cv1 = Conv(in_dim, inter_dim, k=1, act_type=act_type, norm_type=norm_type)
			
 
				-        self.cv2 = nn.Sequential(
			
 
				-            Conv(in_dim, inter_dim, k=1, act_type=act_type, norm_type=norm_type),
			
 
				-            Conv(inter_dim, inter_dim, k=3, p=1, s=2, act_type=act_type, norm_type=norm_type, depthwise=depthwise)
			
 
				-        )
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        x1 = self.cv1(self.mp(x))
			
 
				-        x2 = self.cv2(x)
			
 
				-        out = torch.cat([x1, x2], dim=1)
			
 
				-
			
 
				-        return out
			
 
				-
			
 
				-
			
 
				-## build core block for CSFM
			
 
				-def build_fpn_block(cfg, in_dim, out_dim):
			
 
				-    if cfg['fpn_core_block'] == 'elanblock':
			
 
				-        layer = ELANBlock(in_dim=in_dim,
			
 
				-                          out_dim=out_dim,
			
 
				-                          expand_ratio=[0.5, 0.5],
			
 
				-                          depth=cfg['depth'],
			
 
				-                          act_type=cfg['fpn_act'],
			
 
				-                          norm_type=cfg['fpn_norm'],
			
 
				-                          depthwise=cfg['fpn_depthwise']
			
 
				-                          )
			
 
				-        
			
 
				-    return layer
			
 
				-
			
 
				-## build reduce layer for CSFM
			
 
				-def build_reduce_layer(cfg, in_dim, out_dim):
			
 
				-    layer = Conv(in_dim, out_dim, k=1,
			
 
				-                 act_type=cfg['fpn_act'], norm_type=cfg['fpn_norm'])
			
 
				-        
			
 
				-    return layer
			
 
				-
			
 
				-## build downsample layer for CSFM
			
 
				-def build_downsample_layer(cfg, in_dim, out_dim):
			
 
				-    if cfg['fpn_downsample_layer'] == 'conv':
			
 
				-        layer = Conv(in_dim, out_dim, k=3, s=2, p=1,
			
 
				-                     act_type=cfg['fpn_act'], norm_type=cfg['fpn_norm'])
			
 
				-        
			
 
				-    return layer
			
--- a/models/detectors/rtdetr/image_encoder/cnn_neck.py
+++ b/models/detectors/rtdetr/image_encoder/cnn_neck.py
@@ -1,70 +0,0 @@
 
				-import torch
			
 
				-import torch.nn as nn
			
 
				-from .cnn_basic import Conv
			
 
				-
			
 
				-
			
 
				-# Spatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher
			
 
				-class SPPF(nn.Module):
			
 
				-    """
			
 
				-        This code referenced to https://github.com/ultralytics/yolov5
			
 
				-    """
			
 
				-    def __init__(self, cfg, in_dim, out_dim, expand_ratio=0.5):
			
 
				-        super().__init__()
			
 
				-        inter_dim = int(in_dim * expand_ratio)
			
 
				-        self.out_dim = out_dim
			
 
				-        self.cv1 = Conv(in_dim, inter_dim, k=1, act_type=cfg['neck_act'], norm_type=cfg['neck_norm'])
			
 
				-        self.cv2 = Conv(inter_dim * 4, out_dim, k=1, act_type=cfg['neck_act'], norm_type=cfg['neck_norm'])
			
 
				-        self.m = nn.MaxPool2d(kernel_size=cfg['pooling_size'], stride=1, padding=cfg['pooling_size'] // 2)
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        x = self.cv1(x)
			
 
				-        y1 = self.m(x)
			
 
				-        y2 = self.m(y1)
			
 
				-
			
 
				-        return self.cv2(torch.cat((x, y1, y2, self.m(y2)), 1))
			
 
				-
			
 
				-
			
 
				-# SPPF block with CSP module
			
 
				-class SPPFBlockCSP(nn.Module):
			
 
				-    """
			
 
				-        CSP Spatial Pyramid Pooling Block
			
 
				-    """
			
 
				-    def __init__(self, cfg, in_dim, out_dim, expand_ratio):
			
 
				-        super(SPPFBlockCSP, self).__init__()
			
 
				-        inter_dim = int(in_dim * expand_ratio)
			
 
				-        self.out_dim = out_dim
			
 
				-        self.cv1 = Conv(in_dim, inter_dim, k=1, act_type=cfg['neck_act'], norm_type=cfg['neck_norm'])
			
 
				-        self.cv2 = Conv(in_dim, inter_dim, k=1, act_type=cfg['neck_act'], norm_type=cfg['neck_norm'])
			
 
				-        self.m = nn.Sequential(
			
 
				-            Conv(inter_dim, inter_dim, k=3, p=1, 
			
 
				-                 act_type=cfg['neck_act'], norm_type=cfg['neck_norm'], 
			
 
				-                 depthwise=cfg['neck_depthwise']),
			
 
				-            SPPF(cfg, inter_dim, inter_dim, expand_ratio=1.0),
			
 
				-            Conv(inter_dim, inter_dim, k=3, p=1, 
			
 
				-                 act_type=cfg['neck_act'], norm_type=cfg['neck_norm'], 
			
 
				-                 depthwise=cfg['neck_depthwise'])
			
 
				-        )
			
 
				-        self.cv3 = Conv(inter_dim * 2, self.out_dim, k=1, act_type=cfg['neck_act'], norm_type=cfg['neck_norm'])
			
 
				-
			
 
				-        
			
 
				-    def forward(self, x):
			
 
				-        x1 = self.cv1(x)
			
 
				-        x2 = self.cv2(x)
			
 
				-        x3 = self.m(x2)
			
 
				-        y = self.cv3(torch.cat([x1, x3], dim=1))
			
 
				-
			
 
				-        return y
			
 
				-
			
 
				-
			
 
				-def build_neck(cfg, in_dim, out_dim):
			
 
				-    model = cfg['neck']
			
 
				-    print('==============================')
			
 
				-    print('Neck: {}'.format(model))
			
 
				-    # build neck
			
 
				-    if model == 'sppf':
			
 
				-        neck = SPPF(cfg, in_dim, out_dim, cfg['neck_expand_ratio'])
			
 
				-    elif model == 'csp_sppf':
			
 
				-        neck = SPPFBlockCSP(cfg, in_dim, out_dim, cfg['neck_expand_ratio'])
			
 
				-
			
 
				-    return neck
			
 
				-        
			
--- a/models/detectors/rtdetr/image_encoder/cnn_pafpn.py
+++ b/models/detectors/rtdetr/image_encoder/cnn_pafpn.py
@@ -1,98 +0,0 @@
 
				-import torch
			
 
				-import torch.nn as nn
			
 
				-import torch.nn.functional as F
			
 
				-
			
 
				-from .cnn_basic import (Conv, build_reduce_layer, build_downsample_layer, build_fpn_block)
			
 
				-
			
 
				-
			
 
				-# YOLO-Style PaFPN
			
 
				-class YolovxPaFPN(nn.Module):
			
 
				-    def __init__(self, cfg, in_dims=[512, 1024, 1024], out_dim=None, input_proj=False):
			
 
				-        super(YolovxPaFPN, self).__init__()
			
 
				-        # --------------------------- Basic Parameters ---------------------------
			
 
				-        self.in_dims = in_dims
			
 
				-        if input_proj:
			
 
				-            self.fpn_dims = [round(256*cfg['width']), round(512*cfg['width']), round(1024*cfg['width'])]
			
 
				-        else:
			
 
				-            self.fpn_dims = in_dims
			
 
				-
			
 
				-        # --------------------------- Input proj ---------------------------
			
 
				-        self.input_projs = nn.ModuleList([nn.Conv2d(in_dim, fpn_dim, kernel_size=1)
			
 
				-                                          for in_dim, fpn_dim in zip(in_dims, self.fpn_dims)])
			
 
				-        
			
 
				-        # --------------------------- Top-down FPN---------------------------
			
 
				-        ## P5 -> P4
			
 
				-        self.reduce_layer_1 = build_reduce_layer(cfg, self.fpn_dims[2], self.fpn_dims[2]//2)
			
 
				-        self.top_down_layer_1 = build_fpn_block(cfg, self.fpn_dims[1] + self.fpn_dims[2]//2, self.fpn_dims[1])
			
 
				-
			
 
				-        ## P4 -> P3
			
 
				-        self.reduce_layer_2 = build_reduce_layer(cfg, self.fpn_dims[1], self.fpn_dims[1]//2)
			
 
				-        self.top_down_layer_2 = build_fpn_block(cfg, self.fpn_dims[0] + self.fpn_dims[1]//2, self.fpn_dims[0])
			
 
				-
			
 
				-        # --------------------------- Bottom-up FPN ---------------------------
			
 
				-        ## P3 -> P4
			
 
				-        self.downsample_layer_1 = build_downsample_layer(cfg, self.fpn_dims[0], self.fpn_dims[0])
			
 
				-        self.bottom_up_layer_1 = build_fpn_block(cfg, self.fpn_dims[0] + self.fpn_dims[1]//2, self.fpn_dims[1])
			
 
				-
			
 
				-        ## P4 -> P5
			
 
				-        self.downsample_layer_2 = build_downsample_layer(cfg, self.fpn_dims[1], self.fpn_dims[1])
			
 
				-        self.bottom_up_layer_2 = build_fpn_block(cfg, self.fpn_dims[1] + self.fpn_dims[2]//2, self.fpn_dims[2])
			
 
				-                
			
 
				-        # --------------------------- Output proj ---------------------------
			
 
				-        if out_dim is not None:
			
 
				-            self.out_layers = nn.ModuleList([
			
 
				-                Conv(in_dim, out_dim, k=1,
			
 
				-                     act_type=cfg['fpn_act'], norm_type=cfg['fpn_norm'])
			
 
				-                     for in_dim in self.fpn_dims
			
 
				-                     ])
			
 
				-            self.out_dim = [out_dim] * 3
			
 
				-        else:
			
 
				-            self.out_layers = None
			
 
				-            self.out_dim = self.fpn_dims
			
 
				-
			
 
				-
			
 
				-    def forward(self, features):
			
 
				-        fpn_feats = [layer(feat) for feat, layer in zip(features, self.input_projs)]
			
 
				-        c3, c4, c5 = fpn_feats
			
 
				-
			
 
				-        # Top down
			
 
				-        ## P5 -> P4
			
 
				-        c6 = self.reduce_layer_1(c5)
			
 
				-        c7 = F.interpolate(c6, scale_factor=2.0)
			
 
				-        c8 = torch.cat([c7, c4], dim=1)
			
 
				-        c9 = self.top_down_layer_1(c8)
			
 
				-        ## P4 -> P3
			
 
				-        c10 = self.reduce_layer_2(c9)
			
 
				-        c11 = F.interpolate(c10, scale_factor=2.0)
			
 
				-        c12 = torch.cat([c11, c3], dim=1)
			
 
				-        c13 = self.top_down_layer_2(c12)
			
 
				-
			
 
				-        # Bottom up
			
 
				-        ## p3 -> P4
			
 
				-        c14 = self.downsample_layer_1(c13)
			
 
				-        c15 = torch.cat([c14, c10], dim=1)
			
 
				-        c16 = self.bottom_up_layer_1(c15)
			
 
				-        ## P4 -> P5
			
 
				-        c17 = self.downsample_layer_2(c16)
			
 
				-        c18 = torch.cat([c17, c6], dim=1)
			
 
				-        c19 = self.bottom_up_layer_2(c18)
			
 
				-
			
 
				-        out_feats = [c13, c16, c19] # [P3, P4, P5]
			
 
				-        
			
 
				-        # output proj layers
			
 
				-        if self.out_layers is not None:
			
 
				-            out_feats_proj = []
			
 
				-            for feat, layer in zip(out_feats, self.out_layers):
			
 
				-                out_feats_proj.append(layer(feat))
			
 
				-            return out_feats_proj
			
 
				-
			
 
				-        return out_feats
			
 
				-
			
 
				-
			
 
				-def build_fpn(cfg, in_dims, out_dim=None, input_proj=False):
			
 
				-    model = cfg['fpn']
			
 
				-    # build pafpn
			
 
				-    if model == 'yolovx_pafpn':
			
 
				-        fpn_net = YolovxPaFPN(cfg, in_dims, out_dim, input_proj)
			
 
				-
			
 
				-    return fpn_net
			
--- a/models/detectors/rtdetr/image_encoder/img_encoder.py
+++ b/models/detectors/rtdetr/image_encoder/img_encoder.py
@@ -1,39 +0,0 @@
 
				-import torch
			
 
				-import torch.nn as nn
			
 
				-
			
 
				-from .cnn_backbone import build_backbone
			
 
				-from .cnn_neck import build_neck
			
 
				-from .cnn_pafpn import build_fpn
			
 
				-
			
 
				-
			
 
				-# ------------------------ Image Encoder ------------------------
			
 
				-class ImageEncoder(nn.Module):
			
 
				-    def __init__(self, cfg, trainable=False) -> None:
			
 
				-        super().__init__()
			
 
				-        ## Backbone
			
 
				-        self.backbone, feats_dim = build_backbone(cfg, cfg['pretrained']*trainable)
			
 
				-
			
 
				-        ## Encoder
			
 
				-        self.encoder = build_neck(cfg, feats_dim[-1], feats_dim[-1])
			
 
				-
			
 
				-        ## CSFM
			
 
				-        self.csfm = build_fpn(cfg=cfg, in_dims=feats_dim, out_dim=round(cfg['d_model']*cfg['width']), input_proj=True)
			
 
				-
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        # Backbone
			
 
				-        pyramid_feats = self.backbone(x)
			
 
				-
			
 
				-        # Encoder
			
 
				-        pyramid_feats[-1] = self.encoder(pyramid_feats[-1])
			
 
				-
			
 
				-        # CSFM
			
 
				-        pyramid_feats = self.csfm(pyramid_feats)
			
 
				-
			
 
				-        return pyramid_feats
			
 
				-
			
 
				-
			
 
				-# build img-encoder
			
 
				-def build_img_encoder(cfg, trainable):
			
 
				-    return ImageEncoder(cfg, trainable)
			
 
				-
			
--- a/models/detectors/rtdetr/loss.py
+++ b/models/detectors/rtdetr/loss.py
@@ -1,171 +0,0 @@
 
				-import torch
			
 
				-import torch.nn as nn
			
 
				-import torch.nn.functional as F
			
 
				-import copy
			
 
				-
			
 
				-from .matcher import build_matcher
			
 
				-from utils.misc import sigmoid_focal_loss
			
 
				-from utils.box_ops import box_cxcywh_to_xyxy, generalized_box_iou
			
 
				-from utils.distributed_utils import is_dist_avail_and_initialized, get_world_size
			
 
				-
			
 
				-
			
 
				-class Criterion(nn.Module):
			
 
				-    """ This class computes the loss for DETR.
			
 
				-    The process happens in two steps:
			
 
				-        1) we compute hungarian assignment between ground truth boxes and the outputs of the model
			
 
				-        2) we supervise each pair of matched ground-truth / prediction (supervise class and box)
			
 
				-    """
			
 
				-    def __init__(self, num_classes, matcher, weight_dict, losses, focal_alpha=0.25):
			
 
				-        """ Create the criterion.
			
 
				-        Parameters:
			
 
				-            num_classes: number of object categories, omitting the special no-object category
			
 
				-            matcher: module able to compute a matching between targets and proposals
			
 
				-            weight_dict: dict containing as key the names of the losses and as values their relative weight.
			
 
				-            eos_coef: relative classification weight applied to the no-object category
			
 
				-            losses: list of all the losses to be applied. See get_loss for list of available losses.
			
 
				-        """
			
 
				-        super().__init__()
			
 
				-        self.num_classes = num_classes
			
 
				-        self.matcher = matcher
			
 
				-        self.weight_dict = weight_dict
			
 
				-        self.losses = losses
			
 
				-        self.focal_alpha = focal_alpha
			
 
				-
			
 
				-
			
 
				-    def _get_src_permutation_idx(self, indices):
			
 
				-        # permute predictions following indices
			
 
				-        batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
			
 
				-        src_idx = torch.cat([src for (src, _) in indices])
			
 
				-        return batch_idx, src_idx
			
 
				-
			
 
				-
			
 
				-    def _get_tgt_permutation_idx(self, indices):
			
 
				-        # permute targets following indices
			
 
				-        batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
			
 
				-        tgt_idx = torch.cat([tgt for (_, tgt) in indices])
			
 
				-        return batch_idx, tgt_idx
			
 
				-
			
 
				-
			
 
				-    def loss_labels(self, outputs, targets, indices, num_boxes):
			
 
				-        """Classification loss (NLL)
			
 
				-        targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes]
			
 
				-        """
			
 
				-        assert 'pred_logits' in outputs
			
 
				-        src_logits = outputs['pred_logits']
			
 
				-
			
 
				-        idx = self._get_src_permutation_idx(indices)
			
 
				-        target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]).to(src_logits.device)
			
 
				-        target_classes = torch.full(src_logits.shape[:2], self.num_classes,
			
 
				-                                    dtype=torch.int64, device=src_logits.device)
			
 
				-        target_classes[idx] = target_classes_o
			
 
				-
			
 
				-        target_classes_onehot = torch.zeros([src_logits.shape[0], src_logits.shape[1], src_logits.shape[2] + 1],
			
 
				-                                            dtype=src_logits.dtype, layout=src_logits.layout, device=src_logits.device)
			
 
				-        target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1)
			
 
				-
			
 
				-        target_classes_onehot = target_classes_onehot[:, :, :-1]
			
 
				-        loss_cls = sigmoid_focal_loss(src_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) * \
			
 
				-                  src_logits.shape[1]
			
 
				-        losses = {'loss_cls': loss_cls}
			
 
				-
			
 
				-        return losses
			
 
				-
			
 
				-
			
 
				-    def loss_boxes(self, outputs, targets, indices, num_boxes):
			
 
				-        """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss
			
 
				-           targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
			
 
				-           The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
			
 
				-        """
			
 
				-        assert 'pred_boxes' in outputs
			
 
				-        idx = self._get_src_permutation_idx(indices)
			
 
				-        src_boxes = outputs['pred_boxes'][idx]
			
 
				-        target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0).to(src_boxes.device)
			
 
				-
			
 
				-        loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none')
			
 
				-
			
 
				-        losses = {}
			
 
				-        losses['loss_bbox'] = loss_bbox.sum() / num_boxes
			
 
				-
			
 
				-        loss_giou = 1 - torch.diag(generalized_box_iou(
			
 
				-            box_cxcywh_to_xyxy(src_boxes),
			
 
				-            box_cxcywh_to_xyxy(target_boxes)))
			
 
				-        losses['loss_giou'] = loss_giou.sum() / num_boxes
			
 
				-        return losses
			
 
				-
			
 
				-
			
 
				-    def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs):
			
 
				-        loss_map = {
			
 
				-            'labels': self.loss_labels,
			
 
				-            'boxes': self.loss_boxes,
			
 
				-        }
			
 
				-        assert loss in loss_map, f'do you really want to compute {loss} loss?'
			
 
				-        return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs)
			
 
				-
			
 
				-
			
 
				-    def forward(self, outputs, targets, epoch=0):
			
 
				-        """ This performs the loss computation.
			
 
				-        Parameters:
			
 
				-             outputs: dict of tensors, see the output specification of the model for the format
			
 
				-             targets: list of dicts, such that len(targets) == batch_size.
			
 
				-                      The expected keys in each dict depends on the losses applied, see each loss' doc
			
 
				-        """
			
 
				-        outputs_without_aux = {k: v for k, v in outputs.items() if k != 'aux_outputs'}
			
 
				-
			
 
				-        # Retrieve the matching between the outputs of the last layer and the targets
			
 
				-        indices = self.matcher(outputs_without_aux, targets)
			
 
				-
			
 
				-        # Compute the average number of target boxes accross all nodes, for normalization purposes
			
 
				-        num_boxes = sum(len(t["labels"]) for t in targets)
			
 
				-        num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
			
 
				-        if is_dist_avail_and_initialized():
			
 
				-            torch.distributed.all_reduce(num_boxes)
			
 
				-        num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item()
			
 
				-
			
 
				-        # Compute all the requested losses
			
 
				-        losses = {}
			
 
				-        for loss in self.losses:
			
 
				-            losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes))
			
 
				-
			
 
				-        # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
			
 
				-        if 'aux_outputs' in outputs:
			
 
				-            for i, aux_outputs in enumerate(outputs['aux_outputs']):
			
 
				-                indices = self.matcher(aux_outputs, targets)
			
 
				-                for loss in self.losses:
			
 
				-                    kwargs = {}
			
 
				-                    l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_boxes, **kwargs)
			
 
				-                    l_dict = {k + f'_{i}': v for k, v in l_dict.items()}
			
 
				-                    losses.update(l_dict)
			
 
				-
			
 
				-        weight_dict = self.weight_dict
			
 
				-        total_loss = sum(losses[k] * weight_dict[k] for k in losses.keys() if k in weight_dict)
			
 
				-        losses['losses'] = total_loss
			
 
				-
			
 
				-        return losses
			
 
				-
			
 
				-
			
 
				-# build criterion
			
 
				-def build_criterion(cfg, num_classes, aux_loss=False):
			
 
				-    matcher = build_matcher(cfg)
			
 
				-    
			
 
				-    weight_dict = {'loss_cls': cfg['loss_cls_weight'],
			
 
				-                  'loss_bbox': cfg['loss_box_weight'],
			
 
				-                  'loss_giou': cfg['loss_giou_weight']}
			
 
				-
			
 
				-    # TODO this is a hack
			
 
				-    if aux_loss:
			
 
				-        aux_weight_dict = {}
			
 
				-        for i in range(cfg['num_decoder_layers'] - 1):
			
 
				-            aux_weight_dict.update({k + f'_{i}': v for k, v in weight_dict.items()})
			
 
				-        weight_dict.update(aux_weight_dict)
			
 
				-
			
 
				-    losses = ['labels', 'boxes']
			
 
				-    
			
 
				-    criterion = Criterion(
			
 
				-        num_classes=num_classes,
			
 
				-        matcher=matcher,
			
 
				-        weight_dict=weight_dict,
			
 
				-        losses=losses,
			
 
				-        focal_alpha=cfg['focal_alpha'])
			
 
				-
			
 
				-    return criterion
			
 
				-    
			
--- a/models/detectors/rtdetr/matcher.py
+++ b/models/detectors/rtdetr/matcher.py
@@ -1,102 +0,0 @@
 
				-import torch
			
 
				-import torch.nn as nn
			
 
				-from scipy.optimize import linear_sum_assignment
			
 
				-from utils.box_ops import box_cxcywh_to_xyxy, generalized_box_iou
			
 
				-
			
 
				-
			
 
				-class HungarianMatcher(nn.Module):
			
 
				-    """This class computes an assignment between the targets and the predictions of the network
			
 
				-    For efficiency reasons, the targets don't include the no_object. Because of this, in general,
			
 
				-    there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
			
 
				-    while the others are un-matched (and thus treated as non-objects).
			
 
				-    """
			
 
				-
			
 
				-    def __init__(self, cost_class: float = 1, cost_bbox: float = 1, cost_giou: float = 1):
			
 
				-        """Creates the matcher
			
 
				-        Params:
			
 
				-            cost_class: This is the relative weight of the classification error in the matching cost
			
 
				-            cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost
			
 
				-            cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost
			
 
				-        """
			
 
				-        super().__init__()
			
 
				-        self.cost_class = cost_class
			
 
				-        self.cost_bbox = cost_bbox
			
 
				-        self.cost_giou = cost_giou
			
 
				-        assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs cant be 0"
			
 
				-
			
 
				-
			
 
				-    @torch.no_grad()
			
 
				-    def forward(self, outputs, targets):
			
 
				-        """ Performs the matching
			
 
				-        Params:
			
 
				-            outputs: This is a dict that contains at least these entries:
			
 
				-                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
			
 
				-                 "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates
			
 
				-            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
			
 
				-                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
			
 
				-                           objects in the target) containing the class labels
			
 
				-                 "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates
			
 
				-        Returns:
			
 
				-            A list of size batch_size, containing tuples of (index_i, index_j) where:
			
 
				-                - index_i is the indices of the selected predictions (in order)
			
 
				-                - index_j is the indices of the corresponding selected targets (in order)
			
 
				-            For each batch element, it holds:
			
 
				-                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
			
 
				-        """
			
 
				-        bs, num_queries = outputs["pred_logits"].shape[:2]
			
 
				-
			
 
				-        # We flatten to compute the cost matrices in a batch
			
 
				-        # [B * num_queries, C] = [N, C], where N is B * num_queries
			
 
				-        out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid()
			
 
				-        # [B * num_queries, 4] = [N, 4]
			
 
				-        out_bbox = outputs["pred_boxes"].flatten(0, 1)
			
 
				-
			
 
				-        # Also concat the target labels and boxes
			
 
				-        # [M,] where M is number of all targets in this batch
			
 
				-        tgt_ids = torch.cat([v["labels"] for v in targets])
			
 
				-        # [M, 4] where M is number of all targets in this batch
			
 
				-        tgt_bbox = torch.cat([v["boxes"] for v in targets])
			
 
				-
			
 
				-        # Compute the classification cost.
			
 
				-        alpha = 0.25
			
 
				-        gamma = 2.0
			
 
				-        neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log())
			
 
				-        pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
			
 
				-        cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids]
			
 
				-
			
 
				-        # Compute the L1 cost between boxes
			
 
				-        # [N, M]
			
 
				-        cost_bbox = torch.cdist(out_bbox, tgt_bbox.to(out_bbox.device), p=1)
			
 
				-
			
 
				-        # Compute the giou cost betwen boxes
			
 
				-        # [N, M]
			
 
				-        cost_giou = -generalized_box_iou(
			
 
				-            box_cxcywh_to_xyxy(out_bbox),
			
 
				-            box_cxcywh_to_xyxy(tgt_bbox.to(out_bbox.device)))
			
 
				-
			
 
				-        # Final cost matrix: [N, M]
			
 
				-        C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
			
 
				-        # [N, M] -> [B, num_queries, M]
			
 
				-        C = C.view(bs, num_queries, -1).cpu()
			
 
				-
			
 
				-        # The number of boxes in each image
			
 
				-        sizes = [len(v["boxes"]) for v in targets]
			
 
				-        # In the last dimension of C, we divide it into B costs, and each cost is [B, num_querys, M_i]
			
 
				-        # where sum(Mi) = M.
			
 
				-        # i is the batch index and c is cost_i = [B, num_querys, M_i].
			
 
				-        # Therefore c[i] is the cost between the i-th sample and i-th prediction.
			
 
				-        indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
			
 
				-        # As for each (i, j) in indices, i is the prediction indexes and j is the target indexes
			
 
				-        # i contains row indexes of cost matrix: array([row_1, row_2, row_3]) 
			
 
				-        # j contains col indexes of cost matrix: array([col_1, col_2, col_3])
			
 
				-        # len(i) == len(j)
			
 
				-        # len(indices) = batch_size
			
 
				-        return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
			
 
				-
			
 
				-
			
 
				-def build_matcher(cfg):
			
 
				-    return HungarianMatcher(
			
 
				-        cost_class=cfg['set_cost_class'],
			
 
				-        cost_bbox=cfg['set_cost_bbox'],
			
 
				-        cost_giou=cfg['set_cost_giou']
			
 
				-        )
			
--- a/models/detectors/rtdetr/rtdetr.py
+++ b/models/detectors/rtdetr/rtdetr.py
@@ -1,163 +0,0 @@
 
				-import torch
			
 
				-import torch.nn as nn
			
 
				-
			
 
				-from .rtdetr_encoder import build_encoder
			
 
				-from .rtdetr_compressor import build_compressor
			
 
				-from .rtdetr_decoder import build_decoder
			
 
				-from .rtdetr_dethead import build_dethead
			
 
				-
			
 
				-
			
 
				-# Real-time DETR
			
 
				-class RTDETR(nn.Module):
			
 
				-    def __init__(self, 
			
 
				-                 cfg,
			
 
				-                 device, 
			
 
				-                 num_classes = 20, 
			
 
				-                 trainable = False, 
			
 
				-                 aux_loss = False,
			
 
				-                 with_box_refine = False,
			
 
				-                 deploy = False):
			
 
				-        super(RTDETR, self).__init__()
			
 
				-        # --------- Basic Parameters ----------
			
 
				-        self.cfg = cfg
			
 
				-        self.device = device
			
 
				-        self.num_classes = num_classes
			
 
				-        self.trainable = trainable
			
 
				-        self.max_stride = max(cfg['stride'])
			
 
				-        self.d_model = round(cfg['d_model'] * self.cfg['width'])
			
 
				-        self.aux_loss = aux_loss
			
 
				-        self.with_box_refine = with_box_refine
			
 
				-        self.deploy = deploy
			
 
				-        
			
 
				-        # --------- Network Parameters ----------
			
 
				-        ## Encoder
			
 
				-        self.encoder = build_encoder(cfg, trainable, 'img_encoder')
			
 
				-
			
 
				-        ## Compressor
			
 
				-        self.compressor = build_compressor(cfg, self.d_model)
			
 
				-
			
 
				-        ## Decoder
			
 
				-        self.decoder = build_decoder(cfg, self.d_model, return_intermediate=aux_loss)
			
 
				-
			
 
				-        ## DetHead
			
 
				-        self.dethead = build_dethead(cfg, self.d_model, num_classes, with_box_refine)
			
 
				-            
			
 
				-        # set for TR-Decoder
			
 
				-        self.decoder.class_embed = self.dethead.class_embed
			
 
				-        self.decoder.bbox_embed = self.dethead.bbox_embed
			
 
				-
			
 
				-
			
 
				-    # ---------------------- Basic Functions ----------------------
			
 
				-    def position_embedding(self, x, temperature=10000):
			
 
				-        hs, ws = x.shape[-2:]
			
 
				-        device = x.device
			
 
				-        num_pos_feats = x.shape[1] // 2       
			
 
				-        scale = 2 * 3.141592653589793
			
 
				-
			
 
				-        # generate xy coord mat
			
 
				-        y_embed, x_embed = torch.meshgrid(
			
 
				-            [torch.arange(1, hs+1, dtype=torch.float32),
			
 
				-             torch.arange(1, ws+1, dtype=torch.float32)])
			
 
				-        y_embed = y_embed / (hs + 1e-6) * scale
			
 
				-        x_embed = x_embed / (ws + 1e-6) * scale
			
 
				-    
			
 
				-        # [H, W] -> [1, H, W]
			
 
				-        y_embed = y_embed[None, :, :].to(device)
			
 
				-        x_embed = x_embed[None, :, :].to(device)
			
 
				-
			
 
				-        dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=device)
			
 
				-        dim_t_ = torch.div(dim_t, 2, rounding_mode='floor') / num_pos_feats
			
 
				-        dim_t = temperature ** (2 * dim_t_)
			
 
				-
			
 
				-        pos_x = torch.div(x_embed[:, :, :, None], dim_t)
			
 
				-        pos_y = torch.div(y_embed[:, :, :, None], dim_t)
			
 
				-        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
			
 
				-        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
			
 
				-
			
 
				-        # [B, C, H, W]
			
 
				-        pos_embed = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
			
 
				-        
			
 
				-        return pos_embed
			
 
				-        
			
 
				-
			
 
				-    @torch.jit.unused
			
 
				-    def set_aux_loss(self, outputs_class, outputs_coord):
			
 
				-        # this is a workaround to make torchscript happy, as torchscript
			
 
				-        # doesn't support dictionary with non-homogeneous values, such
			
 
				-        # as a dict having both a Tensor and a list.
			
 
				-        return [{'pred_logits': a, 'pred_boxes': b}
			
 
				-                for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]
			
 
				-
			
 
				-
			
 
				-    # ---------------------- Main Process for Inference ----------------------
			
 
				-    @torch.no_grad()
			
 
				-    def inference_single_image(self, x):
			
 
				-        # -------------------- Encoder --------------------
			
 
				-        pyramid_feats = self.encoder(x)
			
 
				-
			
 
				-        # -------------------- Pos Embed --------------------
			
 
				-        memory = torch.cat([feat.flatten(2) for feat in pyramid_feats], dim=-1)
			
 
				-        memory_pos = torch.cat([self.position_embedding(feat).flatten(2) for feat in pyramid_feats], dim=-1)
			
 
				-        memory = memory.permute(0, 2, 1).contiguous()
			
 
				-        memory_pos = memory_pos.permute(0, 2, 1).contiguous()
			
 
				-
			
 
				-        # -------------------- Compressor --------------------
			
 
				-        compressed_memory = self.compressor(memory, memory_pos)
			
 
				-
			
 
				-        # -------------------- Decoder --------------------
			
 
				-        hs, reference = self.decoder(compressed_memory, None)
			
 
				-
			
 
				-        # -------------------- DetHead --------------------
			
 
				-        out_logits, out_bbox = self.dethead(hs, reference, False)
			
 
				-        cls_pred, box_pred = out_logits[0], out_bbox[0]
			
 
				-
			
 
				-        # -------------------- Top-k --------------------
			
 
				-        cls_pred = cls_pred.flatten().sigmoid_()
			
 
				-        num_topk = 100
			
 
				-        predicted_prob, topk_idxs = cls_pred.sort(descending=True)
			
 
				-        topk_idxs = topk_idxs[:num_topk]
			
 
				-        topk_box_idxs = torch.div(topk_idxs, self.num_classes, rounding_mode='floor')
			
 
				-        topk_scores = predicted_prob[:num_topk]
			
 
				-        topk_labels = topk_idxs % self.num_classes
			
 
				-        topk_bboxes = box_pred[topk_box_idxs]
			
 
				-
			
 
				-        # denormalize bbox
			
 
				-        img_h, img_w = x.shape[-2:]
			
 
				-        topk_bboxes[..., 0::2] *= img_w
			
 
				-        topk_bboxes[..., 1::2] *= img_h
			
 
				-
			
 
				-        if self.deploy:
			
 
				-            return topk_bboxes, topk_scores, topk_labels
			
 
				-        else:
			
 
				-            return topk_bboxes.cpu().numpy(), topk_scores.cpu().numpy(), topk_labels.cpu().numpy()
			
 
				-        
			
 
				-
			
 
				-    # ---------------------- Main Process for Training ----------------------
			
 
				-    def forward(self, x):
			
 
				-        if not self.trainable:
			
 
				-            return self.inference_single_image(x)
			
 
				-        else:
			
 
				-            # -------------------- Encoder --------------------
			
 
				-            pyramid_feats = self.encoder(x)
			
 
				-
			
 
				-            # -------------------- Pos Embed --------------------
			
 
				-            memory = torch.cat([feat.flatten(2) for feat in pyramid_feats], dim=-1)
			
 
				-            memory_pos = torch.cat([self.position_embedding(feat).flatten(2) for feat in pyramid_feats], dim=-1)
			
 
				-            memory = memory.permute(0, 2, 1).contiguous()
			
 
				-            memory_pos = memory_pos.permute(0, 2, 1).contiguous()
			
 
				-            
			
 
				-            # -------------------- Compressor --------------------
			
 
				-            compressed_memory = self.compressor(memory, memory_pos)
			
 
				-
			
 
				-            # -------------------- Decoder --------------------
			
 
				-            hs, reference = self.decoder(compressed_memory, None)
			
 
				-
			
 
				-            # -------------------- DetHead --------------------
			
 
				-            outputs_class, outputs_coords = self.dethead(hs, reference, True)
			
 
				-
			
 
				-            outputs = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coords[-1]}
			
 
				-            if self.aux_loss:
			
 
				-                outputs['aux_outputs'] = self.set_aux_loss(outputs_class, outputs_coords)
			
 
				-            
			
 
				-            return outputs
			
 
				-    
			
--- a/models/detectors/rtdetr/rtdetr_basic.py
+++ b/models/detectors/rtdetr/rtdetr_basic.py
@@ -1,221 +0,0 @@
 
				-import copy
			
 
				-from typing import Optional
			
 
				-
			
 
				-import torch
			
 
				-import torch.nn as nn
			
 
				-import torch.nn.functional as F
			
 
				-from torch import nn, Tensor
			
 
				-
			
 
				-
			
 
				-# ------------------------------- Basic Modules -------------------------------
			
 
				-def get_activation(act_type=None):
			
 
				-    if act_type == 'relu':
			
 
				-        return nn.ReLU(inplace=True)
			
 
				-    elif act_type == 'gelu':
			
 
				-        return nn.GELU()
			
 
				-    elif act_type == 'lrelu':
			
 
				-        return nn.LeakyReLU(0.1, inplace=True)
			
 
				-    elif act_type == 'mish':
			
 
				-        return nn.Mish(inplace=True)
			
 
				-    elif act_type == 'silu':
			
 
				-        return nn.SiLU(inplace=True)
			
 
				-
			
 
				-
			
 
				-def get_norm(norm_type, dim):
			
 
				-    if norm_type == 'BN':
			
 
				-        return nn.BatchNorm2d(dim)
			
 
				-    elif norm_type == 'GN':
			
 
				-        return nn.GroupNorm(num_groups=32, num_channels=dim)
			
 
				-    elif norm_type == 'LN':
			
 
				-        return nn.LayerNorm(dim)
			
 
				-
			
 
				-
			
 
				-def get_clones(module, N):
			
 
				-    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
			
 
				-    
			
 
				-
			
 
				-def build_multi_head_attention(d_model, num_heads, dropout, attn_type='mhsa'):
			
 
				-    if attn_type == 'mhsa':
			
 
				-        attn_layer = MultiHeadAttention(d_model, num_heads, dropout)
			
 
				-    elif attn_type == 's_mhsa':
			
 
				-        attn_layer = None
			
 
				-
			
 
				-    return attn_layer
			
 
				-
			
 
				-
			
 
				-# ------------------------------- MLP -------------------------------
			
 
				-class MLP(nn.Module):
			
 
				-    """ Very simple multi-layer perceptron (also called FFN)"""
			
 
				-
			
 
				-    def __init__(self, in_dim, hidden_dim, out_dim, num_layers):
			
 
				-        super().__init__()
			
 
				-        self.num_layers = num_layers
			
 
				-        h = [hidden_dim] * (num_layers - 1)
			
 
				-        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([in_dim] + h, h + [out_dim]))
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        for i, layer in enumerate(self.layers):
			
 
				-            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
			
 
				-        return x
			
 
				-
			
 
				-
			
 
				-# ------------------------------- Transformer Modules -------------------------------
			
 
				-## Vanilla Multi-Head Attention
			
 
				-class MultiHeadAttention(nn.Module):
			
 
				-    def __init__(self, d_model, num_heads, dropout=0.) -> None:
			
 
				-        super().__init__()
			
 
				-        # --------------- Basic parameters ---------------
			
 
				-        self.d_model = d_model
			
 
				-        self.num_heads = num_heads
			
 
				-        self.dropout = dropout
			
 
				-        self.scale = (d_model // num_heads) ** -0.5
			
 
				-
			
 
				-        # --------------- Network parameters ---------------
			
 
				-        self.q_proj = nn.Linear(d_model, d_model, bias = False) # W_q, W_k, W_v
			
 
				-        self.k_proj = nn.Linear(d_model, d_model, bias = False) # W_q, W_k, W_v
			
 
				-        self.v_proj = nn.Linear(d_model, d_model, bias = False) # W_q, W_k, W_v
			
 
				-
			
 
				-        self.out_proj = nn.Linear(d_model, d_model)
			
 
				-        self.dropout = nn.Dropout(dropout)
			
 
				-
			
 
				-
			
 
				-    def forward(self, query, key, value):
			
 
				-        """
			
 
				-        Inputs:
			
 
				-            query : (Tensor) -> [B, Nq, C]
			
 
				-            key   : (Tensor) -> [B, Nk, C]
			
 
				-            value : (Tensor) -> [B, Nk, C]
			
 
				-        """
			
 
				-        bs = query.shape[0]
			
 
				-        Nq = query.shape[1]
			
 
				-        Nk = key.shape[1]
			
 
				-
			
 
				-        # ----------------- Input proj -----------------
			
 
				-        query = self.q_proj(query)
			
 
				-        key   = self.k_proj(key)
			
 
				-        value = self.v_proj(value)
			
 
				-
			
 
				-        # ----------------- Multi-head Attn -----------------
			
 
				-        ## [B, N, C] -> [B, N, H, C_h] -> [B, H, N, C_h]
			
 
				-        query = query.view(bs, Nq, self.num_heads, self.d_model // self.num_heads)
			
 
				-        query = query.permute(0, 2, 1, 3).contiguous()
			
 
				-        key   = key.view(bs, Nk, self.num_heads, self.d_model // self.num_heads)
			
 
				-        key   = key.permute(0, 2, 1, 3).contiguous()
			
 
				-        value = value.view(bs, Nk, self.num_heads, self.d_model // self.num_heads)
			
 
				-        value = value.permute(0, 2, 1, 3).contiguous()
			
 
				-        # Attention
			
 
				-        ## [B, H, Nq, C_h] X [B, H, C_h, Nk] = [B, H, Nq, Nk]
			
 
				-        sim_matrix = torch.matmul(query, key.transpose(-1, -2)) * self.scale
			
 
				-        sim_matrix = torch.softmax(sim_matrix, dim=-1)
			
 
				-
			
 
				-        # ----------------- Output -----------------
			
 
				-        out = torch.matmul(sim_matrix, value)  # [B, H, Nq, C_h]
			
 
				-        out = out.permute(0, 2, 1, 3).contiguous().view(bs, Nq, -1)
			
 
				-        out = self.out_proj(out)
			
 
				-
			
 
				-        return out
			
 
				-        
			
 
				-## Transformer Encoder layer
			
 
				-class TREncoderLayer(nn.Module):
			
 
				-    def __init__(self,
			
 
				-                 d_model,
			
 
				-                 num_heads,
			
 
				-                 dim_feedforward=2048,
			
 
				-                 dropout=0.1,
			
 
				-                 act_type="relu",
			
 
				-                 attn_type='mhsa'
			
 
				-                 ):
			
 
				-        super().__init__()
			
 
				-        # Multi-head Self-Attn
			
 
				-        self.self_attn = build_multi_head_attention(d_model, num_heads, dropout, attn_type)
			
 
				-
			
 
				-        # Feedforwaed Network
			
 
				-        self.linear1 = nn.Linear(d_model, dim_feedforward)
			
 
				-        self.dropout = nn.Dropout(dropout)
			
 
				-        self.linear2 = nn.Linear(dim_feedforward, d_model)
			
 
				-
			
 
				-        self.norm1 = nn.LayerNorm(d_model)
			
 
				-        self.norm2 = nn.LayerNorm(d_model)
			
 
				-        self.dropout1 = nn.Dropout(dropout)
			
 
				-        self.dropout2 = nn.Dropout(dropout)
			
 
				-
			
 
				-        self.activation = get_activation(act_type)
			
 
				-
			
 
				-
			
 
				-    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
			
 
				-        return tensor if pos is None else tensor + pos
			
 
				-
			
 
				-
			
 
				-    def forward(self, src, pos):
			
 
				-        """
			
 
				-        Input:
			
 
				-            src: [torch.Tensor] -> [B, N, C]
			
 
				-            pos: [torch.Tensor] -> [B, N, C]
			
 
				-        Output:
			
 
				-            src: [torch.Tensor] -> [B, N, C]
			
 
				-        """
			
 
				-        q = k = self.with_pos_embed(src, pos)
			
 
				-
			
 
				-        # self-attn
			
 
				-        src2 = self.self_attn(q, k, value=src)
			
 
				-
			
 
				-        # reshape: [B, N, C] -> [B, C, H, W]
			
 
				-        src = src + self.dropout1(src2)
			
 
				-        src = self.norm1(src)
			
 
				-
			
 
				-        # ffpn
			
 
				-        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
			
 
				-        src = src + self.dropout2(src2)
			
 
				-        src = self.norm2(src)
			
 
				-        
			
 
				-        return src
			
 
				-
			
 
				-## Transformer Decoder layer
			
 
				-class TRDecoderLayer(nn.Module):
			
 
				-    def __init__(self, d_model, num_heads, dim_feedforward=2048, dropout=0.1, act_type="relu", attn_type='mhsa'):
			
 
				-        super().__init__()
			
 
				-        # Multi-head Self-Attn
			
 
				-        self.self_attn = build_multi_head_attention(d_model, num_heads, dropout, attn_type)
			
 
				-        self.cross_attn = build_multi_head_attention(d_model, num_heads, dropout)
			
 
				-        # Feedforward Network
			
 
				-        self.linear1 = nn.Linear(d_model, dim_feedforward)
			
 
				-        self.dropout = nn.Dropout(dropout)
			
 
				-        self.linear2 = nn.Linear(dim_feedforward, d_model)
			
 
				-
			
 
				-        self.norm1 = nn.LayerNorm(d_model)
			
 
				-        self.norm2 = nn.LayerNorm(d_model)
			
 
				-        self.norm3 = nn.LayerNorm(d_model)
			
 
				-        self.dropout1 = nn.Dropout(dropout)
			
 
				-        self.dropout2 = nn.Dropout(dropout)
			
 
				-        self.dropout3 = nn.Dropout(dropout)
			
 
				-
			
 
				-        self.activation = get_activation(act_type)
			
 
				-
			
 
				-
			
 
				-    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
			
 
				-        return tensor if pos is None else tensor + pos
			
 
				-
			
 
				-
			
 
				-    def forward(self, tgt, tgt_query_pos, memory, memory_pos):
			
 
				-        # self attention
			
 
				-        tgt2 = self.self_attn(
			
 
				-            query=self.with_pos_embed(tgt, tgt_query_pos),
			
 
				-            key=self.with_pos_embed(tgt, tgt_query_pos),
			
 
				-            value=tgt)[0]
			
 
				-        tgt = tgt + self.dropout1(tgt2)
			
 
				-        tgt = self.norm1(tgt)
			
 
				-
			
 
				-        # cross attention
			
 
				-        tgt2 = self.cross_attn(
			
 
				-            query=self.with_pos_embed(tgt, tgt_query_pos),
			
 
				-            key=self.with_pos_embed(memory, memory_pos),
			
 
				-            value=memory)
			
 
				-        tgt = tgt + self.dropout2(tgt2)
			
 
				-        tgt = self.norm2(tgt)
			
 
				-
			
 
				-        # ffn
			
 
				-        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
			
 
				-        tgt = tgt + self.dropout3(tgt2)
			
 
				-        tgt = self.norm3(tgt)
			
 
				-        
			
 
				-        return tgt
			
--- a/models/detectors/rtdetr/rtdetr_compressor.py
+++ b/models/detectors/rtdetr/rtdetr_compressor.py
@@ -1,34 +0,0 @@
 
				-import torch
			
 
				-import torch.nn as nn
			
 
				-
			
 
				-from .rtdetr_basic import TRDecoderLayer
			
 
				-
			
 
				-
			
 
				-# Memory Compressor Module
			
 
				-class MemoryCompressor(nn.Module):
			
 
				-    def __init__(self, cfg, in_dim):
			
 
				-        super().__init__()
			
 
				-        # -------------------- Basic Parameters ---------------------
			
 
				-        self.d_model = in_dim
			
 
				-        self.ffn_dim = round(cfg['com_dim_feedforward']*cfg['width'])
			
 
				-        self.compressed_vector = nn.Embedding(cfg['num_compressed'], in_dim)
			
 
				-        # -------------------- Network Parameters ---------------------
			
 
				-        self.compress_layer = TRDecoderLayer(
			
 
				-            d_model=in_dim,
			
 
				-            dim_feedforward=self.ffn_dim,
			
 
				-            num_heads=cfg['com_num_heads'],
			
 
				-            dropout=cfg['com_dropout'],
			
 
				-            act_type=cfg['com_act']
			
 
				-        )
			
 
				-
			
 
				-
			
 
				-    def forward(self, memory, memory_pos):
			
 
				-        bs = memory.size(0)
			
 
				-        output = self.compressed_vector.weight[None].repeat(bs, 1, 1)
			
 
				-        output = self.compress_layer(output, None, memory, memory_pos)
			
 
				-
			
 
				-        return output
			
 
				-
			
 
				-
			
 
				-def build_compressor(cfg, in_dim):
			
 
				-    return MemoryCompressor(cfg, in_dim)
			
--- a/models/detectors/rtdetr/rtdetr_decoder.py
+++ b/models/detectors/rtdetr/rtdetr_decoder.py
@@ -1,115 +0,0 @@
 
				-import torch
			
 
				-import torch.nn as nn
			
 
				-
			
 
				-from .rtdetr_basic import get_clones, TRDecoderLayer, MLP
			
 
				-
			
 
				-
			
 
				-# Transformer Decoder Module
			
 
				-class TransformerDecoder(nn.Module):
			
 
				-    def __init__(self, cfg, in_dim, return_intermediate=False):
			
 
				-        super().__init__()
			
 
				-        # -------------------- Basic Parameters ---------------------
			
 
				-        self.d_model = in_dim
			
 
				-        self.query_dim = 4  # For RefPoint head
			
 
				-        self.scale = 2 * 3.141592653589793
			
 
				-        self.num_queries = cfg['num_queries']
			
 
				-        self.num_deocder_layers = cfg['num_decoder_layers']
			
 
				-        self.return_intermediate = return_intermediate
			
 
				-        self.ffn_dim = round(cfg['de_dim_feedforward']*cfg['width'])
			
 
				-
			
 
				-        # -------------------- Network Parameters ---------------------
			
 
				-        ## Decoder
			
 
				-        decoder_layer = TRDecoderLayer(
			
 
				-            d_model=in_dim,
			
 
				-            dim_feedforward=self.ffn_dim,
			
 
				-            num_heads=cfg['de_num_heads'],
			
 
				-            dropout=cfg['de_dropout'],
			
 
				-            act_type=cfg['de_act']
			
 
				-        )
			
 
				-        self.decoder_layers = get_clones(decoder_layer, cfg['num_decoder_layers'])
			
 
				-        ## RefPoint Embed
			
 
				-        self.refpoint_embed = nn.Embedding(cfg['num_queries'], 4)
			
 
				-        self.ref_point_head = MLP(self.query_dim // 2 * in_dim, in_dim, in_dim, 2)
			
 
				-        ## Object Query Embed
			
 
				-        self.object_query = nn.Embedding(cfg['num_queries'], in_dim)
			
 
				-        nn.init.normal_(self.object_query.weight.data)
			
 
				-        ## TODO: Group queries
			
 
				-
			
 
				-        self.bbox_embed = None
			
 
				-        self.class_embed = None
			
 
				-
			
 
				-
			
 
				-    def inverse_sigmoid(self, x):
			
 
				-        x = x.clamp(min=0, max=1)
			
 
				-        return torch.log(x.clamp(min=1e-5)/(1 - x).clamp(min=1e-5))
			
 
				-
			
 
				-
			
 
				-    def query_sine_embed(self, num_feats, reference_points):
			
 
				-        dim_t = torch.arange(num_feats, dtype=torch.float32, device=reference_points.device)
			
 
				-        dim_t_ = torch.div(dim_t, 2, rounding_mode='floor') / num_feats
			
 
				-        dim_t = 10000 ** (2 * dim_t_)
			
 
				-
			
 
				-        x_embed = reference_points[:, :, 0] * self.scale
			
 
				-        y_embed = reference_points[:, :, 1] * self.scale
			
 
				-        pos_x = x_embed[:, :, None] / dim_t
			
 
				-        pos_y = y_embed[:, :, None] / dim_t
			
 
				-        pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2)
			
 
				-        pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2)
			
 
				-        w_embed = reference_points[:, :, 2] * self.scale
			
 
				-        pos_w = w_embed[:, :, None] / dim_t
			
 
				-        pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2)
			
 
				-
			
 
				-        h_embed = reference_points[:, :, 3] * self.scale
			
 
				-        pos_h = h_embed[:, :, None] / dim_t
			
 
				-        pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2)
			
 
				-        query_sine_embed = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2)
			
 
				-
			
 
				-        return query_sine_embed
			
 
				-    
			
 
				-
			
 
				-    def forward(self, memory, memory_pos):
			
 
				-        bs, _, channels = memory.size()
			
 
				-        num_feats = channels // 2
			
 
				-
			
 
				-        # prepare tgt & refpoint
			
 
				-        tgt = self.object_query.weight[None].repeat(bs, 1, 1)
			
 
				-        refpoint_embed = self.refpoint_embed.weight[None].repeat(bs, 1, 1)
			
 
				-
			
 
				-        intermediate = []
			
 
				-        reference_points = refpoint_embed.sigmoid()
			
 
				-        ref_points = [reference_points]
			
 
				-
			
 
				-        # main process
			
 
				-        output = tgt
			
 
				-        for layer_id, layer in enumerate(self.decoder_layers):
			
 
				-            # Conditional query
			
 
				-            query_sine_embed = self.query_sine_embed(num_feats, reference_points)
			
 
				-            query_pos = self.ref_point_head(query_sine_embed) # [B, N, C]
			
 
				-            # Decoder
			
 
				-            output = layer(
			
 
				-                    # input for decoder
			
 
				-                    tgt = output,
			
 
				-                    tgt_query_pos = query_pos,
			
 
				-                    # input from encoder
			
 
				-                    memory = memory,
			
 
				-                    memory_pos = memory_pos,
			
 
				-                )
			
 
				-            # Iter update
			
 
				-            if self.bbox_embed is not None:
			
 
				-                delta_unsig = self.bbox_embed[layer_id](output)
			
 
				-                outputs_unsig = delta_unsig + self.inverse_sigmoid(reference_points)
			
 
				-                new_reference_points = outputs_unsig.sigmoid()
			
 
				-
			
 
				-                reference_points = new_reference_points.detach()
			
 
				-                ref_points.append(new_reference_points)
			
 
				-
			
 
				-            intermediate.append(output)
			
 
				-
			
 
				-        return torch.stack(intermediate), torch.stack(ref_points)
			
 
				-
			
 
				-
			
 
				-# build transformer decoder
			
 
				-def build_decoder(cfg, in_dim, return_intermediate=False):
			
 
				-    decoder = TransformerDecoder(cfg, in_dim, return_intermediate=return_intermediate) 
			
 
				-
			
 
				-    return decoder
			
--- a/models/detectors/rtdetr/rtdetr_dethead.py
+++ b/models/detectors/rtdetr/rtdetr_dethead.py
@@ -1,83 +0,0 @@
 
				-import torch
			
 
				-import torch.nn as nn
			
 
				-
			
 
				-from .rtdetr_basic import MLP
			
 
				-
			
 
				-
			
 
				-class DetectHead(nn.Module):
			
 
				-    def __init__(self, cfg, d_model, num_classes, with_box_refine=False):
			
 
				-        super().__init__()
			
 
				-        # --------- Basic Parameters ----------
			
 
				-        self.cfg = cfg
			
 
				-        self.num_classes = num_classes
			
 
				-
			
 
				-        # --------- Network Parameters ----------
			
 
				-        self.class_embed = nn.ModuleList([nn.Linear(d_model, self.num_classes)])
			
 
				-        self.bbox_embed = nn.ModuleList([MLP(d_model, d_model, 4, 3)])
			
 
				-        if with_box_refine:
			
 
				-            self.class_embed = nn.ModuleList([
			
 
				-                self.class_embed[0] for _ in range(cfg['num_decoder_layers'])])
			
 
				-            self.bbox_embed = nn.ModuleList([
			
 
				-                self.bbox_embed[0] for _ in range(cfg['num_decoder_layers'])])
			
 
				-
			
 
				-        self.init_weight()
			
 
				-
			
 
				-
			
 
				-    def init_weight(self):
			
 
				-        init_prob = 0.01
			
 
				-        bias_value = -torch.log(torch.tensor((1. - init_prob) / init_prob))
			
 
				-
			
 
				-        # cls pred
			
 
				-        for class_embed in self.class_embed:
			
 
				-            class_embed.bias.data = torch.ones(self.num_classes) * bias_value
			
 
				-
			
 
				-        # box pred
			
 
				-        for bbox_embed in self.bbox_embed:
			
 
				-            nn.init.constant_(bbox_embed.layers[-1].weight.data, 0)
			
 
				-            nn.init.constant_(bbox_embed.layers[-1].bias.data, 0)
			
 
				-        
			
 
				-
			
 
				-    def inverse_sigmoid(self, x):
			
 
				-        x = x.clamp(min=0, max=1)
			
 
				-        return torch.log(x.clamp(min=1e-5)/(1 - x).clamp(min=1e-5))
			
 
				-
			
 
				-
			
 
				-    def decode_bbox(self, outputs_coords):
			
 
				-        ## cxcywh -> xyxy
			
 
				-        x1y1_pred = outputs_coords[..., :2] - outputs_coords[..., 2:] * 0.5
			
 
				-        x2y2_pred = outputs_coords[..., :2] + outputs_coords[..., 2:] * 0.5
			
 
				-        box_pred = torch.cat([x1y1_pred, x2y2_pred], dim=-1)
			
 
				-        
			
 
				-        return box_pred
			
 
				-
			
 
				-
			
 
				-    def forward(self, hs, reference, multi_layer=False):
			
 
				-        if multi_layer:
			
 
				-            # class embed
			
 
				-            outputs_class = torch.stack([
			
 
				-                layer_cls_embed(layer_hs) for layer_cls_embed, layer_hs in zip(self.class_embed, hs)])
			
 
				-            # bbox embed
			
 
				-            outputs_coords = []
			
 
				-            for dec_lid, (layer_ref_sig, layer_bbox_embed, layer_hs) in enumerate(zip(reference[:-1], self.bbox_embed, hs)):
			
 
				-                layer_delta_unsig = layer_bbox_embed(layer_hs)
			
 
				-                layer_outputs_unsig = layer_delta_unsig + self.inverse_sigmoid(layer_ref_sig)
			
 
				-                layer_outputs_unsig = layer_outputs_unsig.sigmoid()
			
 
				-                outputs_coords.append(layer_outputs_unsig)
			
 
				-        else:
			
 
				-            # class embed
			
 
				-            outputs_class = self.class_embed[-1](hs[-1]) 
			
 
				-            # bbox embed
			
 
				-            delta_unsig = self.bbox_embed[-1](hs[-1])
			
 
				-            ref_sig = reference[-2]
			
 
				-            ref_sig = self.inverse_sigmoid(ref_sig)
			
 
				-            outputs_unsig = delta_unsig + ref_sig
			
 
				-            outputs_coords = outputs_unsig.sigmoid()
			
 
				-            # decode bbox
			
 
				-            outputs_coords = self.decode_bbox(outputs_coords)
			
 
				-
			
 
				-
			
 
				-        return outputs_class, outputs_coords
			
 
				-
			
 
				-
			
 
				-def build_dethead(cfg, d_model, num_classes, with_box_refine):
			
 
				-    return DetectHead(cfg, d_model, num_classes, with_box_refine)
			
--- a/models/detectors/rtdetr/rtdetr_encoder.py
+++ b/models/detectors/rtdetr/rtdetr_encoder.py
@@ -1,10 +0,0 @@
 
				-from .image_encoder.img_encoder import build_img_encoder
			
 
				-
			
 
				-
			
 
				-# build encoder
			
 
				-def build_encoder(cfg, trainable=False, en_type='img_encoder'):
			
 
				-    if en_type == 'img_encoder':
			
 
				-        return build_img_encoder(cfg, trainable)
			
 
				-    elif en_type == 'text_encoder':
			
 
				-        ## TODO: design text encoder
			
 
				-        return None
			
--- a/train.sh
+++ b/train.sh
@@ -16,25 +16,3 @@ python train.py \
 
				         # --resume weights/coco/yolox_m/yolox_m_best.pth \
			
 
				         # --pretrained weights/coco/yolo_free_medium/yolo_free_medium_39.46.pth \
			
 
				         # --eval_first
			
 
				-
			
 
				-
			
 
				-# # Train RT-DETR
			
 
				-# python train.py \
			
 
				-#         --cuda \
			
 
				-#         -d voc \
			
 
				-#         --root /mnt/share/ssd2/dataset/ \
			
 
				-#         -m rtdetr_n \
			
 
				-#         -bs 16 \
			
 
				-#         -size 640 \
			
 
				-#         --wp_epoch 1 \
			
 
				-#         --max_epoch 150 \
			
 
				-#         --eval_epoch 10 \
			
 
				-#         --ema \
			
 
				-#         --fp16 \
			
 
				-#         --multi_scale \
			
 
				-#         --mosaic 0 \
			
 
				-#         --mixup 0
			
 
				-#         # --resume weights/coco/yolox_s/yolox_s_best.pth \
			
 
				-#         # --pretrained weights/coco/yolo_free_medium/yolo_free_medium_39.46.pth \
			
 
				-#         # --eval_first
			
 
				-
			
--- a/train_ddp.sh
+++ b/train_ddp.sh
@@ -5,7 +5,7 @@ python -m torch.distributed.run --nproc_per_node=8 train.py \
 
				                                                     -dist \
			
 
				                                                     -d coco \
			
 
				                                                     --root /data/datasets/ \
			
 
				-                                                    -m yolovx_t \
			
 
				+                                                    -m yolovx_n\
			
 
				                                                     -bs 128 \
			
 
				                                                     -size 640 \
			
 
				                                                     --wp_epoch 3 \