@@ -1,1073 +0,0 @@
-/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/extension.h"
-
-#define CUDA_KERNEL_LOOP(i, n)                                                \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);                \
-       i += blockDim.x * gridDim.x)
-
-const int CUDA_NUM_THREADS = 1024;
-inline int GET_BLOCKS(const int N, const int num_threads) {
-  return (N + num_threads - 1) / num_threads;
-}
-
-// forward bilinear
-template <typename data_t>
-__device__ data_t deformable_attn_bilinear_forward(
-    const data_t *&bottom_data, const int &height, const int &width,
-    const int &nheads, const int &channels, const data_t &h, const data_t &w,
-    const int &m, const int &c) {
-  const int h_low = floor(h);
-  const int w_low = floor(w);
-  const int h_high = h_low + 1;
-  const int w_high = w_low + 1;
-
-  const data_t lh = h - h_low;
-  const data_t lw = w - w_low;
-  const data_t hh = 1 - lh, hw = 1 - lw;
-
-  const int w_stride = nheads * channels;
-  const int h_stride = width * w_stride;
-  const int h_low_ptr_offset = h_low * h_stride;
-  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
-  const int w_low_ptr_offset = w_low * w_stride;
-  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
-  const int base_ptr = m * channels + c;
-
-  data_t v1 = 0;
-  if (h_low >= 0 && w_low >= 0) {
-    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
-    v1 = bottom_data[ptr1];
-  }
-  data_t v2 = 0;
-  if (h_low >= 0 && w_high <= width - 1) {
-    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
-    v2 = bottom_data[ptr2];
-  }
-  data_t v3 = 0;
-  if (h_high <= height - 1 && w_low >= 0) {
-    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
-    v3 = bottom_data[ptr3];
-  }
-  data_t v4 = 0;
-  if (h_high <= height - 1 && w_high <= width - 1) {
-    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
-    v4 = bottom_data[ptr4];
-  }
-
-  const data_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
-
-  const data_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
-  return val;
-}
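For reading convenience: the device helper above is plain bilinear interpolation over the four integer neighbours of (h, w), with out-of-range corners contributing zero. Restated in math (this only paraphrases the code above), with l_h = h - floor(h) and l_w = w - floor(w):

\[ w_1 = (1-l_h)(1-l_w), \quad w_2 = (1-l_h)\,l_w, \quad w_3 = l_h(1-l_w), \quad w_4 = l_h\,l_w \]
\[ \mathrm{val} = w_1 v_1 + w_2 v_2 + w_3 v_3 + w_4 v_4 \]

where v_1..v_4 are the values read at (h_low, w_low), (h_low, w_high), (h_high, w_low) and (h_high, w_high) for head m and channel c.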
-
-// forward kernel
-template <typename data_t>
-__global__ void deformable_attn_cuda_kernel_forward(
-    const int n, const data_t *data_value, const int64_t *data_spatial_shapes,
-    const int64_t *data_level_start_index, const data_t *data_sampling_loc,
-    const data_t *data_attn_weight, const int batch_size,
-    const int value_length, const int num_heads, const int channels,
-    const int num_levels, const int query_length, const int num_points,
-    data_t *output_data_ptr) {
-  CUDA_KERNEL_LOOP(index, n) {
-    int _temp = index;
-    const int c_col = _temp % channels;
-    _temp /= channels;
-    const int sampling_index = _temp;
-    const int m_col = _temp % num_heads;
-    _temp /= num_heads;
-    const int q_col = _temp % query_length;
-    _temp /= query_length;
-    const int b_col = _temp;
-
-    data_t *data_ptr = output_data_ptr + index;
-    int data_weight_ptr = sampling_index * num_levels * num_points;
-    int data_loc_w_ptr = data_weight_ptr << 1;
-    const int qid_stride = num_heads * channels;
-    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;
-    data_t col = 0;
-
-    for (int l_col = 0; l_col < num_levels; ++l_col) {
-      const int level_start_id = data_level_start_index[l_col];
-      const int spatial_h_ptr = l_col << 1;
-      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
-      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
-      const data_t *data_value_ptr = data_value + (data_value_ptr_init_offset +
-                                                   level_start_id * qid_stride);
-      for (int p_col = 0; p_col < num_points; ++p_col) {
-        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];
-        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
-        const data_t weight = data_attn_weight[data_weight_ptr];
-
-        const data_t h_im = loc_h * spatial_h - 0.5;
-        const data_t w_im = loc_w * spatial_w - 0.5;
-
-        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
-          col += deformable_attn_bilinear_forward(
-                     data_value_ptr, spatial_h, spatial_w, num_heads, channels,
-                     h_im, w_im, m_col, c_col) *
-                 weight;
-        }
-
-        data_weight_ptr += 1;
-        data_loc_w_ptr += 2;
-      }
-    }
-    *data_ptr = col;
-  }
-}
-
-#define CHECK_INPUT_GPU(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.")
-// forward
-std::vector<paddle::Tensor>
-MSDeformableAttnCUDAForward(const paddle::Tensor &value,
-                            const paddle::Tensor &value_spatial_shapes,
-                            const paddle::Tensor &value_level_start_index,
-                            const paddle::Tensor &sampling_locations,
-                            const paddle::Tensor &attention_weights) {
-
-  CHECK_INPUT_GPU(value);
-  CHECK_INPUT_GPU(value_spatial_shapes);
-  CHECK_INPUT_GPU(value_level_start_index);
-  CHECK_INPUT_GPU(sampling_locations);
-  CHECK_INPUT_GPU(attention_weights);
-
-  const int batch_size = value.shape()[0];
-  const int value_length = value.shape()[1];
-  const int num_heads = value.shape()[2];
-  const int channels = value.shape()[3];
-
-  const int num_levels = value_spatial_shapes.shape()[0];
-  const int query_length = sampling_locations.shape()[1];
-  const int num_points = sampling_locations.shape()[4];
-
-  auto output = paddle::full({batch_size, query_length, num_heads * channels},
-                             0, value.dtype(), paddle::GPUPlace());
-
-  const int num_kernels = batch_size * query_length * num_heads * channels;
-  deformable_attn_cuda_kernel_forward<float>
-      <<<GET_BLOCKS(num_kernels, CUDA_NUM_THREADS), CUDA_NUM_THREADS, 0,
-         value.stream()>>>(num_kernels, value.data<float>(),
-                           value_spatial_shapes.data<int64_t>(),
-                           value_level_start_index.data<int64_t>(),
-                           sampling_locations.data<float>(),
-                           attention_weights.data<float>(), batch_size,
-                           value_length, num_heads, channels, num_levels,
-                           query_length, num_points, output.data<float>());
-  return {output};
-}
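The host wrapper above launches one thread per output element (batch_size * query_length * num_heads * channels), and the kernel recovers the (batch, query, head, channel) coordinates from the flat index with the div/mod chain at the top of CUDA_KERNEL_LOOP. A host-side sketch of that same decomposition, useful when sanity-checking launch sizes (decompose_index is illustrative, not part of this file):

    // index = ((b * query_length + q) * num_heads + m) * channels + c
    inline void decompose_index(int index, int channels, int num_heads,
                                int query_length, int *b, int *q, int *m,
                                int *c) {
      *c = index % channels;
      index /= channels;
      *m = index % num_heads;
      index /= num_heads;
      *q = index % query_length;
      index /= query_length;
      *b = index;
    }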
-
-// backward bilinear
-template <typename data_t>
-__device__ void deformable_attn_bilinear_backward(
-    const data_t *&bottom_data, const int &height, const int &width,
-    const int &nheads, const int &channels, const data_t &h, const data_t &w,
-    const int &m, const int &c, const data_t &top_grad,
-    const data_t &attn_weight, data_t *&grad_value, data_t *grad_sampling_loc,
-    data_t *grad_attn_weight) {
-  const int h_low = floor(h);
-  const int w_low = floor(w);
-  const int h_high = h_low + 1;
-  const int w_high = w_low + 1;
-
-  const data_t lh = h - h_low;
-  const data_t lw = w - w_low;
-  const data_t hh = 1 - lh, hw = 1 - lw;
-
-  const int w_stride = nheads * channels;
-  const int h_stride = width * w_stride;
-  const int h_low_ptr_offset = h_low * h_stride;
-  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
-  const int w_low_ptr_offset = w_low * w_stride;
-  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
-  const int base_ptr = m * channels + c;
-
-  const data_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
-  const data_t top_grad_value = top_grad * attn_weight;
-  data_t grad_h_weight = 0, grad_w_weight = 0;
-
-  data_t v1 = 0;
-  if (h_low >= 0 && w_low >= 0) {
-    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
-    v1 = bottom_data[ptr1];
-    grad_h_weight -= hw * v1;
-    grad_w_weight -= hh * v1;
-    atomicAdd(grad_value + ptr1, w1 * top_grad_value);
-  }
-  data_t v2 = 0;
-  if (h_low >= 0 && w_high <= width - 1) {
-    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
-    v2 = bottom_data[ptr2];
-    grad_h_weight -= lw * v2;
-    grad_w_weight += hh * v2;
-    atomicAdd(grad_value + ptr2, w2 * top_grad_value);
-  }
-  data_t v3 = 0;
-  if (h_high <= height - 1 && w_low >= 0) {
-    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
-    v3 = bottom_data[ptr3];
-    grad_h_weight += hw * v3;
-    grad_w_weight -= lh * v3;
-    atomicAdd(grad_value + ptr3, w3 * top_grad_value);
-  }
-  data_t v4 = 0;
-  if (h_high <= height - 1 && w_high <= width - 1) {
-    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
-    v4 = bottom_data[ptr4];
-    grad_h_weight += lw * v4;
-    grad_w_weight += lh * v4;
-    atomicAdd(grad_value + ptr4, w4 * top_grad_value);
-  }
-
-  const data_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
-  *grad_attn_weight = top_grad * val;
-  *grad_sampling_loc = width * grad_w_weight * top_grad_value;
-  *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value;
-}
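As a reading aid for the backward helper above (this paraphrases the code, it does not change it): writing g for top_grad and a for attn_weight, each visited corner i receives an atomicAdd of w_i * g * a into grad_value, and the remaining outputs are

\[ \frac{\partial L}{\partial a} = g\,\mathrm{val}, \qquad \frac{\partial L}{\partial \mathrm{loc}_w} = W\,g\,a\,\big[(1-l_h)(v_2-v_1) + l_h(v_4-v_3)\big], \qquad \frac{\partial L}{\partial \mathrm{loc}_h} = H\,g\,a\,\big[(1-l_w)(v_3-v_1) + l_w(v_4-v_2)\big], \]

where the W and H factors undo the normalisation applied when the sampling location was scaled to the feature map (loc * spatial - 0.5). The _gm variant that follows is identical except that it commits all three results with atomicAdd instead of plain stores, so several threads may safely target the same gradient slot.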
-
-template <typename data_t>
-__device__ void deformable_attn_bilinear_backward_gm(
-    const data_t *&bottom_data, const int &height, const int &width,
-    const int &nheads, const int &channels, const data_t &h, const data_t &w,
-    const int &m, const int &c, const data_t &top_grad,
-    const data_t &attn_weight, data_t *&grad_value, data_t *grad_sampling_loc,
-    data_t *grad_attn_weight) {
-  const int h_low = floor(h);
-  const int w_low = floor(w);
-  const int h_high = h_low + 1;
-  const int w_high = w_low + 1;
-
-  const data_t lh = h - h_low;
-  const data_t lw = w - w_low;
-  const data_t hh = 1 - lh, hw = 1 - lw;
-
-  const int w_stride = nheads * channels;
-  const int h_stride = width * w_stride;
-  const int h_low_ptr_offset = h_low * h_stride;
-  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
-  const int w_low_ptr_offset = w_low * w_stride;
-  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
-  const int base_ptr = m * channels + c;
-
-  const data_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
-  const data_t top_grad_value = top_grad * attn_weight;
-  data_t grad_h_weight = 0, grad_w_weight = 0;
-
-  data_t v1 = 0;
-  if (h_low >= 0 && w_low >= 0) {
-    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
-    v1 = bottom_data[ptr1];
-    grad_h_weight -= hw * v1;
-    grad_w_weight -= hh * v1;
-    atomicAdd(grad_value + ptr1, w1 * top_grad_value);
-  }
-  data_t v2 = 0;
-  if (h_low >= 0 && w_high <= width - 1) {
-    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
-    v2 = bottom_data[ptr2];
-    grad_h_weight -= lw * v2;
-    grad_w_weight += hh * v2;
-    atomicAdd(grad_value + ptr2, w2 * top_grad_value);
-  }
-  data_t v3 = 0;
-  if (h_high <= height - 1 && w_low >= 0) {
-    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
-    v3 = bottom_data[ptr3];
-    grad_h_weight += hw * v3;
-    grad_w_weight -= lh * v3;
-    atomicAdd(grad_value + ptr3, w3 * top_grad_value);
-  }
-  data_t v4 = 0;
-  if (h_high <= height - 1 && w_high <= width - 1) {
-    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
-    v4 = bottom_data[ptr4];
-    grad_h_weight += lw * v4;
-    grad_w_weight += lh * v4;
-    atomicAdd(grad_value + ptr4, w4 * top_grad_value);
-  }
-
-  const data_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
-  atomicAdd(grad_attn_weight, top_grad * val);
-  atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value);
-  atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value);
-}
-
-// backward kernels
-// channels > 1024
-template <typename data_t>
-__global__ void deformable_attn_cuda_kernel_backward_shm_reduce_v2_multi_blocks(
-    const int n, const data_t *grad_col, const data_t *data_value,
-    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
-    const data_t *data_sampling_loc, const data_t *data_attn_weight,
-    const int batch_size, const int value_length, const int num_heads,
-    const int channels, const int num_levels, const int query_length,
-    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,
-    data_t *grad_attn_weight) {
-  CUDA_KERNEL_LOOP(index, n) {
-    extern __shared__ int _s[];
-    data_t *cache_grad_sampling_loc = (data_t *)_s;
-    data_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
-    unsigned int tid = threadIdx.x;
-    int _temp = index;
-    const int c_col = _temp % channels;
-    _temp /= channels;
-    const int sampling_index = _temp;
-    const int m_col = _temp % num_heads;
-    _temp /= num_heads;
-    const int q_col = _temp % query_length;
-    _temp /= query_length;
-    const int b_col = _temp;
-
-    const data_t top_grad = grad_col[index];
-
-    int data_weight_ptr = sampling_index * num_levels * num_points;
-    int data_loc_w_ptr = data_weight_ptr << 1;
-    const int grad_sampling_ptr = data_weight_ptr;
-    grad_sampling_loc += grad_sampling_ptr << 1;
-    grad_attn_weight += grad_sampling_ptr;
-    const int grad_weight_stride = 1;
-    const int grad_loc_stride = 2;
-    const int qid_stride = num_heads * channels;
-    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;
-
-    for (int l_col = 0; l_col < num_levels; ++l_col) {
-      const int level_start_id = data_level_start_index[l_col];
-      const int spatial_h_ptr = l_col << 1;
-      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
-      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
-      const int value_ptr_offset =
-          data_value_ptr_init_offset + level_start_id * qid_stride;
-      const data_t *data_value_ptr = data_value + value_ptr_offset;
-      data_t *grad_value_ptr = grad_value + value_ptr_offset;
-
-      for (int p_col = 0; p_col < num_points; ++p_col) {
-        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];
-        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
-        const data_t weight = data_attn_weight[data_weight_ptr];
-
-        const data_t h_im = loc_h * spatial_h - 0.5;
-        const data_t w_im = loc_w * spatial_w - 0.5;
-        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
-        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
-        *(cache_grad_attn_weight + threadIdx.x) = 0;
-        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
-          deformable_attn_bilinear_backward(
-              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
-              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
-              cache_grad_sampling_loc + (threadIdx.x << 1),
-              cache_grad_attn_weight + threadIdx.x);
-        }
-
-        __syncthreads();
-
-        for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0;
-             s >>= 1, spre >>= 1) {
-          if (tid < s) {
-            const unsigned int xid1 = tid << 1;
-            const unsigned int xid2 = (tid + s) << 1;
-            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
-            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
-            cache_grad_sampling_loc[xid1 + 1] +=
-                cache_grad_sampling_loc[xid2 + 1];
-            if (tid + (s << 1) < spre) {
-              cache_grad_attn_weight[tid] +=
-                  cache_grad_attn_weight[tid + (s << 1)];
-              cache_grad_sampling_loc[xid1] +=
-                  cache_grad_sampling_loc[xid2 + (s << 1)];
-              cache_grad_sampling_loc[xid1 + 1] +=
-                  cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
-            }
-          }
-          __syncthreads();
-        }
-
-        if (tid == 0) {
-          atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]);
-          atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]);
-          atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]);
-        }
-        __syncthreads();
-
-        data_weight_ptr += 1;
-        data_loc_w_ptr += 2;
-        grad_attn_weight += grad_weight_stride;
-        grad_sampling_loc += grad_loc_stride;
-      }
-    }
-  }
-}
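The kernel above has each thread stage its per-channel contributions in dynamic shared memory, folds the block with a tree reduction, and then has thread 0 commit the three partial sums with atomicAdd. A minimal standalone sketch of that reduction pattern, assuming a power-of-two block size (block_reduce_sum is a made-up helper, not part of this file):

    // Sums one value per thread in shared memory; thread 0 ends up with the total.
    __device__ float block_reduce_sum(float val, float *cache) {
      const unsigned int tid = threadIdx.x;
      cache[tid] = val;
      __syncthreads();
      for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
          cache[tid] += cache[tid + s];
        }
        __syncthreads();
      }
      return cache[0];  // meaningful for tid == 0
    }

The extra spre bookkeeping in the loop above appears to exist so the fold also covers the trailing element when blockDim.x is not a power of two.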
-
-template <typename data_t>
-__global__ void deformable_attn_cuda_kernel_backward_gm(
-    const int n, const data_t *grad_col, const data_t *data_value,
-    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
-    const data_t *data_sampling_loc, const data_t *data_attn_weight,
-    const int batch_size, const int value_length, const int num_heads,
-    const int channels, const int num_levels, const int query_length,
-    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,
-    data_t *grad_attn_weight) {
-  CUDA_KERNEL_LOOP(index, n) {
-    int _temp = index;
-    const int c_col = _temp % channels;
-    _temp /= channels;
-    const int sampling_index = _temp;
-    const int m_col = _temp % num_heads;
-    _temp /= num_heads;
-    const int q_col = _temp % query_length;
-    _temp /= query_length;
-    const int b_col = _temp;
-
-    const data_t top_grad = grad_col[index];
-
-    int data_weight_ptr = sampling_index * num_levels * num_points;
-    int data_loc_w_ptr = data_weight_ptr << 1;
-    const int grad_sampling_ptr = data_weight_ptr;
-    grad_sampling_loc += grad_sampling_ptr << 1;
-    grad_attn_weight += grad_sampling_ptr;
-    const int grad_weight_stride = 1;
-    const int grad_loc_stride = 2;
-    const int qid_stride = num_heads * channels;
-    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;
-
-    for (int l_col = 0; l_col < num_levels; ++l_col) {
-      const int level_start_id = data_level_start_index[l_col];
-      const int spatial_h_ptr = l_col << 1;
-      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
-      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
-      const int value_ptr_offset =
-          data_value_ptr_init_offset + level_start_id * qid_stride;
-      const data_t *data_value_ptr = data_value + value_ptr_offset;
-      data_t *grad_value_ptr = grad_value + value_ptr_offset;
-
-      for (int p_col = 0; p_col < num_points; ++p_col) {
-        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];
-        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
-        const data_t weight = data_attn_weight[data_weight_ptr];
-
-        const data_t h_im = loc_h * spatial_h - 0.5;
-        const data_t w_im = loc_w * spatial_w - 0.5;
-        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
-          deformable_attn_bilinear_backward_gm(
-              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
-              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
-              grad_sampling_loc, grad_attn_weight);
-        }
-        data_weight_ptr += 1;
-        data_loc_w_ptr += 2;
-        grad_attn_weight += grad_weight_stride;
-        grad_sampling_loc += grad_loc_stride;
-      }
-    }
-  }
-}
-
-// channels <= 1024
-template <typename data_t, unsigned int blockSize>
-__global__ void
-deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1(
-    const int n, const data_t *grad_col, const data_t *data_value,
-    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
-    const data_t *data_sampling_loc, const data_t *data_attn_weight,
-    const int batch_size, const int value_length, const int num_heads,
-    const int channels, const int num_levels, const int query_length,
-    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,
-    data_t *grad_attn_weight) {
-  CUDA_KERNEL_LOOP(index, n) {
-    __shared__ data_t cache_grad_sampling_loc[blockSize * 2];
-    __shared__ data_t cache_grad_attn_weight[blockSize];
-    unsigned int tid = threadIdx.x;
-    int _temp = index;
-    const int c_col = _temp % channels;
-    _temp /= channels;
-    const int sampling_index = _temp;
-    const int m_col = _temp % num_heads;
-    _temp /= num_heads;
-    const int q_col = _temp % query_length;
-    _temp /= query_length;
-    const int b_col = _temp;
-
-    const data_t top_grad = grad_col[index];
-
-    int data_weight_ptr = sampling_index * num_levels * num_points;
-    int data_loc_w_ptr = data_weight_ptr << 1;
-    const int grad_sampling_ptr = data_weight_ptr;
-    grad_sampling_loc += grad_sampling_ptr << 1;
-    grad_attn_weight += grad_sampling_ptr;
-    const int grad_weight_stride = 1;
-    const int grad_loc_stride = 2;
-    const int qid_stride = num_heads * channels;
-    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;
-
-    for (int l_col = 0; l_col < num_levels; ++l_col) {
-      const int level_start_id = data_level_start_index[l_col];
-      const int spatial_h_ptr = l_col << 1;
-      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
-      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
-      const int value_ptr_offset =
-          data_value_ptr_init_offset + level_start_id * qid_stride;
-      const data_t *data_value_ptr = data_value + value_ptr_offset;
-      data_t *grad_value_ptr = grad_value + value_ptr_offset;
-
-      for (int p_col = 0; p_col < num_points; ++p_col) {
-        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];
-        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
-        const data_t weight = data_attn_weight[data_weight_ptr];
-
-        const data_t h_im = loc_h * spatial_h - 0.5;
-        const data_t w_im = loc_w * spatial_w - 0.5;
-        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
-        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
-        *(cache_grad_attn_weight + threadIdx.x) = 0;
-        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
-          deformable_attn_bilinear_backward(
-              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
-              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
-              cache_grad_sampling_loc + (threadIdx.x << 1),
-              cache_grad_attn_weight + threadIdx.x);
-        }
-
-        __syncthreads();
-        if (tid == 0) {
-          data_t _grad_w = cache_grad_sampling_loc[0],
-                 _grad_h = cache_grad_sampling_loc[1],
-                 _grad_a = cache_grad_attn_weight[0];
-          int sid = 2;
-          for (unsigned int tid = 1; tid < blockSize; ++tid) {
-            _grad_w += cache_grad_sampling_loc[sid];
-            _grad_h += cache_grad_sampling_loc[sid + 1];
-            _grad_a += cache_grad_attn_weight[tid];
-            sid += 2;
-          }
-
-          *grad_sampling_loc = _grad_w;
-          *(grad_sampling_loc + 1) = _grad_h;
-          *grad_attn_weight = _grad_a;
-        }
-        __syncthreads();
-
-        data_weight_ptr += 1;
-        data_loc_w_ptr += 2;
-        grad_attn_weight += grad_weight_stride;
-        grad_sampling_loc += grad_loc_stride;
-      }
-    }
-  }
-}
-
-template <typename data_t, unsigned int blockSize>
-__global__ void
-deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2(
-    const int n, const data_t *grad_col, const data_t *data_value,
-    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
-    const data_t *data_sampling_loc, const data_t *data_attn_weight,
-    const int batch_size, const int value_length, const int num_heads,
-    const int channels, const int num_levels, const int query_length,
-    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,
-    data_t *grad_attn_weight) {
-  CUDA_KERNEL_LOOP(index, n) {
-    __shared__ data_t cache_grad_sampling_loc[blockSize * 2];
-    __shared__ data_t cache_grad_attn_weight[blockSize];
-    unsigned int tid = threadIdx.x;
-    int _temp = index;
-    const int c_col = _temp % channels;
-    _temp /= channels;
-    const int sampling_index = _temp;
-    const int m_col = _temp % num_heads;
-    _temp /= num_heads;
-    const int q_col = _temp % query_length;
-    _temp /= query_length;
-    const int b_col = _temp;
-
-    const data_t top_grad = grad_col[index];
-
-    int data_weight_ptr = sampling_index * num_levels * num_points;
-    int data_loc_w_ptr = data_weight_ptr << 1;
-    const int grad_sampling_ptr = data_weight_ptr;
-    grad_sampling_loc += grad_sampling_ptr << 1;
-    grad_attn_weight += grad_sampling_ptr;
-    const int grad_weight_stride = 1;
-    const int grad_loc_stride = 2;
-    const int qid_stride = num_heads * channels;
-    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;
-
-    for (int l_col = 0; l_col < num_levels; ++l_col) {
-      const int level_start_id = data_level_start_index[l_col];
-      const int spatial_h_ptr = l_col << 1;
-      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
-      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
-      const int value_ptr_offset =
-          data_value_ptr_init_offset + level_start_id * qid_stride;
-      const data_t *data_value_ptr = data_value + value_ptr_offset;
-      data_t *grad_value_ptr = grad_value + value_ptr_offset;
-
-      for (int p_col = 0; p_col < num_points; ++p_col) {
-        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];
-        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
-        const data_t weight = data_attn_weight[data_weight_ptr];
-
-        const data_t h_im = loc_h * spatial_h - 0.5;
-        const data_t w_im = loc_w * spatial_w - 0.5;
-        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
-        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
-        *(cache_grad_attn_weight + threadIdx.x) = 0;
-        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
-          deformable_attn_bilinear_backward(
-              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
-              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
-              cache_grad_sampling_loc + (threadIdx.x << 1),
-              cache_grad_attn_weight + threadIdx.x);
-        }
-
-        __syncthreads();
-
-        for (unsigned int s = blockSize / 2; s > 0; s >>= 1) {
-          if (tid < s) {
-            const unsigned int xid1 = tid << 1;
-            const unsigned int xid2 = (tid + s) << 1;
-            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
-            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
-            cache_grad_sampling_loc[xid1 + 1] +=
-                cache_grad_sampling_loc[xid2 + 1];
-          }
-          __syncthreads();
-        }
-
-        if (tid == 0) {
-          *grad_sampling_loc = cache_grad_sampling_loc[0];
-          *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
-          *grad_attn_weight = cache_grad_attn_weight[0];
-        }
-        __syncthreads();
-
-        data_weight_ptr += 1;
-        data_loc_w_ptr += 2;
-        grad_attn_weight += grad_weight_stride;
-        grad_sampling_loc += grad_loc_stride;
-      }
-    }
-  }
-}
-
-template <typename data_t>
-__global__ void deformable_attn_cuda_kernel_backward_shm_reduce_v1(
-    const int n, const data_t *grad_col, const data_t *data_value,
-    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
-    const data_t *data_sampling_loc, const data_t *data_attn_weight,
-    const int batch_size, const int value_length, const int num_heads,
-    const int channels, const int num_levels, const int query_length,
-    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,
-    data_t *grad_attn_weight) {
-  CUDA_KERNEL_LOOP(index, n) {
-    extern __shared__ int _s[];
-    data_t *cache_grad_sampling_loc = (data_t *)_s;
-    data_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
-    unsigned int tid = threadIdx.x;
-    int _temp = index;
-    const int c_col = _temp % channels;
-    _temp /= channels;
-    const int sampling_index = _temp;
-    const int m_col = _temp % num_heads;
-    _temp /= num_heads;
-    const int q_col = _temp % query_length;
-    _temp /= query_length;
-    const int b_col = _temp;
-
-    const data_t top_grad = grad_col[index];
-
-    int data_weight_ptr = sampling_index * num_levels * num_points;
-    int data_loc_w_ptr = data_weight_ptr << 1;
-    const int grad_sampling_ptr = data_weight_ptr;
-    grad_sampling_loc += grad_sampling_ptr << 1;
-    grad_attn_weight += grad_sampling_ptr;
-    const int grad_weight_stride = 1;
-    const int grad_loc_stride = 2;
-    const int qid_stride = num_heads * channels;
-    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;
-
-    for (int l_col = 0; l_col < num_levels; ++l_col) {
-      const int level_start_id = data_level_start_index[l_col];
-      const int spatial_h_ptr = l_col << 1;
-      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
-      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
-      const int value_ptr_offset =
-          data_value_ptr_init_offset + level_start_id * qid_stride;
-      const data_t *data_value_ptr = data_value + value_ptr_offset;
-      data_t *grad_value_ptr = grad_value + value_ptr_offset;
-
-      for (int p_col = 0; p_col < num_points; ++p_col) {
-        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];
-        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
-        const data_t weight = data_attn_weight[data_weight_ptr];
-
-        const data_t h_im = loc_h * spatial_h - 0.5;
-        const data_t w_im = loc_w * spatial_w - 0.5;
-        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
-        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
-        *(cache_grad_attn_weight + threadIdx.x) = 0;
-        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
-          deformable_attn_bilinear_backward(
-              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
-              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
-              cache_grad_sampling_loc + (threadIdx.x << 1),
-              cache_grad_attn_weight + threadIdx.x);
-        }
-
-        __syncthreads();
-        if (tid == 0) {
-          data_t _grad_w = cache_grad_sampling_loc[0],
-                 _grad_h = cache_grad_sampling_loc[1],
-                 _grad_a = cache_grad_attn_weight[0];
-          int sid = 2;
-          for (unsigned int tid = 1; tid < blockDim.x; ++tid) {
-            _grad_w += cache_grad_sampling_loc[sid];
-            _grad_h += cache_grad_sampling_loc[sid + 1];
-            _grad_a += cache_grad_attn_weight[tid];
-            sid += 2;
-          }
-
-          *grad_sampling_loc = _grad_w;
-          *(grad_sampling_loc + 1) = _grad_h;
-          *grad_attn_weight = _grad_a;
-        }
-        __syncthreads();
-
-        data_weight_ptr += 1;
-        data_loc_w_ptr += 2;
-        grad_attn_weight += grad_weight_stride;
-        grad_sampling_loc += grad_loc_stride;
-      }
-    }
-  }
-}
-
-template <typename data_t>
-__global__ void deformable_attn_cuda_kernel_backward_shm_reduce_v2(
-    const int n, const data_t *grad_col, const data_t *data_value,
-    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
-    const data_t *data_sampling_loc, const data_t *data_attn_weight,
-    const int batch_size, const int value_length, const int num_heads,
-    const int channels, const int num_levels, const int query_length,
-    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,
-    data_t *grad_attn_weight) {
-  CUDA_KERNEL_LOOP(index, n) {
-    extern __shared__ int _s[];
-    data_t *cache_grad_sampling_loc = (data_t *)_s;
-    data_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
-    unsigned int tid = threadIdx.x;
-    int _temp = index;
-    const int c_col = _temp % channels;
-    _temp /= channels;
-    const int sampling_index = _temp;
-    const int m_col = _temp % num_heads;
-    _temp /= num_heads;
-    const int q_col = _temp % query_length;
-    _temp /= query_length;
-    const int b_col = _temp;
-
-    const data_t top_grad = grad_col[index];
-
-    int data_weight_ptr = sampling_index * num_levels * num_points;
-    int data_loc_w_ptr = data_weight_ptr << 1;
-    const int grad_sampling_ptr = data_weight_ptr;
-    grad_sampling_loc += grad_sampling_ptr << 1;
-    grad_attn_weight += grad_sampling_ptr;
-    const int grad_weight_stride = 1;
-    const int grad_loc_stride = 2;
-    const int qid_stride = num_heads * channels;
-    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;
-
-    for (int l_col = 0; l_col < num_levels; ++l_col) {
-      const int level_start_id = data_level_start_index[l_col];
-      const int spatial_h_ptr = l_col << 1;
-      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
-      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
-      const int value_ptr_offset =
-          data_value_ptr_init_offset + level_start_id * qid_stride;
-      const data_t *data_value_ptr = data_value + value_ptr_offset;
-      data_t *grad_value_ptr = grad_value + value_ptr_offset;
-
-      for (int p_col = 0; p_col < num_points; ++p_col) {
-        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];
-        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
-        const data_t weight = data_attn_weight[data_weight_ptr];
-
-        const data_t h_im = loc_h * spatial_h - 0.5;
-        const data_t w_im = loc_w * spatial_w - 0.5;
-        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
-        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
-        *(cache_grad_attn_weight + threadIdx.x) = 0;
-        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
-          deformable_attn_bilinear_backward(
-              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
-              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
-              cache_grad_sampling_loc + (threadIdx.x << 1),
-              cache_grad_attn_weight + threadIdx.x);
-        }
-
-        __syncthreads();
-
-        for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0;
-             s >>= 1, spre >>= 1) {
-          if (tid < s) {
-            const unsigned int xid1 = tid << 1;
-            const unsigned int xid2 = (tid + s) << 1;
-            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
-            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
-            cache_grad_sampling_loc[xid1 + 1] +=
-                cache_grad_sampling_loc[xid2 + 1];
-            if (tid + (s << 1) < spre) {
-              cache_grad_attn_weight[tid] +=
-                  cache_grad_attn_weight[tid + (s << 1)];
-              cache_grad_sampling_loc[xid1] +=
-                  cache_grad_sampling_loc[xid2 + (s << 1)];
-              cache_grad_sampling_loc[xid1 + 1] +=
-                  cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
-            }
-          }
-          __syncthreads();
-        }
-
-        if (tid == 0) {
-          *grad_sampling_loc = cache_grad_sampling_loc[0];
-          *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
-          *grad_attn_weight = cache_grad_attn_weight[0];
-        }
-        __syncthreads();
-
-        data_weight_ptr += 1;
-        data_loc_w_ptr += 2;
-        grad_attn_weight += grad_weight_stride;
-        grad_sampling_loc += grad_loc_stride;
-      }
-    }
-  }
-}
-
-// backward branch
-template <typename data_t>
-void deformable_attn_cuda_backward(
-    cudaStream_t stream, const data_t *grad_out, const data_t *data_value,
-    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
-    const data_t *data_sampling_loc, const data_t *data_attn_weight,
-    const int batch_size, const int value_length, const int num_heads,
-    const int channels, const int num_levels, const int query_length,
-    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,
-    data_t *grad_attn_weight) {
-  const int num_threads =
-      (channels > CUDA_NUM_THREADS) ? CUDA_NUM_THREADS : channels;
-  const int num_kernels = batch_size * query_length * num_heads * channels;
-  const int num_actual_kernels =
-      batch_size * query_length * num_heads * channels;
-  if (channels > 1024) {
-    if ((channels & 1023) == 0) {
-      deformable_attn_cuda_kernel_backward_shm_reduce_v2_multi_blocks<data_t>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
-             num_threads * 3 * sizeof(data_t), stream>>>(
-              num_kernels, grad_out, data_value, data_spatial_shapes,
-              data_level_start_index, data_sampling_loc, data_attn_weight,
-              batch_size, value_length, num_heads, channels, num_levels,
-              query_length, num_points, grad_value, grad_sampling_loc,
-              grad_attn_weight);
-    } else {
-      deformable_attn_cuda_kernel_backward_gm<data_t>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-    }
-  } else {
-    switch (channels) {
-    case 1:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1<data_t,
-                                                                         1>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 2:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1<data_t,
-                                                                         2>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 4:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1<data_t,
-                                                                         4>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 8:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1<data_t,
-                                                                         8>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 16:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1<data_t,
-                                                                         16>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 32:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1<data_t,
-                                                                         32>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 64:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2<data_t,
-                                                                         64>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 128:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2<data_t,
-                                                                         128>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 256:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2<data_t,
-                                                                         256>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 512:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2<data_t,
-                                                                         512>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 1024:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2<data_t,
-                                                                         1024>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    default:
-      if (channels < 64) {
-        deformable_attn_cuda_kernel_backward_shm_reduce_v1<data_t>
-            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
-               num_threads * 3 * sizeof(data_t), stream>>>(
-                num_kernels, grad_out, data_value, data_spatial_shapes,
-                data_level_start_index, data_sampling_loc, data_attn_weight,
-                batch_size, value_length, num_heads, channels, num_levels,
-                query_length, num_points, grad_value, grad_sampling_loc,
-                grad_attn_weight);
-      } else {
-        deformable_attn_cuda_kernel_backward_shm_reduce_v2<data_t>
-            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
-               num_threads * 3 * sizeof(data_t), stream>>>(
-                num_kernels, grad_out, data_value, data_spatial_shapes,
-                data_level_start_index, data_sampling_loc, data_attn_weight,
-                batch_size, value_length, num_heads, channels, num_levels,
-                query_length, num_points, grad_value, grad_sampling_loc,
-                grad_attn_weight);
-      }
-    }
-  }
-}
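For reference, the dispatcher above selects a backward kernel by channel count: channels > 1024 uses the multi-block shared-memory reduction when channels is a multiple of 1024 and the purely atomic global-memory kernel otherwise; channels <= 1024 uses the statically sized blocksize-aware kernels for power-of-two sizes and the dynamic shared-memory reductions for the rest. The dynamic shared-memory footprint it passes to the *_shm_reduce_* launches is three data_t slots per thread; restated for float inputs (a reading aid only):

    // 2 * num_threads slots for grad_sampling_loc + num_threads slots for
    // grad_attn_weight, matching the num_threads * 3 * sizeof(data_t) argument.
    const size_t shared_mem_bytes =
        static_cast<size_t>(num_threads) * 3 * sizeof(float);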
-
-// backward
-std::vector<paddle::Tensor> MSDeformableAttnCUDABackward(
-    const paddle::Tensor &value, const paddle::Tensor &value_spatial_shapes,
-    const paddle::Tensor &value_level_start_index,
-    const paddle::Tensor &sampling_locations,
-    const paddle::Tensor &attention_weights, const paddle::Tensor &grad_out) {
-
-  CHECK_INPUT_GPU(value);
-  CHECK_INPUT_GPU(value_spatial_shapes);
-  CHECK_INPUT_GPU(value_level_start_index);
-  CHECK_INPUT_GPU(sampling_locations);
-  CHECK_INPUT_GPU(attention_weights);
-  CHECK_INPUT_GPU(grad_out);
-
-  const int batch_size = value.shape()[0];
-  const int value_length = value.shape()[1];
-  const int num_heads = value.shape()[2];
-  const int channels = value.shape()[3];
-
-  const int num_levels = value_spatial_shapes.shape()[0];
-  const int query_length = sampling_locations.shape()[1];
-  const int num_points = sampling_locations.shape()[4];
-
-  auto grad_value =
-      paddle::full(value.shape(), 0, value.dtype(), paddle::GPUPlace());
-  auto grad_spatial_shapes =
-      paddle::full(value.shape(), 0, value.dtype(), paddle::GPUPlace());
-  auto grad_level_start_index =
-      paddle::full(value.shape(), 0, value.dtype(), paddle::GPUPlace());
-  auto grad_sampling_locations =
-      paddle::full(sampling_locations.shape(), 0, sampling_locations.dtype(),
-                   paddle::GPUPlace());
-  auto grad_attention_weights =
-      paddle::full(attention_weights.shape(), 0, attention_weights.dtype(),
-                   paddle::GPUPlace());
-
-  deformable_attn_cuda_backward<float>(
-      value.stream(), grad_out.data<float>(), value.data<float>(),
-      value_spatial_shapes.data<int64_t>(),
-      value_level_start_index.data<int64_t>(), sampling_locations.data<float>(),
-      attention_weights.data<float>(), batch_size, value_length, num_heads,
-      channels, num_levels, query_length, num_points, grad_value.data<float>(),
-      grad_sampling_locations.data<float>(),
-      grad_attention_weights.data<float>());
-
-  return {grad_value, grad_spatial_shapes, grad_level_start_index,
-          grad_sampling_locations, grad_attention_weights};
-}
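Both entry points are normally exposed to Python by registering them as a Paddle custom operator. A minimal registration sketch, assuming the standard paddle/extension.h macros; the op name and the input/output labels are illustrative and not taken from this diff:

    PD_BUILD_OP(ms_deformable_attn)
        .Inputs({"Value", "SpatialShapes", "LevelIndex", "SamplingLocations",
                 "AttentionWeights"})
        .Outputs({"Out"})
        .SetKernelFn(PD_KERNEL(MSDeformableAttnCUDAForward));

    PD_BUILD_GRAD_OP(ms_deformable_attn)
        .Inputs({"Value", "SpatialShapes", "LevelIndex", "SamplingLocations",
                 "AttentionWeights", paddle::Grad("Out")})
        .Outputs({paddle::Grad("Value"), paddle::Grad("SpatialShapes"),
                  paddle::Grad("LevelIndex"), paddle::Grad("SamplingLocations"),
                  paddle::Grad("AttentionWeights")})
        .SetKernelFn(PD_KERNEL(MSDeformableAttnCUDABackward));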