yjh0410 · 1 year ago · commit 4a379c0f58

+ 0 - 5
config/__init__.py

@@ -91,8 +91,6 @@ from .model_config.yolov5_config import yolov5_cfg, yolov5_adamw_cfg
 from .model_config.yolov7_config import yolov7_cfg
 from .model_config.yolov8_config import yolov8_cfg
 from .model_config.yolox_config  import yolox_cfg
-## Real-time DETR series
-from .model_config.rtdetr_config import rtdetr_cfg
 
 def build_model_config(args):
     print('==============================')
@@ -124,9 +122,6 @@ def build_model_config(args):
     # YOLOX
     elif args.model in ['yolox_n', 'yolox_s', 'yolox_m', 'yolox_l', 'yolox_x']:
         cfg = yolox_cfg[args.model]
-    # RT-DETR
-    elif args.model in ['rtdetr_r18', 'rtdetr_r34', 'rtdetr_r50', 'rtdetr_r101']:
-        cfg = rtdetr_cfg[args.model]
 
     return cfg
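A minimal, hypothetical sketch of how this dispatcher is typically called: only `args.model` is read in the hunk above, so a bare namespace is enough to fetch a registered config dict (the model name below is one of the YOLOX entries listed above; the rest of `build_model_config` is assumed not to touch other fields).

```python
# Hypothetical usage sketch of build_model_config; assumes the repo root is on
# sys.path and that only args.model is consulted, as in the hunk above.
from types import SimpleNamespace
from config import build_model_config

args = SimpleNamespace(model='yolox_s')
cfg = build_model_config(args)   # returns yolox_cfg['yolox_s']
```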
 

+ 0 - 175
config/model_config/rtdetr_config.py

@@ -1,175 +0,0 @@
-# Real-time Transformer-based Object Detector
-
-
-# ------------------- Det task --------------------
-rtdetr_cfg = {
-    'rtdetr_r18':{
-        # ---------------- Model config ----------------
-        ## Image Encoder - Backbone
-        'backbone': 'resnet18',
-        'backbone_norm': 'BN',
-        'pretrained': True,
-        'pretrained_weight': 'imagenet1k_v1',
-        'freeze_at': 0,
-        'freeze_stem_only': False,
-        'out_stride': [8, 16, 32],
-        'max_stride': 32,
-        ## Image Encoder - FPN
-        'fpn': 'hybrid_encoder',
-        'fpn_num_blocks': 3,
-        'fpn_act': 'silu',
-        'fpn_norm': 'BN',
-        'fpn_depthwise': False,
-        'hidden_dim': 256,
-        'en_num_heads': 8,
-        'en_num_layers': 1,
-        'en_ffn_dim': 1024,
-        'en_dropout': 0.0,
-        'pe_temperature': 10000.,
-        'en_act': 'gelu',
-        # Transformer Decoder
-        'transformer': 'rtdetr_transformer',
-        'de_num_heads': 8,
-        'de_num_layers': 3,
-        'de_ffn_dim': 1024,
-        'de_dropout': 0.0,
-        'de_act': 'relu',
-        'de_num_points': 4,
-        'num_queries': 300,
-        'learnt_init_query': False,
-        'pe_temperature': 10000.,
-        'dn_num_denoising': 100,
-        'dn_label_noise_ratio': 0.5,
-        'dn_box_noise_scale': 1,
-        # ---------------- Assignment config ----------------
-        'matcher_hpy': {'cost_class': 2.0,
-                        'cost_bbox': 5.0,
-                        'cost_giou': 2.0,},
-        # ---------------- Loss config ----------------
-        'use_vfl': True,
-        'loss_coeff': {'class': 1,
-                       'bbox': 5,
-                       'giou': 2,},
-        # ---------------- Train config ----------------
-        ## input
-        'multi_scale': [0.5, 1.25],   # 320 -> 800
-        'trans_type': 'rtdetr_base',
-        # ---------------- Train config ----------------
-        'trainer_type': 'rtdetr',
-    },
-
-    'rtdetr_r50':{
-        # ---------------- Model config ----------------
-        ## Image Encoder - Backbone
-        'backbone': 'resnet50',
-        'backbone_norm': 'FrozeBN',
-        'pretrained': True,
-        'pretrained_weight': 'imagenet1k_v2',
-        'freeze_at': 0,
-        'freeze_stem_only': False,
-        'out_stride': [8, 16, 32],
-        'max_stride': 32,
-        ## Image Encoder - FPN
-        'fpn': 'hybrid_encoder',
-        'fpn_num_blocks': 3,
-        'fpn_act': 'silu',
-        'fpn_norm': 'BN',
-        'fpn_depthwise': False,
-        'hidden_dim': 256,
-        'en_num_heads': 8,
-        'en_num_layers': 1,
-        'en_ffn_dim': 2048,
-        'en_dropout': 0.0,
-        'pe_temperature': 10000.,
-        'en_act': 'gelu',
-        # Transformer Decoder
-        'transformer': 'rtdetr_transformer',
-        'de_num_heads': 8,
-        'de_num_layers': 6,
-        'de_ffn_dim': 2048,
-        'de_dropout': 0.0,
-        'de_act': 'relu',
-        'de_num_points': 4,
-        'num_queries': 300,
-        'learnt_init_query': False,
-        'pe_temperature': 10000.,
-        'dn_num_denoising': 100,
-        'dn_label_noise_ratio': 0.5,
-        'dn_box_noise_scale': 1,
-        # Head
-        'det_head': 'dino_head',
-        # ---------------- Assignment config ----------------
-        'matcher_hpy': {'cost_class': 2.0,
-                        'cost_bbox': 5.0,
-                        'cost_giou': 2.0,},
-        # ---------------- Loss config ----------------
-        'use_vfl': True,
-        'loss_coeff': {'class': 1,
-                       'bbox': 5,
-                       'giou': 2,},
-        # ---------------- Train config ----------------
-        ## input
-        'multi_scale': [0.5, 1.25],   # 320 -> 800
-        'trans_type': 'rtdetr_base',
-        # ---------------- Train config ----------------
-        'trainer_type': 'rtdetr',
-    },
-
-    'rtdetr_r101':{
-        # ---------------- Model config ----------------
-        ## Image Encoder - Backbone
-        'backbone': 'resnet101',
-        'backbone_norm': 'FrozeBN',
-        'pretrained': True,
-        'pretrained_weight': 'imagenet1k_v2',
-        'freeze_at': 0,
-        'freeze_stem_only': False,
-        'out_stride': [8, 16, 32],
-        'max_stride': 32,
-        ## Image Encoder - FPN
-        'fpn': 'hybrid_encoder',
-        'fpn_num_blocks': 4,
-        'fpn_act': 'silu',
-        'fpn_norm': 'BN',
-        'fpn_depthwise': False,
-        'hidden_dim': 384,
-        'en_num_heads': 8,
-        'en_num_layers': 1,
-        'en_ffn_dim': 2048,
-        'en_dropout': 0.0,
-        'pe_temperature': 10000.,
-        'en_act': 'gelu',
-        # Transformer Decoder
-        'transformer': 'rtdetr_transformer',
-        'de_num_heads': 8,
-        'de_num_layers': 6,
-        'de_ffn_dim': 2048,
-        'de_dropout': 0.0,
-        'de_act': 'relu',
-        'de_num_points': 4,
-        'num_queries': 300,
-        'learnt_init_query': False,
-        'pe_temperature': 10000.,
-        'dn_num_denoising': 100,
-        'dn_label_noise_ratio': 0.5,
-        'dn_box_noise_scale': 1,
-        # Head
-        'det_head': 'dino_head',
-        # ---------------- Assignment config ----------------
-        'matcher_hpy': {'cost_class': 2.0,
-                        'cost_bbox': 5.0,
-                        'cost_giou': 2.0,},
-        # ---------------- Loss config ----------------
-        'use_vfl': True,
-        'loss_coeff': {'class': 1,
-                       'bbox': 5,
-                       'giou': 2,},
-        # ---------------- Train config ----------------
-        ## input
-        'multi_scale': [0.5, 1.25],   # 320 -> 800
-        'trans_type': 'rtdetr_base',
-        # ---------------- Train config ----------------
-        'trainer_type': 'rtdetr',
-    },
-
-}

+ 0 - 6
models/detectors/__init__.py

@@ -11,8 +11,6 @@ from .yolov5.build import build_yolov5
 from .yolov7.build import build_yolov7
 from .yolov8.build import build_yolov8
 from .yolox.build import build_yolox
-# Real-time DETR series
-from .rtdetr.build import build_rtdetr
 
 
 # build object detector
@@ -62,10 +60,6 @@ def build_model(args,
     elif args.model in ['yolox_n_adamw', 'yolox_s_adamw', 'yolox_m_adamw', 'yolox_l_adamw', 'yolox_x_adamw']:
         model, criterion = build_yolox(
             args, model_cfg, device, num_classes, trainable, deploy)
-    # RT-DETR
-    elif args.model in ['rtdetr_r18', 'rtdetr_r34', 'rtdetr_r50', 'rtdetr_r101']:
-        model, criterion = build_rtdetr(
-            args, model_cfg, num_classes, trainable, deploy)
 
     if trainable:
         # Load pretrained weight

+ 0 - 59
models/detectors/rtdetr/README.md

@@ -1,59 +0,0 @@
-# Real-time Transformer-based Object Detector:
-This model is not yet complete.
-
-## Results on COCO-val
-|     Model    | Batch | Scale | AP<sup>val<br>0.5:0.95 | AP<sup>val<br>0.5 | FLOPs<br><sup>(G) | Params<br><sup>(M) | Weight | Log |
-|--------------|-------|-------|------------------------|-------------------|-------------------|--------------------|--------|-----|
-| RT-DETR-R18  | 4xb4  |  640  |           45.5         |        63.0       |        66.8       |        21.0        | [ckpt](https://github.com/yjh0410/RT-ODLab/releases/download/detr_series_ckpt/rtdetr_r18_coco.pth) | [log](https://github.com/yjh0410/RT-ODLab/releases/download/detr_series_ckpt/RT-DETR-R18-COCO.txt)|
-| RT-DETR-R50  | 4xb4  |  640  |           50.2         |        68.5       |       113.7       |        40.4        | [ckpt](https://github.com/yjh0410/RT-ODLab/releases/download/detr_series_ckpt/rtdetr_r50_coco.pth) | [log](https://github.com/yjh0410/RT-ODLab/releases/download/detr_series_ckpt/RT-DETR-R50-COCO.txt)|
-| RT-DETR-R101 | 4xb4  |  640  |                        |                   |                   |                    |  | |
-
-- For the backbone of the image encoder, we use the ImageNet-1K classification pretrained weights from torchvision, which differs from the official RT-DETR. Training RT-DETR from scratch without IN-1K pretrained weights is likely to be difficult.
-- For the HybridEncoder, we use the C2f block of YOLOv8 rather than the CSPRepLayer.
-- For training, we train the RT-DETR series with a 6x schedule (~72 epochs) on COCO and use the ModelEMA trick. We disable fp16 training.
-- For data augmentation, we use `color jitter`, `random hflip`, `random crop`, and multi-scale training.
-- For the optimizer, we use AdamW with weight decay 0.0001 and a base learning rate of 0.0001 / 16 per image (see the sketch after this list).
-- For the learning rate schedule, we use a constant learning rate (0.0001), following the official setting.
-- For post-processing, we find that applying NMS is still slightly helpful, even though it is not essential.
-
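A minimal sketch of the optimizer setup described in the notes above, assuming PyTorch and a total batch size of 16; the actual parameter-group handling in `train.py` may differ.

```python
# Sketch only: constant AdamW learning rate derived from the per-image base lr.
import torch

base_lr_per_img = 0.0001 / 16            # base per-image lr from the notes above
batch_size      = 16                     # e.g. 4 GPUs x 4 images per GPU
lr = base_lr_per_img * batch_size        # = 0.0001, kept constant during training

model = torch.nn.Linear(8, 8)            # stand-in for the detector
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.0001)
```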
-## Train RT-DETR
-### Single GPU
-Taking RT-DETR-R18 training on COCO as an example:
-```Shell
-python train.py --cuda -d coco --root path/to/coco -m rtdetr_r18 -bs 16 -size 640 --max_epoch 72 --eval_epoch 1 --ema --multi_scale 
-```
-
-### Multi GPU
-Taking RT-DETR-R18 training on COCO with 4 GPUs as an example:
-```Shell
-python -m torch.distributed.run --nproc_per_node=4 train.py --cuda -dist -d coco --root /data/datasets/ -m rtdetr_r18 -bs 16 -size 640 --max_epoch 72 --eval_epoch 1 --ema --sybn --multi_scale 
-```
-
-## Test RT-DETR
-Taking RT-DETR-R18 testing on COCO-val as an example:
-```Shell
-python test.py --cuda -d coco --root path/to/coco -m rtdetr_r18 --weight path/to/rtdetr_r18.pth -size 640 -ct 0.4 --show 
-```
-
-## Evaluate RT-DETR
-Taking RT-DETR-R18 evaluation on COCO-val as an example:
-```Shell
-python eval.py --cuda -d coco --root path/to/coco -m rtdetr_r18 --weight path/to/rtdetr_r18.pth -size 640
-```
-
-## Demo
-### Detect with Image
-```Shell
-python demo.py --mode image --path_to_img path/to/image_dirs/ --cuda -m rtdetr_r18 --weight path/to/weight -size 640 -ct 0.4 --show
-```
-
-### Detect with Video
-```Shell
-python demo.py --mode video --path_to_vid path/to/video --cuda -m rtdetr_r18 --weight path/to/weight -size 640 -ct 0.4 --show --gif
-```
-
-### Detect with Camera
-```Shell
-python demo.py --mode camera --cuda -m rtdetr_r18 --weight path/to/weight -size 640 -ct 0.4 --show --gif
-```

+ 0 - 134
models/detectors/rtdetr/basic_modules/backbone.py

@@ -1,134 +0,0 @@
-import torch
-import torchvision
-from torch import nn
-from torchvision.models._utils import IntermediateLayerGetter
-from torchvision.models.resnet import (ResNet18_Weights,
-                                       ResNet34_Weights,
-                                       ResNet50_Weights,
-                                       ResNet101_Weights)
-try:
-    from .basic import FrozenBatchNorm2d
-except:
-    from basic  import FrozenBatchNorm2d
-   
-
-# IN1K pretrained weights
-pretrained_urls = {
-    # ResNet series
-    'resnet18':  ResNet18_Weights,
-    'resnet34':  ResNet34_Weights,
-    'resnet50':  ResNet50_Weights,
-    'resnet101': ResNet101_Weights,
-    # ShuffleNet series
-}
-
-
-# ----------------- Model functions -----------------
-## Build backbone network
-def build_backbone(cfg, pretrained):
-    print('==============================')
-    print('Backbone: {}'.format(cfg['backbone']))
-    # ResNet
-    if 'resnet' in cfg['backbone']:
-        pretrained_weight = cfg['pretrained_weight'] if pretrained else None
-        model, feats = build_resnet(cfg, pretrained_weight)
-    elif 'svnetv2' in cfg['backbone']:
-        pretrained_weight = cfg['pretrained_weight'] if pretrained else None
-        model, feats = build_scnetv2(cfg, pretrained_weight)
-    else:
-        raise NotImplementedError("Unknown backbone: {}.".format(cfg['backbone']))
-    
-    return model, feats
-
-
-# ----------------- ResNet Backbone -----------------
-class ResNet(nn.Module):
-    """ResNet backbone with frozen BatchNorm."""
-    def __init__(self,
-                 name: str,
-                 norm_type: str,
-                 pretrained_weights: str = "imagenet1k_v1",
-                 freeze_at: int = -1,
-                 freeze_stem_only: bool = False):
-        super().__init__()
-        # Pretrained
-        assert pretrained_weights in [None, "imagenet1k_v1", "imagenet1k_v2"]
-        if pretrained_weights is not None:
-            if name in ('resnet18', 'resnet34'):
-                pretrained_weights = pretrained_urls[name].IMAGENET1K_V1
-            else:
-                if pretrained_weights == "imagenet1k_v1":
-                    pretrained_weights = pretrained_urls[name].IMAGENET1K_V1
-                else:
-                    pretrained_weights = pretrained_urls[name].IMAGENET1K_V2
-        else:
-            pretrained_weights = None
-        print('ImageNet pretrained weight: ', pretrained_weights)
-        # Norm layer
-        if norm_type == 'BN':
-            norm_layer = nn.BatchNorm2d
-        elif norm_type == 'FrozeBN':
-            norm_layer = FrozenBatchNorm2d
-        # Backbone
-        backbone = getattr(torchvision.models, name)(norm_layer=norm_layer, weights=pretrained_weights)
-        return_layers = {"layer2": "0", "layer3": "1", "layer4": "2"}
-        self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
-        self.feat_dims = [128, 256, 512] if name in ('resnet18', 'resnet34') else [512, 1024, 2048]
-        # Freeze
-        if freeze_at >= 0:
-            for name, parameter in backbone.named_parameters():
-                if freeze_stem_only:
-                    if 'layer1' not in name and 'layer2' not in name and 'layer3' not in name and 'layer4' not in name:
-                        parameter.requires_grad_(False)
-                else:
-                    if 'layer2' not in name and 'layer3' not in name and 'layer4' not in name:
-                        parameter.requires_grad_(False)
-
-    def forward(self, x):
-        xs = self.body(x)
-        fmp_list = []
-        for name, fmp in xs.items():
-            fmp_list.append(fmp)
-
-        return fmp_list
-
-def build_resnet(cfg, pretrained_weight=None):
-    # ResNet series
-    backbone = ResNet(cfg['backbone'],
-                      cfg['backbone_norm'],
-                      pretrained_weight,
-                      cfg['freeze_at'],
-                      cfg['freeze_stem_only'])
-
-    return backbone, backbone.feat_dims
-
-
-# ----------------- ShuffleNet Backbone -----------------
-## TODO: Add shufflenet-v2
-class ShuffleNetv2:
-    pass
-
-def build_scnetv2(cfg, pretrained_weight=None):
-    return
-
-
-if __name__ == '__main__':
-    cfg = {
-        'backbone':      'resnet18',
-        'backbone_norm': 'BN',
-        'pretrained': True,
-        'freeze_at': -1,
-        'freeze_stem_only': True,
-        'pretrained_weight': 'imagenet1k_v1',
-    }
-    model, feat_dim = build_backbone(cfg, cfg['pretrained'])
-    print(feat_dim)
-
-    x = torch.randn(2, 3, 320, 320)
-    output = model(x)
-    for y in output:
-        print(y.size())
-
-    # for n, p in model.named_parameters():
-    #     print(n.split(".")[-1])
-

+ 0 - 278
models/detectors/rtdetr/basic_modules/basic.py

@@ -1,278 +0,0 @@
-import numpy as np
-import copy
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-# ---------------------------- NMS ----------------------------
-## basic NMS
-def nms(bboxes, scores, nms_thresh):
-    """"Pure Python NMS."""
-    x1 = bboxes[:, 0]  #xmin
-    y1 = bboxes[:, 1]  #ymin
-    x2 = bboxes[:, 2]  #xmax
-    y2 = bboxes[:, 3]  #ymax
-
-    areas = (x2 - x1) * (y2 - y1)
-    order = scores.argsort()[::-1]
-
-    keep = []
-    while order.size > 0:
-        i = order[0]
-        keep.append(i)
-        # compute iou
-        xx1 = np.maximum(x1[i], x1[order[1:]])
-        yy1 = np.maximum(y1[i], y1[order[1:]])
-        xx2 = np.minimum(x2[i], x2[order[1:]])
-        yy2 = np.minimum(y2[i], y2[order[1:]])
-
-        w = np.maximum(1e-10, xx2 - xx1)
-        h = np.maximum(1e-10, yy2 - yy1)
-        inter = w * h
-
-        iou = inter / (areas[i] + areas[order[1:]] - inter + 1e-14)
-        # keep only the boxes whose IoU with box i is below the threshold
-        inds = np.where(iou <= nms_thresh)[0]
-        order = order[inds + 1]
-
-    return keep
-
-## class-agnostic NMS 
-def multiclass_nms_class_agnostic(scores, labels, bboxes, nms_thresh):
-    # nms
-    keep = nms(bboxes, scores, nms_thresh)
-    scores = scores[keep]
-    labels = labels[keep]
-    bboxes = bboxes[keep]
-
-    return scores, labels, bboxes
-
-## class-aware NMS 
-def multiclass_nms_class_aware(scores, labels, bboxes, nms_thresh, num_classes):
-    # nms
-    keep = np.zeros(len(bboxes), dtype=np.int32)
-    for i in range(num_classes):
-        inds = np.where(labels == i)[0]
-        if len(inds) == 0:
-            continue
-        c_bboxes = bboxes[inds]
-        c_scores = scores[inds]
-        c_keep = nms(c_bboxes, c_scores, nms_thresh)
-        keep[inds[c_keep]] = 1
-    keep = np.where(keep > 0)
-    scores = scores[keep]
-    labels = labels[keep]
-    bboxes = bboxes[keep]
-
-    return scores, labels, bboxes
-
-## multi-class NMS 
-def multiclass_nms(scores, labels, bboxes, nms_thresh, num_classes, class_agnostic=False):
-    if class_agnostic:
-        return multiclass_nms_class_agnostic(scores, labels, bboxes, nms_thresh)
-    else:
-        return multiclass_nms_class_aware(scores, labels, bboxes, nms_thresh, num_classes)
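A hypothetical usage sketch of the `multiclass_nms` helper above, assuming NumPy arrays of confidence scores, integer class labels, and xyxy boxes that have already been score-thresholded:

```python
# Sketch only: the lower-scoring overlapping class-0 box is suppressed,
# the class-1 box is kept.
import numpy as np

scores = np.array([0.90, 0.85, 0.30])
labels = np.array([0, 0, 1])
bboxes = np.array([[10, 10, 100, 100],
                   [12, 12,  98, 102],
                   [50, 60,  80,  90]], dtype=np.float32)

scores, labels, bboxes = multiclass_nms(scores, labels, bboxes,
                                        nms_thresh=0.5, num_classes=80,
                                        class_agnostic=False)
```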
-
-
-# ----------------- MLP modules -----------------
-class MLP(nn.Module):
-    def __init__(self, in_dim, hidden_dim, out_dim, num_layers):
-        super().__init__()
-        self.num_layers = num_layers
-        h = [hidden_dim] * (num_layers - 1)
-        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([in_dim] + h, h + [out_dim]))
-
-    def forward(self, x):
-        for i, layer in enumerate(self.layers):
-            x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
-        return x
-
-class FFN(nn.Module):
-    def __init__(self, d_model=256, ffn_dim=1024, dropout=0., act_type='relu'):
-        super().__init__()
-        self.ffn_dim = ffn_dim
-        self.linear1 = nn.Linear(d_model, self.ffn_dim)
-        self.activation = get_activation(act_type)
-        self.dropout2 = nn.Dropout(dropout)
-        self.linear2 = nn.Linear(self.ffn_dim, d_model)
-        self.dropout3 = nn.Dropout(dropout)
-        self.norm = nn.LayerNorm(d_model)
-
-    def forward(self, src):
-        src2 = self.linear2(self.dropout2(self.activation(self.linear1(src))))
-        src = src + self.dropout3(src2)
-        src = self.norm(src)
-        
-        return src
-    
-
-# ----------------- Basic CNN Ops -----------------
-def get_conv2d(c1, c2, k, p, s, g, bias=False):
-    conv = nn.Conv2d(c1, c2, k, stride=s, padding=p, groups=g, bias=bias)
-
-    return conv
-
-def get_activation(act_type=None):
-    if act_type == 'relu':
-        return nn.ReLU(inplace=True)
-    elif act_type == 'lrelu':
-        return nn.LeakyReLU(0.1, inplace=True)
-    elif act_type == 'mish':
-        return nn.Mish(inplace=True)
-    elif act_type == 'silu':
-        return nn.SiLU(inplace=True)
-    elif act_type == 'gelu':
-        return nn.GELU()
-    elif act_type is None:
-        return nn.Identity()
-    else:
-        raise NotImplementedError
-        
-def get_norm(norm_type, dim):
-    if norm_type == 'BN':
-        return nn.BatchNorm2d(dim)
-    elif norm_type == 'GN':
-        return nn.GroupNorm(num_groups=32, num_channels=dim)
-    elif norm_type is None:
-        return nn.Identity()
-    else:
-        raise NotImplementedError
-
-def conv3x3(in_planes: int, out_planes: int, stride: int = 1, groups: int = 1, dilation: int = 1) -> nn.Conv2d:
-    """3x3 convolution with padding"""
-    return nn.Conv2d(
-        in_planes,
-        out_planes,
-        kernel_size=3,
-        stride=stride,
-        padding=dilation,
-        groups=groups,
-        bias=False,
-        dilation=dilation,
-    )
-
-def conv1x1(in_planes: int, out_planes: int, stride: int = 1) -> nn.Conv2d:
-    """1x1 convolution"""
-    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
-
-class FrozenBatchNorm2d(torch.nn.Module):
-    def __init__(self, n):
-        super(FrozenBatchNorm2d, self).__init__()
-        self.register_buffer("weight", torch.ones(n))
-        self.register_buffer("bias", torch.zeros(n))
-        self.register_buffer("running_mean", torch.zeros(n))
-        self.register_buffer("running_var", torch.ones(n))
-
-    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
-                              missing_keys, unexpected_keys, error_msgs):
-        num_batches_tracked_key = prefix + 'num_batches_tracked'
-        if num_batches_tracked_key in state_dict:
-            del state_dict[num_batches_tracked_key]
-
-        super(FrozenBatchNorm2d, self)._load_from_state_dict(
-            state_dict, prefix, local_metadata, strict,
-            missing_keys, unexpected_keys, error_msgs)
-
-    def forward(self, x):
-        # move reshapes to the beginning
-        # to make it fuser-friendly
-        w = self.weight.reshape(1, -1, 1, 1)
-        b = self.bias.reshape(1, -1, 1, 1)
-        rv = self.running_var.reshape(1, -1, 1, 1)
-        rm = self.running_mean.reshape(1, -1, 1, 1)
-        eps = 1e-5
-        scale = w * (rv + eps).rsqrt()
-        bias = b - rm * scale
-        return x * scale + bias
-    
-class BasicConv(nn.Module):
-    def __init__(self, 
-                 in_dim,                   # in channels
-                 out_dim,                  # out channels 
-                 kernel_size=1,            # kernel size 
-                 padding=0,                # padding
-                 stride=1,                 # stride
-                 act_type  :str = 'lrelu', # activation
-                 norm_type :str = 'BN',    # normalization
-                 depthwise :bool = False
-                ):
-        super(BasicConv, self).__init__()
-        add_bias = False if norm_type else True
-        self.depthwise = depthwise
-        if not depthwise:
-            self.conv = get_conv2d(in_dim, out_dim, k=kernel_size, p=padding, s=stride, g=1, bias=add_bias)
-            self.norm = get_norm(norm_type, out_dim)
-        else:
-            self.conv1 = get_conv2d(in_dim, in_dim, k=kernel_size, p=padding, s=stride, g=in_dim, bias=add_bias)  # depthwise conv
-            self.norm1 = get_norm(norm_type, in_dim)
-            self.conv2 = get_conv2d(in_dim, out_dim, k=1, p=0, s=1, g=1, bias=add_bias)  # pointwise 1x1 conv
-            self.norm2 = get_norm(norm_type, out_dim)
-        self.act  = get_activation(act_type)
-
-    def forward(self, x):
-        if not self.depthwise:
-            return self.act(self.norm(self.conv(x)))
-        else:
-            # Depthwise conv
-            x = self.norm1(self.conv1(x))
-            # Pointwise conv
-            x = self.norm2(self.conv2(x))
-            return x
-
-
-# ----------------- CNN Modules -----------------
-class Bottleneck(nn.Module):
-    def __init__(self,
-                 in_dim,
-                 out_dim,
-                 expand_ratio = 0.5,
-                 kernel_sizes = [3, 3],
-                 shortcut     = True,
-                 act_type     = 'silu',
-                 norm_type    = 'BN',
-                 depthwise    = False,):
-        super(Bottleneck, self).__init__()
-        inter_dim = int(out_dim * expand_ratio)
-        paddings = [k // 2 for k in kernel_sizes]
-        self.cv1 = BasicConv(in_dim, inter_dim,
-                             kernel_size=kernel_sizes[0], padding=paddings[0],
-                             act_type=act_type, norm_type=norm_type, depthwise=depthwise)
-        self.cv2 = BasicConv(inter_dim, out_dim,
-                             kernel_size=kernel_sizes[1], padding=paddings[1],
-                             act_type=act_type, norm_type=norm_type, depthwise=depthwise)
-        self.shortcut = shortcut and in_dim == out_dim
-
-    def forward(self, x):
-        h = self.cv2(self.cv1(x))
-
-        return x + h if self.shortcut else h
-
-class RTCBlock(nn.Module):
-    def __init__(self,
-                 in_dim,
-                 out_dim,
-                 num_blocks = 1,
-                 shortcut   = False,
-                 act_type   = 'silu',
-                 norm_type  = 'BN',
-                 depthwise  = False,):
-        super(RTCBlock, self).__init__()
-        self.inter_dim = out_dim // 2
-        self.conv1 = BasicConv(in_dim, self.inter_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
-        self.conv2 = BasicConv(in_dim, self.inter_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
-        self.cmodules = nn.ModuleList([Bottleneck(self.inter_dim, self.inter_dim,
-                                                   1.0, [3, 3], shortcut,
-                                                   act_type, norm_type, depthwise)
-                                                   for _ in range(num_blocks)])
-        self.conv3 = BasicConv(self.inter_dim * (2 + num_blocks), out_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
-
-    def forward(self, x):
-        x1, x2 = self.conv1(x), self.conv2(x)
-        out = [x1, x2]
-        for m in self.cmodules:
-            x2 = m(x2)
-            out.append(x2)
-
-        return self.conv3(torch.cat(out, dim=1))
-    

+ 0 - 109
models/detectors/rtdetr/basic_modules/dn_compoments.py

@@ -1,109 +0,0 @@
-import torch
-
-
-def inverse_sigmoid(x, eps=1e-5):
-    x = x.clamp(min=0., max=1.)
-    return torch.log(x.clamp(min=eps) / (1 - x).clamp(min=eps))
-
-def box_cxcywh_to_xyxy(x):
-    x_c, y_c, w, h = x.unbind(-1)
-    b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
-         (x_c + 0.5 * w), (y_c + 0.5 * h)]
-    return torch.stack(b, dim=-1)
-
-def box_xyxy_to_cxcywh(x):
-    x0, y0, x1, y1 = x.unbind(-1)
-    b = [(x0 + x1) / 2, (y0 + y1) / 2,
-         (x1 - x0), (y1 - y0)]
-    return torch.stack(b, dim=-1)
-
-def get_contrastive_denoising_training_group(targets,
-                                             num_classes,
-                                             num_queries,
-                                             class_embed,
-                                             num_denoising=100,
-                                             label_noise_ratio=0.5,
-                                             box_noise_scale=1.0,):
-    if num_denoising <= 0:
-        return None, None, None, None
-
-    num_gts = [len(t['labels']) for t in targets]
-    device = targets[0]['labels'].device
-    
-    max_gt_num = max(num_gts)
-    if max_gt_num == 0:
-        return None, None, None, None
-
-    num_group = num_denoising // max_gt_num
-    num_group = 1 if num_group == 0 else num_group
-    # pad gt to max_num of a batch
-    bs = len(num_gts)
-
-    input_query_class = torch.full([bs, max_gt_num], num_classes, dtype=torch.int32, device=device)
-    input_query_bbox = torch.zeros([bs, max_gt_num, 4], device=device)
-    pad_gt_mask = torch.zeros([bs, max_gt_num], dtype=torch.bool, device=device)
-
-    for i in range(bs):
-        num_gt = num_gts[i]
-        if num_gt > 0:
-            input_query_class[i, :num_gt] = targets[i]['labels']
-            input_query_bbox[i, :num_gt] = targets[i]['boxes']
-            pad_gt_mask[i, :num_gt] = 1
-    # each group has positive and negative queries.
-    input_query_class = input_query_class.tile([1, 2 * num_group])
-    input_query_bbox = input_query_bbox.tile([1, 2 * num_group, 1])
-    pad_gt_mask = pad_gt_mask.tile([1, 2 * num_group])
-    # positive and negative mask
-    negative_gt_mask = torch.zeros([bs, max_gt_num * 2, 1], device=device)
-    negative_gt_mask[:, max_gt_num:] = 1
-    negative_gt_mask = negative_gt_mask.tile([1, num_group, 1])
-    positive_gt_mask = 1 - negative_gt_mask
-    # contrastive denoising training positive index
-    positive_gt_mask = positive_gt_mask.squeeze(-1) * pad_gt_mask
-    dn_positive_idx = torch.nonzero(positive_gt_mask)[:, 1]
-    dn_positive_idx = torch.split(dn_positive_idx, [n * num_group for n in num_gts])
-    # total denoising queries
-    num_denoising = int(max_gt_num * 2 * num_group)
-
-    if label_noise_ratio > 0:
-        mask = torch.rand_like(input_query_class, dtype=torch.float) < (label_noise_ratio * 0.5)
-        # randomly put a new one here
-        new_label = torch.randint_like(mask, 0, num_classes, dtype=input_query_class.dtype)
-        input_query_class = torch.where(mask & pad_gt_mask, new_label, input_query_class)
-
-    if box_noise_scale > 0:
-        known_bbox = box_cxcywh_to_xyxy(input_query_bbox)
-        diff = torch.tile(input_query_bbox[..., 2:] * 0.5, [1, 1, 2]) * box_noise_scale
-        rand_sign = torch.randint_like(input_query_bbox, 0, 2) * 2.0 - 1.0
-        rand_part = torch.rand_like(input_query_bbox)
-        rand_part = (rand_part + 1.0) * negative_gt_mask + rand_part * (1 - negative_gt_mask)
-        rand_part *= rand_sign
-        known_bbox += rand_part * diff
-        known_bbox.clip_(min=0.0, max=1.0)
-        input_query_bbox = box_xyxy_to_cxcywh(known_bbox)
-        input_query_bbox = inverse_sigmoid(input_query_bbox)
-    input_query_class = class_embed(input_query_class)
-
-    tgt_size = num_denoising + num_queries
-    # attn_mask = torch.ones([tgt_size, tgt_size], device=device) < 0
-    attn_mask = torch.full([tgt_size, tgt_size], False, dtype=torch.bool, device=device)
-    # match query cannot see the reconstruction
-    attn_mask[num_denoising:, :num_denoising] = True
-    
-    # reconstruct cannot see each other
-    for i in range(num_group):
-        if i == 0:
-            attn_mask[max_gt_num * 2 * i: max_gt_num * 2 * (i + 1), max_gt_num * 2 * (i + 1): num_denoising] = True
-        if i == num_group - 1:
-            attn_mask[max_gt_num * 2 * i: max_gt_num * 2 * (i + 1), :max_gt_num * i * 2] = True
-        else:
-            attn_mask[max_gt_num * 2 * i: max_gt_num * 2 * (i + 1), max_gt_num * 2 * (i + 1): num_denoising] = True
-            attn_mask[max_gt_num * 2 * i: max_gt_num * 2 * (i + 1), :max_gt_num * 2 * i] = True
-        
-    dn_meta = {
-        "dn_positive_idx": dn_positive_idx,
-        "dn_num_group": num_group,
-        "dn_num_split": [num_denoising, num_queries]
-    }
-
-    return input_query_class, input_query_bbox, attn_mask, dn_meta

+ 0 - 85
models/detectors/rtdetr/basic_modules/ext_op/README.md

@@ -1,85 +0,0 @@
-# Compiling the custom multi-scale deformable attention OP
-This custom OP is implemented following the [custom external operator](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/custom_op/new_cpp_op_cn.html) guide.
-
-## 1. Requirements
-- Paddle >= 2.3.2
-- gcc 8.2
-
-## 2. Installation
-Compile and install from the current directory:
-```
-cd rtdetr_paddle/ppdet/modeling/transformers/ext_op/
-python setup_ms_deformable_attn_op.py install
-```
-
-Once compiled, the OP is ready to use. Below is a usage example of `ms_deformable_attn`:
-```
-# import paddle and the custom op
-import paddle
-from deformable_detr_ops import ms_deformable_attn
-
-# build fake input tensors
-bs, n_heads, c = 2, 8, 8
-query_length, n_levels, n_points = 2, 2, 2
-spatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64)
-level_start_index = paddle.concat((paddle.to_tensor(
-    [0], dtype=paddle.int64), spatial_shapes.prod(1).cumsum(0)[:-1]))
-value_length = sum([(H * W).item() for H, W in spatial_shapes])
-
-def get_test_tensors(channels):
-    value = paddle.rand(
-        [bs, value_length, n_heads, channels], dtype=paddle.float32) * 0.01
-    sampling_locations = paddle.rand(
-        [bs, query_length, n_heads, n_levels, n_points, 2],
-        dtype=paddle.float32)
-    attention_weights = paddle.rand(
-        [bs, query_length, n_heads, n_levels, n_points],
-        dtype=paddle.float32) + 1e-5
-    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(
-        -2, keepdim=True)
-    return [value, sampling_locations, attention_weights]
-
-value, sampling_locations, attention_weights = get_test_tensors(c)
-
-output = ms_deformable_attn(value,
-                            spatial_shapes,
-                            level_start_index,
-                            sampling_locations,
-                            attention_weights)
-```
-
-## 3. Unit test
-Run the unit test to verify that the custom operator works correctly:
-```
-python test_ms_deformable_attn_op.py
-```
-On success, it prints the following:
-```
-*True check_forward_equal_with_paddle_float: max_abs_err 6.98e-10 max_rel_err 2.03e-07
-*tensor1 True check_gradient_numerical(D=30)
-*tensor2 True check_gradient_numerical(D=30)
-*tensor3 True check_gradient_numerical(D=30)
-*tensor1 True check_gradient_numerical(D=32)
-*tensor2 True check_gradient_numerical(D=32)
-*tensor3 True check_gradient_numerical(D=32)
-*tensor1 True check_gradient_numerical(D=64)
-*tensor2 True check_gradient_numerical(D=64)
-*tensor3 True check_gradient_numerical(D=64)
-*tensor1 True check_gradient_numerical(D=71)
-*tensor2 True check_gradient_numerical(D=71)
-*tensor3 True check_gradient_numerical(D=71)
-*tensor1 True check_gradient_numerical(D=128)
-*tensor2 True check_gradient_numerical(D=128)
-*tensor3 True check_gradient_numerical(D=128)
-*tensor1 True check_gradient_numerical(D=1024)
-*tensor2 True check_gradient_numerical(D=1024)
-*tensor3 True check_gradient_numerical(D=1024)
-*tensor1 True check_gradient_numerical(D=1025)
-*tensor2 True check_gradient_numerical(D=1025)
-*tensor3 True check_gradient_numerical(D=1025)
-*tensor1 True check_gradient_numerical(D=2048)
-*tensor2 True check_gradient_numerical(D=2048)
-*tensor3 True check_gradient_numerical(D=2048)
-*tensor1 True check_gradient_numerical(D=3096)
-*tensor2 True check_gradient_numerical(D=3096)
-*tensor3 True check_gradient_numerical(D=3096)
-```

+ 0 - 65
models/detectors/rtdetr/basic_modules/ext_op/ms_deformable_attn_op.cc

@@ -1,65 +0,0 @@
-/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/extension.h"
-
-#include <vector>
-
-// declare GPU implementation
-std::vector<paddle::Tensor>
-MSDeformableAttnCUDAForward(const paddle::Tensor &value,
-                            const paddle::Tensor &value_spatial_shapes,
-                            const paddle::Tensor &value_level_start_index,
-                            const paddle::Tensor &sampling_locations,
-                            const paddle::Tensor &attention_weights);
-
-std::vector<paddle::Tensor> MSDeformableAttnCUDABackward(
-    const paddle::Tensor &value, const paddle::Tensor &value_spatial_shapes,
-    const paddle::Tensor &value_level_start_index,
-    const paddle::Tensor &sampling_locations,
-    const paddle::Tensor &attention_weights, const paddle::Tensor &grad_out);
-
-//// CPU not implemented
-
-std::vector<std::vector<int64_t>>
-MSDeformableAttnInferShape(std::vector<int64_t> value_shape,
-                           std::vector<int64_t> value_spatial_shapes_shape,
-                           std::vector<int64_t> value_level_start_index_shape,
-                           std::vector<int64_t> sampling_locations_shape,
-                           std::vector<int64_t> attention_weights_shape) {
-  return {{value_shape[0], sampling_locations_shape[1],
-           value_shape[2] * value_shape[3]}};
-}
-
-std::vector<paddle::DataType>
-MSDeformableAttnInferDtype(paddle::DataType value_dtype,
-                           paddle::DataType value_spatial_shapes_dtype,
-                           paddle::DataType value_level_start_index_dtype,
-                           paddle::DataType sampling_locations_dtype,
-                           paddle::DataType attention_weights_dtype) {
-  return {value_dtype};
-}
-
-PD_BUILD_OP(ms_deformable_attn)
-    .Inputs({"Value", "SpatialShapes", "LevelIndex", "SamplingLocations",
-             "AttentionWeights"})
-    .Outputs({"Out"})
-    .SetKernelFn(PD_KERNEL(MSDeformableAttnCUDAForward))
-    .SetInferShapeFn(PD_INFER_SHAPE(MSDeformableAttnInferShape))
-    .SetInferDtypeFn(PD_INFER_DTYPE(MSDeformableAttnInferDtype));
-
-PD_BUILD_GRAD_OP(ms_deformable_attn)
-    .Inputs({"Value", "SpatialShapes", "LevelIndex", "SamplingLocations",
-             "AttentionWeights", paddle::Grad("Out")})
-    .Outputs({paddle::Grad("Value"), paddle::Grad("SpatialShapes"),
-              paddle::Grad("LevelIndex"), paddle::Grad("SamplingLocations"),
-              paddle::Grad("AttentionWeights")})
-    .SetKernelFn(PD_KERNEL(MSDeformableAttnCUDABackward));

+ 0 - 1073
models/detectors/rtdetr/basic_modules/ext_op/ms_deformable_attn_op.cu

@@ -1,1073 +0,0 @@
-/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/extension.h"
-
-#define CUDA_KERNEL_LOOP(i, n)                                                 \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);                 \
-       i += blockDim.x * gridDim.x)
-
-const int CUDA_NUM_THREADS = 1024;
-inline int GET_BLOCKS(const int N, const int num_threads) {
-  return (N + num_threads - 1) / num_threads;
-}
-
-// forward bilinear
-template <typename data_t>
-__device__ data_t deformable_attn_bilinear_forward(
-    const data_t *&bottom_data, const int &height, const int &width,
-    const int &nheads, const int &channels, const data_t &h, const data_t &w,
-    const int &m, const int &c) {
-  const int h_low = floor(h);
-  const int w_low = floor(w);
-  const int h_high = h_low + 1;
-  const int w_high = w_low + 1;
-
-  const data_t lh = h - h_low;
-  const data_t lw = w - w_low;
-  const data_t hh = 1 - lh, hw = 1 - lw;
-
-  const int w_stride = nheads * channels;
-  const int h_stride = width * w_stride;
-  const int h_low_ptr_offset = h_low * h_stride;
-  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
-  const int w_low_ptr_offset = w_low * w_stride;
-  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
-  const int base_ptr = m * channels + c;
-
-  data_t v1 = 0;
-  if (h_low >= 0 && w_low >= 0) {
-    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
-    v1 = bottom_data[ptr1];
-  }
-  data_t v2 = 0;
-  if (h_low >= 0 && w_high <= width - 1) {
-    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
-    v2 = bottom_data[ptr2];
-  }
-  data_t v3 = 0;
-  if (h_high <= height - 1 && w_low >= 0) {
-    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
-    v3 = bottom_data[ptr3];
-  }
-  data_t v4 = 0;
-  if (h_high <= height - 1 && w_high <= width - 1) {
-    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
-    v4 = bottom_data[ptr4];
-  }
-
-  const data_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
-
-  const data_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
-  return val;
-}
-
-// forward kernel
-template <typename data_t>
-__global__ void deformable_attn_cuda_kernel_forward(
-    const int n, const data_t *data_value, const int64_t *data_spatial_shapes,
-    const int64_t *data_level_start_index, const data_t *data_sampling_loc,
-    const data_t *data_attn_weight, const int batch_size,
-    const int value_length, const int num_heads, const int channels,
-    const int num_levels, const int query_length, const int num_points,
-    data_t *output_data_ptr) {
-  CUDA_KERNEL_LOOP(index, n) {
-    int _temp = index;
-    const int c_col = _temp % channels;
-    _temp /= channels;
-    const int sampling_index = _temp;
-    const int m_col = _temp % num_heads;
-    _temp /= num_heads;
-    const int q_col = _temp % query_length;
-    _temp /= query_length;
-    const int b_col = _temp;
-
-    data_t *data_ptr = output_data_ptr + index;
-    int data_weight_ptr = sampling_index * num_levels * num_points;
-    int data_loc_w_ptr = data_weight_ptr << 1;
-    const int qid_stride = num_heads * channels;
-    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;
-    data_t col = 0;
-
-    for (int l_col = 0; l_col < num_levels; ++l_col) {
-      const int level_start_id = data_level_start_index[l_col];
-      const int spatial_h_ptr = l_col << 1;
-      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
-      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
-      const data_t *data_value_ptr = data_value + (data_value_ptr_init_offset +
-                                                   level_start_id * qid_stride);
-      for (int p_col = 0; p_col < num_points; ++p_col) {
-        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];
-        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
-        const data_t weight = data_attn_weight[data_weight_ptr];
-
-        const data_t h_im = loc_h * spatial_h - 0.5;
-        const data_t w_im = loc_w * spatial_w - 0.5;
-
-        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
-          col += deformable_attn_bilinear_forward(
-                     data_value_ptr, spatial_h, spatial_w, num_heads, channels,
-                     h_im, w_im, m_col, c_col) *
-                 weight;
-        }
-
-        data_weight_ptr += 1;
-        data_loc_w_ptr += 2;
-      }
-    }
-    *data_ptr = col;
-  }
-}
-
-#define CHECK_INPUT_GPU(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.")
-// forward
-std::vector<paddle::Tensor>
-MSDeformableAttnCUDAForward(const paddle::Tensor &value,
-                            const paddle::Tensor &value_spatial_shapes,
-                            const paddle::Tensor &value_level_start_index,
-                            const paddle::Tensor &sampling_locations,
-                            const paddle::Tensor &attention_weights) {
-
-  CHECK_INPUT_GPU(value);
-  CHECK_INPUT_GPU(value_spatial_shapes);
-  CHECK_INPUT_GPU(value_level_start_index);
-  CHECK_INPUT_GPU(sampling_locations);
-  CHECK_INPUT_GPU(attention_weights);
-
-  const int batch_size = value.shape()[0];
-  const int value_length = value.shape()[1];
-  const int num_heads = value.shape()[2];
-  const int channels = value.shape()[3];
-
-  const int num_levels = value_spatial_shapes.shape()[0];
-  const int query_length = sampling_locations.shape()[1];
-  const int num_points = sampling_locations.shape()[4];
-
-  auto output = paddle::full({batch_size, query_length, num_heads * channels},
-                             0, value.dtype(), paddle::GPUPlace());
-
-  const int num_kernels = batch_size * query_length * num_heads * channels;
-  deformable_attn_cuda_kernel_forward<float>
-      <<<GET_BLOCKS(num_kernels, CUDA_NUM_THREADS), CUDA_NUM_THREADS, 0,
-         value.stream()>>>(num_kernels, value.data<float>(),
-                           value_spatial_shapes.data<int64_t>(),
-                           value_level_start_index.data<int64_t>(),
-                           sampling_locations.data<float>(),
-                           attention_weights.data<float>(), batch_size,
-                           value_length, num_heads, channels, num_levels,
-                           query_length, num_points, output.data<float>());
-  return {output};
-}
-
-// backward bilinear
-template <typename data_t>
-__device__ void deformable_attn_bilinear_backward(
-    const data_t *&bottom_data, const int &height, const int &width,
-    const int &nheads, const int &channels, const data_t &h, const data_t &w,
-    const int &m, const int &c, const data_t &top_grad,
-    const data_t &attn_weight, data_t *&grad_value, data_t *grad_sampling_loc,
-    data_t *grad_attn_weight) {
-  const int h_low = floor(h);
-  const int w_low = floor(w);
-  const int h_high = h_low + 1;
-  const int w_high = w_low + 1;
-
-  const data_t lh = h - h_low;
-  const data_t lw = w - w_low;
-  const data_t hh = 1 - lh, hw = 1 - lw;
-
-  const int w_stride = nheads * channels;
-  const int h_stride = width * w_stride;
-  const int h_low_ptr_offset = h_low * h_stride;
-  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
-  const int w_low_ptr_offset = w_low * w_stride;
-  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
-  const int base_ptr = m * channels + c;
-
-  const data_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
-  const data_t top_grad_value = top_grad * attn_weight;
-  data_t grad_h_weight = 0, grad_w_weight = 0;
-
-  data_t v1 = 0;
-  if (h_low >= 0 && w_low >= 0) {
-    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
-    v1 = bottom_data[ptr1];
-    grad_h_weight -= hw * v1;
-    grad_w_weight -= hh * v1;
-    atomicAdd(grad_value + ptr1, w1 * top_grad_value);
-  }
-  data_t v2 = 0;
-  if (h_low >= 0 && w_high <= width - 1) {
-    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
-    v2 = bottom_data[ptr2];
-    grad_h_weight -= lw * v2;
-    grad_w_weight += hh * v2;
-    atomicAdd(grad_value + ptr2, w2 * top_grad_value);
-  }
-  data_t v3 = 0;
-  if (h_high <= height - 1 && w_low >= 0) {
-    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
-    v3 = bottom_data[ptr3];
-    grad_h_weight += hw * v3;
-    grad_w_weight -= lh * v3;
-    atomicAdd(grad_value + ptr3, w3 * top_grad_value);
-  }
-  data_t v4 = 0;
-  if (h_high <= height - 1 && w_high <= width - 1) {
-    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
-    v4 = bottom_data[ptr4];
-    grad_h_weight += lw * v4;
-    grad_w_weight += lh * v4;
-    atomicAdd(grad_value + ptr4, w4 * top_grad_value);
-  }
-
-  const data_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
-  *grad_attn_weight = top_grad * val;
-  *grad_sampling_loc = width * grad_w_weight * top_grad_value;
-  *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value;
-}
-
-template <typename data_t>
-__device__ void deformable_attn_bilinear_backward_gm(
-    const data_t *&bottom_data, const int &height, const int &width,
-    const int &nheads, const int &channels, const data_t &h, const data_t &w,
-    const int &m, const int &c, const data_t &top_grad,
-    const data_t &attn_weight, data_t *&grad_value, data_t *grad_sampling_loc,
-    data_t *grad_attn_weight) {
-  const int h_low = floor(h);
-  const int w_low = floor(w);
-  const int h_high = h_low + 1;
-  const int w_high = w_low + 1;
-
-  const data_t lh = h - h_low;
-  const data_t lw = w - w_low;
-  const data_t hh = 1 - lh, hw = 1 - lw;
-
-  const int w_stride = nheads * channels;
-  const int h_stride = width * w_stride;
-  const int h_low_ptr_offset = h_low * h_stride;
-  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
-  const int w_low_ptr_offset = w_low * w_stride;
-  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
-  const int base_ptr = m * channels + c;
-
-  const data_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
-  const data_t top_grad_value = top_grad * attn_weight;
-  data_t grad_h_weight = 0, grad_w_weight = 0;
-
-  data_t v1 = 0;
-  if (h_low >= 0 && w_low >= 0) {
-    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
-    v1 = bottom_data[ptr1];
-    grad_h_weight -= hw * v1;
-    grad_w_weight -= hh * v1;
-    atomicAdd(grad_value + ptr1, w1 * top_grad_value);
-  }
-  data_t v2 = 0;
-  if (h_low >= 0 && w_high <= width - 1) {
-    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
-    v2 = bottom_data[ptr2];
-    grad_h_weight -= lw * v2;
-    grad_w_weight += hh * v2;
-    atomicAdd(grad_value + ptr2, w2 * top_grad_value);
-  }
-  data_t v3 = 0;
-  if (h_high <= height - 1 && w_low >= 0) {
-    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
-    v3 = bottom_data[ptr3];
-    grad_h_weight += hw * v3;
-    grad_w_weight -= lh * v3;
-    atomicAdd(grad_value + ptr3, w3 * top_grad_value);
-  }
-  data_t v4 = 0;
-  if (h_high <= height - 1 && w_high <= width - 1) {
-    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
-    v4 = bottom_data[ptr4];
-    grad_h_weight += lw * v4;
-    grad_w_weight += lh * v4;
-    atomicAdd(grad_value + ptr4, w4 * top_grad_value);
-  }
-
-  const data_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
-  atomicAdd(grad_attn_weight, top_grad * val);
-  atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value);
-  atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value);
-}
-
-// backward kernels
-// channels > 1024
-template <typename data_t>
-__global__ void deformable_attn_cuda_kernel_backward_shm_reduce_v2_multi_blocks(
-    const int n, const data_t *grad_col, const data_t *data_value,
-    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
-    const data_t *data_sampling_loc, const data_t *data_attn_weight,
-    const int batch_size, const int value_length, const int num_heads,
-    const int channels, const int num_levels, const int query_length,
-    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,
-    data_t *grad_attn_weight) {
-  CUDA_KERNEL_LOOP(index, n) {
-    extern __shared__ int _s[];
-    data_t *cache_grad_sampling_loc = (data_t *)_s;
-    data_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
-    unsigned int tid = threadIdx.x;
-    int _temp = index;
-    const int c_col = _temp % channels;
-    _temp /= channels;
-    const int sampling_index = _temp;
-    const int m_col = _temp % num_heads;
-    _temp /= num_heads;
-    const int q_col = _temp % query_length;
-    _temp /= query_length;
-    const int b_col = _temp;
-
-    const data_t top_grad = grad_col[index];
-
-    int data_weight_ptr = sampling_index * num_levels * num_points;
-    int data_loc_w_ptr = data_weight_ptr << 1;
-    const int grad_sampling_ptr = data_weight_ptr;
-    grad_sampling_loc += grad_sampling_ptr << 1;
-    grad_attn_weight += grad_sampling_ptr;
-    const int grad_weight_stride = 1;
-    const int grad_loc_stride = 2;
-    const int qid_stride = num_heads * channels;
-    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;
-
-    for (int l_col = 0; l_col < num_levels; ++l_col) {
-      const int level_start_id = data_level_start_index[l_col];
-      const int spatial_h_ptr = l_col << 1;
-      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
-      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
-      const int value_ptr_offset =
-          data_value_ptr_init_offset + level_start_id * qid_stride;
-      const data_t *data_value_ptr = data_value + value_ptr_offset;
-      data_t *grad_value_ptr = grad_value + value_ptr_offset;
-
-      for (int p_col = 0; p_col < num_points; ++p_col) {
-        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];
-        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
-        const data_t weight = data_attn_weight[data_weight_ptr];
-
-        const data_t h_im = loc_h * spatial_h - 0.5;
-        const data_t w_im = loc_w * spatial_w - 0.5;
-        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
-        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
-        *(cache_grad_attn_weight + threadIdx.x) = 0;
-        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
-          deformable_attn_bilinear_backward(
-              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
-              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
-              cache_grad_sampling_loc + (threadIdx.x << 1),
-              cache_grad_attn_weight + threadIdx.x);
-        }
-
-        __syncthreads();
-
-        for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0;
-             s >>= 1, spre >>= 1) {
-          if (tid < s) {
-            const unsigned int xid1 = tid << 1;
-            const unsigned int xid2 = (tid + s) << 1;
-            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
-            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
-            cache_grad_sampling_loc[xid1 + 1] +=
-                cache_grad_sampling_loc[xid2 + 1];
-            if (tid + (s << 1) < spre) {
-              cache_grad_attn_weight[tid] +=
-                  cache_grad_attn_weight[tid + (s << 1)];
-              cache_grad_sampling_loc[xid1] +=
-                  cache_grad_sampling_loc[xid2 + (s << 1)];
-              cache_grad_sampling_loc[xid1 + 1] +=
-                  cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
-            }
-          }
-          __syncthreads();
-        }
-
-        if (tid == 0) {
-          atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]);
-          atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]);
-          atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]);
-        }
-        __syncthreads();
-
-        data_weight_ptr += 1;
-        data_loc_w_ptr += 2;
-        grad_attn_weight += grad_weight_stride;
-        grad_sampling_loc += grad_loc_stride;
-      }
-    }
-  }
-}
-
-template <typename data_t>
-__global__ void deformable_attn_cuda_kernel_backward_gm(
-    const int n, const data_t *grad_col, const data_t *data_value,
-    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
-    const data_t *data_sampling_loc, const data_t *data_attn_weight,
-    const int batch_size, const int value_length, const int num_heads,
-    const int channels, const int num_levels, const int query_length,
-    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,
-    data_t *grad_attn_weight) {
-  CUDA_KERNEL_LOOP(index, n) {
-    int _temp = index;
-    const int c_col = _temp % channels;
-    _temp /= channels;
-    const int sampling_index = _temp;
-    const int m_col = _temp % num_heads;
-    _temp /= num_heads;
-    const int q_col = _temp % query_length;
-    _temp /= query_length;
-    const int b_col = _temp;
-
-    const data_t top_grad = grad_col[index];
-
-    int data_weight_ptr = sampling_index * num_levels * num_points;
-    int data_loc_w_ptr = data_weight_ptr << 1;
-    const int grad_sampling_ptr = data_weight_ptr;
-    grad_sampling_loc += grad_sampling_ptr << 1;
-    grad_attn_weight += grad_sampling_ptr;
-    const int grad_weight_stride = 1;
-    const int grad_loc_stride = 2;
-    const int qid_stride = num_heads * channels;
-    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;
-
-    for (int l_col = 0; l_col < num_levels; ++l_col) {
-      const int level_start_id = data_level_start_index[l_col];
-      const int spatial_h_ptr = l_col << 1;
-      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
-      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
-      const int value_ptr_offset =
-          data_value_ptr_init_offset + level_start_id * qid_stride;
-      const data_t *data_value_ptr = data_value + value_ptr_offset;
-      data_t *grad_value_ptr = grad_value + value_ptr_offset;
-
-      for (int p_col = 0; p_col < num_points; ++p_col) {
-        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];
-        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
-        const data_t weight = data_attn_weight[data_weight_ptr];
-
-        const data_t h_im = loc_h * spatial_h - 0.5;
-        const data_t w_im = loc_w * spatial_w - 0.5;
-        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
-          deformable_attn_bilinear_backward_gm(
-              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
-              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
-              grad_sampling_loc, grad_attn_weight);
-        }
-        data_weight_ptr += 1;
-        data_loc_w_ptr += 2;
-        grad_attn_weight += grad_weight_stride;
-        grad_sampling_loc += grad_loc_stride;
-      }
-    }
-  }
-}
-
-// channels <= 1024
-template <typename data_t, unsigned int blockSize>
-__global__ void
-deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1(
-    const int n, const data_t *grad_col, const data_t *data_value,
-    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
-    const data_t *data_sampling_loc, const data_t *data_attn_weight,
-    const int batch_size, const int value_length, const int num_heads,
-    const int channels, const int num_levels, const int query_length,
-    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,
-    data_t *grad_attn_weight) {
-  CUDA_KERNEL_LOOP(index, n) {
-    __shared__ data_t cache_grad_sampling_loc[blockSize * 2];
-    __shared__ data_t cache_grad_attn_weight[blockSize];
-    unsigned int tid = threadIdx.x;
-    int _temp = index;
-    const int c_col = _temp % channels;
-    _temp /= channels;
-    const int sampling_index = _temp;
-    const int m_col = _temp % num_heads;
-    _temp /= num_heads;
-    const int q_col = _temp % query_length;
-    _temp /= query_length;
-    const int b_col = _temp;
-
-    const data_t top_grad = grad_col[index];
-
-    int data_weight_ptr = sampling_index * num_levels * num_points;
-    int data_loc_w_ptr = data_weight_ptr << 1;
-    const int grad_sampling_ptr = data_weight_ptr;
-    grad_sampling_loc += grad_sampling_ptr << 1;
-    grad_attn_weight += grad_sampling_ptr;
-    const int grad_weight_stride = 1;
-    const int grad_loc_stride = 2;
-    const int qid_stride = num_heads * channels;
-    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;
-
-    for (int l_col = 0; l_col < num_levels; ++l_col) {
-      const int level_start_id = data_level_start_index[l_col];
-      const int spatial_h_ptr = l_col << 1;
-      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
-      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
-      const int value_ptr_offset =
-          data_value_ptr_init_offset + level_start_id * qid_stride;
-      const data_t *data_value_ptr = data_value + value_ptr_offset;
-      data_t *grad_value_ptr = grad_value + value_ptr_offset;
-
-      for (int p_col = 0; p_col < num_points; ++p_col) {
-        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];
-        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
-        const data_t weight = data_attn_weight[data_weight_ptr];
-
-        const data_t h_im = loc_h * spatial_h - 0.5;
-        const data_t w_im = loc_w * spatial_w - 0.5;
-        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
-        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
-        *(cache_grad_attn_weight + threadIdx.x) = 0;
-        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
-          deformable_attn_bilinear_backward(
-              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
-              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
-              cache_grad_sampling_loc + (threadIdx.x << 1),
-              cache_grad_attn_weight + threadIdx.x);
-        }
-
-        __syncthreads();
-        if (tid == 0) {
-          data_t _grad_w = cache_grad_sampling_loc[0],
-                 _grad_h = cache_grad_sampling_loc[1],
-                 _grad_a = cache_grad_attn_weight[0];
-          int sid = 2;
-          for (unsigned int tid = 1; tid < blockSize; ++tid) {
-            _grad_w += cache_grad_sampling_loc[sid];
-            _grad_h += cache_grad_sampling_loc[sid + 1];
-            _grad_a += cache_grad_attn_weight[tid];
-            sid += 2;
-          }
-
-          *grad_sampling_loc = _grad_w;
-          *(grad_sampling_loc + 1) = _grad_h;
-          *grad_attn_weight = _grad_a;
-        }
-        __syncthreads();
-
-        data_weight_ptr += 1;
-        data_loc_w_ptr += 2;
-        grad_attn_weight += grad_weight_stride;
-        grad_sampling_loc += grad_loc_stride;
-      }
-    }
-  }
-}
-
-template <typename data_t, unsigned int blockSize>
-__global__ void
-deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2(
-    const int n, const data_t *grad_col, const data_t *data_value,
-    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
-    const data_t *data_sampling_loc, const data_t *data_attn_weight,
-    const int batch_size, const int value_length, const int num_heads,
-    const int channels, const int num_levels, const int query_length,
-    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,
-    data_t *grad_attn_weight) {
-  CUDA_KERNEL_LOOP(index, n) {
-    __shared__ data_t cache_grad_sampling_loc[blockSize * 2];
-    __shared__ data_t cache_grad_attn_weight[blockSize];
-    unsigned int tid = threadIdx.x;
-    int _temp = index;
-    const int c_col = _temp % channels;
-    _temp /= channels;
-    const int sampling_index = _temp;
-    const int m_col = _temp % num_heads;
-    _temp /= num_heads;
-    const int q_col = _temp % query_length;
-    _temp /= query_length;
-    const int b_col = _temp;
-
-    const data_t top_grad = grad_col[index];
-
-    int data_weight_ptr = sampling_index * num_levels * num_points;
-    int data_loc_w_ptr = data_weight_ptr << 1;
-    const int grad_sampling_ptr = data_weight_ptr;
-    grad_sampling_loc += grad_sampling_ptr << 1;
-    grad_attn_weight += grad_sampling_ptr;
-    const int grad_weight_stride = 1;
-    const int grad_loc_stride = 2;
-    const int qid_stride = num_heads * channels;
-    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;
-
-    for (int l_col = 0; l_col < num_levels; ++l_col) {
-      const int level_start_id = data_level_start_index[l_col];
-      const int spatial_h_ptr = l_col << 1;
-      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
-      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
-      const int value_ptr_offset =
-          data_value_ptr_init_offset + level_start_id * qid_stride;
-      const data_t *data_value_ptr = data_value + value_ptr_offset;
-      data_t *grad_value_ptr = grad_value + value_ptr_offset;
-
-      for (int p_col = 0; p_col < num_points; ++p_col) {
-        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];
-        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
-        const data_t weight = data_attn_weight[data_weight_ptr];
-
-        const data_t h_im = loc_h * spatial_h - 0.5;
-        const data_t w_im = loc_w * spatial_w - 0.5;
-        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
-        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
-        *(cache_grad_attn_weight + threadIdx.x) = 0;
-        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
-          deformable_attn_bilinear_backward(
-              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
-              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
-              cache_grad_sampling_loc + (threadIdx.x << 1),
-              cache_grad_attn_weight + threadIdx.x);
-        }
-
-        __syncthreads();
-
-        for (unsigned int s = blockSize / 2; s > 0; s >>= 1) {
-          if (tid < s) {
-            const unsigned int xid1 = tid << 1;
-            const unsigned int xid2 = (tid + s) << 1;
-            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
-            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
-            cache_grad_sampling_loc[xid1 + 1] +=
-                cache_grad_sampling_loc[xid2 + 1];
-          }
-          __syncthreads();
-        }
-
-        if (tid == 0) {
-          *grad_sampling_loc = cache_grad_sampling_loc[0];
-          *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
-          *grad_attn_weight = cache_grad_attn_weight[0];
-        }
-        __syncthreads();
-
-        data_weight_ptr += 1;
-        data_loc_w_ptr += 2;
-        grad_attn_weight += grad_weight_stride;
-        grad_sampling_loc += grad_loc_stride;
-      }
-    }
-  }
-}
-
-template <typename data_t>
-__global__ void deformable_attn_cuda_kernel_backward_shm_reduce_v1(
-    const int n, const data_t *grad_col, const data_t *data_value,
-    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
-    const data_t *data_sampling_loc, const data_t *data_attn_weight,
-    const int batch_size, const int value_length, const int num_heads,
-    const int channels, const int num_levels, const int query_length,
-    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,
-    data_t *grad_attn_weight) {
-  CUDA_KERNEL_LOOP(index, n) {
-    extern __shared__ int _s[];
-    data_t *cache_grad_sampling_loc = (data_t *)_s;
-    data_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
-    unsigned int tid = threadIdx.x;
-    int _temp = index;
-    const int c_col = _temp % channels;
-    _temp /= channels;
-    const int sampling_index = _temp;
-    const int m_col = _temp % num_heads;
-    _temp /= num_heads;
-    const int q_col = _temp % query_length;
-    _temp /= query_length;
-    const int b_col = _temp;
-
-    const data_t top_grad = grad_col[index];
-
-    int data_weight_ptr = sampling_index * num_levels * num_points;
-    int data_loc_w_ptr = data_weight_ptr << 1;
-    const int grad_sampling_ptr = data_weight_ptr;
-    grad_sampling_loc += grad_sampling_ptr << 1;
-    grad_attn_weight += grad_sampling_ptr;
-    const int grad_weight_stride = 1;
-    const int grad_loc_stride = 2;
-    const int qid_stride = num_heads * channels;
-    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;
-
-    for (int l_col = 0; l_col < num_levels; ++l_col) {
-      const int level_start_id = data_level_start_index[l_col];
-      const int spatial_h_ptr = l_col << 1;
-      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
-      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
-      const int value_ptr_offset =
-          data_value_ptr_init_offset + level_start_id * qid_stride;
-      const data_t *data_value_ptr = data_value + value_ptr_offset;
-      data_t *grad_value_ptr = grad_value + value_ptr_offset;
-
-      for (int p_col = 0; p_col < num_points; ++p_col) {
-        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];
-        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
-        const data_t weight = data_attn_weight[data_weight_ptr];
-
-        const data_t h_im = loc_h * spatial_h - 0.5;
-        const data_t w_im = loc_w * spatial_w - 0.5;
-        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
-        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
-        *(cache_grad_attn_weight + threadIdx.x) = 0;
-        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
-          deformable_attn_bilinear_backward(
-              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
-              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
-              cache_grad_sampling_loc + (threadIdx.x << 1),
-              cache_grad_attn_weight + threadIdx.x);
-        }
-
-        __syncthreads();
-        if (tid == 0) {
-          data_t _grad_w = cache_grad_sampling_loc[0],
-                 _grad_h = cache_grad_sampling_loc[1],
-                 _grad_a = cache_grad_attn_weight[0];
-          int sid = 2;
-          for (unsigned int tid = 1; tid < blockDim.x; ++tid) {
-            _grad_w += cache_grad_sampling_loc[sid];
-            _grad_h += cache_grad_sampling_loc[sid + 1];
-            _grad_a += cache_grad_attn_weight[tid];
-            sid += 2;
-          }
-
-          *grad_sampling_loc = _grad_w;
-          *(grad_sampling_loc + 1) = _grad_h;
-          *grad_attn_weight = _grad_a;
-        }
-        __syncthreads();
-
-        data_weight_ptr += 1;
-        data_loc_w_ptr += 2;
-        grad_attn_weight += grad_weight_stride;
-        grad_sampling_loc += grad_loc_stride;
-      }
-    }
-  }
-}
-
-template <typename data_t>
-__global__ void deformable_attn_cuda_kernel_backward_shm_reduce_v2(
-    const int n, const data_t *grad_col, const data_t *data_value,
-    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
-    const data_t *data_sampling_loc, const data_t *data_attn_weight,
-    const int batch_size, const int value_length, const int num_heads,
-    const int channels, const int num_levels, const int query_length,
-    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,
-    data_t *grad_attn_weight) {
-  CUDA_KERNEL_LOOP(index, n) {
-    extern __shared__ int _s[];
-    data_t *cache_grad_sampling_loc = (data_t *)_s;
-    data_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
-    unsigned int tid = threadIdx.x;
-    int _temp = index;
-    const int c_col = _temp % channels;
-    _temp /= channels;
-    const int sampling_index = _temp;
-    const int m_col = _temp % num_heads;
-    _temp /= num_heads;
-    const int q_col = _temp % query_length;
-    _temp /= query_length;
-    const int b_col = _temp;
-
-    const data_t top_grad = grad_col[index];
-
-    int data_weight_ptr = sampling_index * num_levels * num_points;
-    int data_loc_w_ptr = data_weight_ptr << 1;
-    const int grad_sampling_ptr = data_weight_ptr;
-    grad_sampling_loc += grad_sampling_ptr << 1;
-    grad_attn_weight += grad_sampling_ptr;
-    const int grad_weight_stride = 1;
-    const int grad_loc_stride = 2;
-    const int qid_stride = num_heads * channels;
-    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;
-
-    for (int l_col = 0; l_col < num_levels; ++l_col) {
-      const int level_start_id = data_level_start_index[l_col];
-      const int spatial_h_ptr = l_col << 1;
-      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
-      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
-      const int value_ptr_offset =
-          data_value_ptr_init_offset + level_start_id * qid_stride;
-      const data_t *data_value_ptr = data_value + value_ptr_offset;
-      data_t *grad_value_ptr = grad_value + value_ptr_offset;
-
-      for (int p_col = 0; p_col < num_points; ++p_col) {
-        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];
-        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
-        const data_t weight = data_attn_weight[data_weight_ptr];
-
-        const data_t h_im = loc_h * spatial_h - 0.5;
-        const data_t w_im = loc_w * spatial_w - 0.5;
-        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
-        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
-        *(cache_grad_attn_weight + threadIdx.x) = 0;
-        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
-          deformable_attn_bilinear_backward(
-              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
-              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
-              cache_grad_sampling_loc + (threadIdx.x << 1),
-              cache_grad_attn_weight + threadIdx.x);
-        }
-
-        __syncthreads();
-
-        for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0;
-             s >>= 1, spre >>= 1) {
-          if (tid < s) {
-            const unsigned int xid1 = tid << 1;
-            const unsigned int xid2 = (tid + s) << 1;
-            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
-            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
-            cache_grad_sampling_loc[xid1 + 1] +=
-                cache_grad_sampling_loc[xid2 + 1];
-            if (tid + (s << 1) < spre) {
-              cache_grad_attn_weight[tid] +=
-                  cache_grad_attn_weight[tid + (s << 1)];
-              cache_grad_sampling_loc[xid1] +=
-                  cache_grad_sampling_loc[xid2 + (s << 1)];
-              cache_grad_sampling_loc[xid1 + 1] +=
-                  cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
-            }
-          }
-          __syncthreads();
-        }
-
-        if (tid == 0) {
-          *grad_sampling_loc = cache_grad_sampling_loc[0];
-          *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
-          *grad_attn_weight = cache_grad_attn_weight[0];
-        }
-        __syncthreads();
-
-        data_weight_ptr += 1;
-        data_loc_w_ptr += 2;
-        grad_attn_weight += grad_weight_stride;
-        grad_sampling_loc += grad_loc_stride;
-      }
-    }
-  }
-}
-
-// backward branch
-template <typename data_t>
-void deformable_attn_cuda_backward(
-    cudaStream_t stream, const data_t *grad_out, const data_t *data_value,
-    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
-    const data_t *data_sampling_loc, const data_t *data_attn_weight,
-    const int batch_size, const int value_length, const int num_heads,
-    const int channels, const int num_levels, const int query_length,
-    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,
-    data_t *grad_attn_weight) {
-  const int num_threads =
-      (channels > CUDA_NUM_THREADS) ? CUDA_NUM_THREADS : channels;
-  const int num_kernels = batch_size * query_length * num_heads * channels;
-  const int num_actual_kernels =
-      batch_size * query_length * num_heads * channels;
-  if (channels > 1024) {
-    if ((channels & 1023) == 0) {
-      deformable_attn_cuda_kernel_backward_shm_reduce_v2_multi_blocks<data_t>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
-             num_threads * 3 * sizeof(data_t), stream>>>(
-              num_kernels, grad_out, data_value, data_spatial_shapes,
-              data_level_start_index, data_sampling_loc, data_attn_weight,
-              batch_size, value_length, num_heads, channels, num_levels,
-              query_length, num_points, grad_value, grad_sampling_loc,
-              grad_attn_weight);
-    } else {
-      deformable_attn_cuda_kernel_backward_gm<data_t>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-    }
-  } else {
-    switch (channels) {
-    case 1:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1<data_t,
-                                                                         1>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 2:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1<data_t,
-                                                                         2>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 4:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1<data_t,
-                                                                         4>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 8:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1<data_t,
-                                                                         8>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 16:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1<data_t,
-                                                                         16>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 32:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1<data_t,
-                                                                         32>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 64:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2<data_t,
-                                                                         64>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 128:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2<data_t,
-                                                                         128>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 256:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2<data_t,
-                                                                         256>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 512:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2<data_t,
-                                                                         512>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 1024:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2<data_t,
-                                                                         1024>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    default:
-      if (channels < 64) {
-        deformable_attn_cuda_kernel_backward_shm_reduce_v1<data_t>
-            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
-               num_threads * 3 * sizeof(data_t), stream>>>(
-                num_kernels, grad_out, data_value, data_spatial_shapes,
-                data_level_start_index, data_sampling_loc, data_attn_weight,
-                batch_size, value_length, num_heads, channels, num_levels,
-                query_length, num_points, grad_value, grad_sampling_loc,
-                grad_attn_weight);
-      } else {
-        deformable_attn_cuda_kernel_backward_shm_reduce_v2<data_t>
-            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
-               num_threads * 3 * sizeof(data_t), stream>>>(
-                num_kernels, grad_out, data_value, data_spatial_shapes,
-                data_level_start_index, data_sampling_loc, data_attn_weight,
-                batch_size, value_length, num_heads, channels, num_levels,
-                query_length, num_points, grad_value, grad_sampling_loc,
-                grad_attn_weight);
-      }
-    }
-  }
-}
-
-// backward
-std::vector<paddle::Tensor> MSDeformableAttnCUDABackward(
-    const paddle::Tensor &value, const paddle::Tensor &value_spatial_shapes,
-    const paddle::Tensor &value_level_start_index,
-    const paddle::Tensor &sampling_locations,
-    const paddle::Tensor &attention_weights, const paddle::Tensor &grad_out) {
-
-  CHECK_INPUT_GPU(value);
-  CHECK_INPUT_GPU(value_spatial_shapes);
-  CHECK_INPUT_GPU(value_level_start_index);
-  CHECK_INPUT_GPU(sampling_locations);
-  CHECK_INPUT_GPU(attention_weights);
-  CHECK_INPUT_GPU(grad_out);
-
-  const int batch_size = value.shape()[0];
-  const int value_length = value.shape()[1];
-  const int num_heads = value.shape()[2];
-  const int channels = value.shape()[3];
-
-  const int num_levels = value_spatial_shapes.shape()[0];
-  const int query_length = sampling_locations.shape()[1];
-  const int num_points = sampling_locations.shape()[4];
-
-  auto grad_value =
-      paddle::full(value.shape(), 0, value.dtype(), paddle::GPUPlace());
-  auto grad_spatial_shapes =
-      paddle::full(value.shape(), 0, value.dtype(), paddle::GPUPlace());
-  auto grad_level_start_index =
-      paddle::full(value.shape(), 0, value.dtype(), paddle::GPUPlace());
-  auto grad_sampling_locations =
-      paddle::full(sampling_locations.shape(), 0, sampling_locations.dtype(),
-                   paddle::GPUPlace());
-  auto grad_attention_weights =
-      paddle::full(attention_weights.shape(), 0, attention_weights.dtype(),
-                   paddle::GPUPlace());
-
-  deformable_attn_cuda_backward<float>(
-      value.stream(), grad_out.data<float>(), value.data<float>(),
-      value_spatial_shapes.data<int64_t>(),
-      value_level_start_index.data<int64_t>(), sampling_locations.data<float>(),
-      attention_weights.data<float>(), batch_size, value_length, num_heads,
-      channels, num_levels, query_length, num_points, grad_value.data<float>(),
-      grad_sampling_locations.data<float>(),
-      grad_attention_weights.data<float>());
-
-  return {grad_value, grad_spatial_shapes, grad_level_start_index,
-          grad_sampling_locations, grad_attention_weights};
-}
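A note on the reduction strategy in the kernels above: each shared-memory variant buffers three data_t partials per thread (two for the sampling-location gradient, one for the attention weight), and the launcher picks a kernel from the channel count, falling back to the global-memory atomicAdd version only when channels exceed 1024 and are not a multiple of 1024. The *_shm_reduce_v2 variants collapse the per-thread partials with a tree reduction that also handles a blockDim.x that is not a power of two; the extra spre bookkeeping folds the odd leftover slot into slot 0. A minimal NumPy sketch of that reduction pattern, for illustration only (not part of the deleted source):

    import numpy as np

    def block_tree_reduce(vals):
        # Sequential model of the in-block reduction in
        # deformable_attn_cuda_kernel_backward_shm_reduce_v2: at each step the
        # first s slots absorb slots [s, 2s); when the current width spre is
        # odd, slot 0 additionally folds in the leftover slot at index 2*s.
        cache = np.asarray(vals, dtype=np.float64).copy()
        s, spre = len(cache) // 2, len(cache)
        while s > 0:
            for tid in range(s):
                cache[tid] += cache[tid + s]
                if tid + 2 * s < spre:
                    cache[tid] += cache[tid + 2 * s]
            s, spre = s >> 1, spre >> 1
        return cache[0]

    vals = np.random.rand(71)  # e.g. channels = 71, so blockDim.x = 71
    assert np.isclose(block_tree_reduce(vals), vals.sum())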

+ 0 - 7
models/detectors/rtdetr/basic_modules/ext_op/setup_ms_deformable_attn_op.py

@@ -1,7 +0,0 @@
-from paddle.utils.cpp_extension import CUDAExtension, setup
-
-if __name__ == "__main__":
-    setup(
-        name='deformable_detr_ops',
-        ext_modules=CUDAExtension(
-            sources=['ms_deformable_attn_op.cc', 'ms_deformable_attn_op.cu']))
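For context, the deleted setup script above was the build entry point for the custom CUDA op: run from its own directory it compiles ms_deformable_attn_op.cc / ms_deformable_attn_op.cu into an installable deformable_detr_ops package. A typical invocation (assumed from Paddle's cpp_extension conventions) would be

    cd models/detectors/rtdetr/basic_modules/ext_op
    python setup_ms_deformable_attn_op.py install

after which the test script removed below can import ms_deformable_attn from that package.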

+ 0 - 140
models/detectors/rtdetr/basic_modules/ext_op/test_ms_deformable_attn_op.py

@@ -1,140 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-from __future__ import print_function
-from __future__ import division
-
-import os
-import sys
-import random
-import numpy as np
-import paddle
-# add python path of PaddleDetection to sys.path
-parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 5)))
-if parent_path not in sys.path:
-    sys.path.append(parent_path)
-
-from ppdet.modeling.transformers.utils import deformable_attention_core_func
-ms_deform_attn_core_paddle = deformable_attention_core_func
-
-try:
-    gpu_index = int(sys.argv[1])
-except:
-    gpu_index = 0
-print(f'Use gpu {gpu_index} to test...')
-paddle.set_device(f'gpu:{gpu_index}')
-
-try:
-    from deformable_detr_ops import ms_deformable_attn
-except Exception as e:
-    print('import deformable_detr_ops error', e)
-    sys.exit(-1)
-
-paddle.seed(1)
-random.seed(1)
-np.random.seed(1)
-
-bs, n_heads, c = 2, 8, 8
-query_length, n_levels, n_points = 2, 2, 2
-spatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64)
-level_start_index = paddle.concat((paddle.to_tensor(
-    [0], dtype=paddle.int64), spatial_shapes.prod(1).cumsum(0)[:-1]))
-value_length = sum([(H * W).item() for H, W in spatial_shapes])
-
-
-def get_test_tensors(channels):
-    value = paddle.rand(
-        [bs, value_length, n_heads, channels], dtype=paddle.float32) * 0.01
-    sampling_locations = paddle.rand(
-        [bs, query_length, n_heads, n_levels, n_points, 2],
-        dtype=paddle.float32)
-    attention_weights = paddle.rand(
-        [bs, query_length, n_heads, n_levels, n_points],
-        dtype=paddle.float32) + 1e-5
-    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(
-        -2, keepdim=True)
-
-    return [value, sampling_locations, attention_weights]
-
-
-@paddle.no_grad()
-def check_forward_equal_with_paddle_float():
-    value, sampling_locations, attention_weights = get_test_tensors(c)
-
-    output_paddle = ms_deform_attn_core_paddle(
-        value, spatial_shapes, level_start_index, sampling_locations,
-        attention_weights).detach().cpu()
-    output_cuda = ms_deformable_attn(value, spatial_shapes, level_start_index,
-                                     sampling_locations,
-                                     attention_weights).detach().cpu()
-    fwdok = paddle.allclose(
-        output_cuda, output_paddle, rtol=1e-2, atol=1e-3).item()
-    max_abs_err = (output_cuda - output_paddle).abs().max().item()
-    max_rel_err = (
-        (output_cuda - output_paddle).abs() / output_paddle.abs()).max().item()
-
-    print(
-        f'*{fwdok} check_forward_equal_with_paddle_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}'
-    )
-
-
-def check_gradient_numerical(channels=4):
-    value_paddle, sampling_locations_paddle, attention_weights_paddle = get_test_tensors(
-        channels)
-    value_paddle.stop_gradient = False
-    sampling_locations_paddle.stop_gradient = False
-    attention_weights_paddle.stop_gradient = False
-
-    value_cuda = value_paddle.detach().clone()
-    sampling_locations_cuda = sampling_locations_paddle.detach().clone()
-    attention_weights_cuda = attention_weights_paddle.detach().clone()
-    value_cuda.stop_gradient = False
-    sampling_locations_cuda.stop_gradient = False
-    attention_weights_cuda.stop_gradient = False
-
-    output_paddle = ms_deform_attn_core_paddle(
-        value_paddle, spatial_shapes, level_start_index,
-        sampling_locations_paddle, attention_weights_paddle)
-    output_paddle.sum().backward()
-
-    output_cuda = ms_deformable_attn(value_cuda, spatial_shapes,
-                                     level_start_index, sampling_locations_cuda,
-                                     attention_weights_cuda)
-    output_cuda.sum().backward()
-
-    res = paddle.allclose(
-        value_paddle.grad, value_cuda.grad, rtol=1e-2, atol=1e-3).item()
-    print(f'*tensor1 {res} check_gradient_numerical(D={channels})')
-
-    res = paddle.allclose(
-        sampling_locations_paddle.grad,
-        sampling_locations_cuda.grad,
-        rtol=1e-2,
-        atol=1e-3).item()
-    print(f'*tensor2 {res} check_gradient_numerical(D={channels})')
-
-    res = paddle.allclose(
-        attention_weights_paddle.grad,
-        attention_weights_cuda.grad,
-        rtol=1e-2,
-        atol=1e-3).item()
-    print(f'*tensor3 {res} check_gradient_numerical(D={channels})')
-
-
-if __name__ == '__main__':
-    check_forward_equal_with_paddle_float()
-
-    for channels in [30, 32, 64, 71, 128, 1024, 1025, 2048, 3096]:
-        check_gradient_numerical(channels)
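Worth noting about the deleted test: the channel sweep [30, 32, 64, 71, 128, 1024, 1025, 2048, 3096] exercises every dispatch branch of the CUDA backward above. 32, 64, 128 and 1024 hit the blocksize-aware templated kernels; 30 and 71 fall through to the generic shared-memory reductions (v1 below 64 channels, v2 otherwise); 2048 takes the multi-block shared-memory path (a multiple of 1024); and 1025 and 3096 take the global-memory atomicAdd fallback. Running it against a specific device is simply

    python test_ms_deformable_attn_op.py <gpu_index>

with the index defaulting to 0 when the argument is omitted.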

+ 0 - 200
models/detectors/rtdetr/basic_modules/fpn.py

@@ -1,200 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from typing import List
-
-try:
-    from .basic import BasicConv, RTCBlock
-    from .transformer import TransformerEncoder
-except:
-    from  basic import BasicConv, RTCBlock
-    from  transformer import TransformerEncoder
-
-
-# Build PaFPN
-def build_fpn(cfg, in_dims, out_dim):
-    if cfg['fpn'] == 'hybrid_encoder':
-        return HybridEncoder(in_dims     = in_dims,
-                             out_dim     = out_dim,
-                             num_blocks  = cfg['fpn_num_blocks'],
-                             act_type    = cfg['fpn_act'],
-                             norm_type   = cfg['fpn_norm'],
-                             depthwise   = cfg['fpn_depthwise'],
-                             num_heads   = cfg['en_num_heads'],
-                             num_layers  = cfg['en_num_layers'],
-                             ffn_dim     = cfg['en_ffn_dim'],
-                             dropout     = cfg['en_dropout'],
-                             pe_temperature = cfg['pe_temperature'],
-                             en_act_type    = cfg['en_act'],
-                             )
-    else:
-        raise NotImplementedError("Unknown PaFPN: <{}>".format(cfg['fpn']))
-
-
-# ----------------- Feature Pyramid Network -----------------
-## Hybrid Encoder (Transformer encoder + Convolutional PaFPN)
-class HybridEncoder(nn.Module):
-    def __init__(self, 
-                 in_dims        :List  = [256, 512, 1024],
-                 out_dim        :int   = 256,
-                 num_blocks     :int   = 3,
-                 act_type       :str   = 'silu',
-                 norm_type      :str   = 'BN',
-                 depthwise      :bool  = False,
-                 # Transformer's parameters
-                 num_heads      :int   = 8,
-                 num_layers     :int   = 1,
-                 ffn_dim        :int   = 1024,
-                 dropout        :float = 0.1,
-                 pe_temperature :float = 10000.,
-                 en_act_type    :str   = 'gelu'
-                 ) -> None:
-        super(HybridEncoder, self).__init__()
-        print('==============================')
-        print('FPN: {}'.format("RTC-PaFPN"))
-        # ---------------- Basic parameters ----------------
-        self.in_dims = in_dims
-        self.out_dim = out_dim
-        self.out_dims = [self.out_dim] * len(in_dims)
-        self.num_heads = num_heads
-        self.num_layers = num_layers
-        self.ffn_dim = ffn_dim
-        c3, c4, c5 = in_dims
-
-        # ---------------- Input projs ----------------
-        self.reduce_layer_1 = BasicConv(c5, self.out_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
-        self.reduce_layer_2 = BasicConv(c4, self.out_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
-        self.reduce_layer_3 = BasicConv(c3, self.out_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
-
-        # ---------------- Downsample ----------------
-        self.downsample_layer_1 = BasicConv(self.out_dim, self.out_dim,
-                                            kernel_size=3, padding=1, stride=2,
-                                            act_type=act_type, norm_type=norm_type, depthwise=depthwise)
-        self.downsample_layer_2 = BasicConv(self.out_dim, self.out_dim,
-                                            kernel_size=3, padding=1, stride=2,
-                                            act_type=act_type, norm_type=norm_type, depthwise=depthwise)
-
-        # ---------------- Transformer Encoder ----------------
-        self.transformer_encoder = TransformerEncoder(d_model        = self.out_dim,
-                                                      num_heads      = num_heads,
-                                                      num_layers     = num_layers,
-                                                      ffn_dim        = ffn_dim,
-                                                      pe_temperature = pe_temperature,
-                                                      dropout        = dropout,
-                                                      act_type       = en_act_type
-                                                      )
-
-        # ---------------- Top down FPN ----------------
-        ## P5 -> P4
-        self.top_down_layer_1 = RTCBlock(in_dim      = self.out_dim * 2,
-                                         out_dim     = self.out_dim,
-                                         num_blocks  = num_blocks,
-                                         shortcut    = False,
-                                         act_type    = act_type,
-                                         norm_type   = norm_type,
-                                         depthwise   = depthwise,
-                                         )
-        ## P4 -> P3
-        self.top_down_layer_2 = RTCBlock(in_dim      = self.out_dim * 2,
-                                         out_dim     = self.out_dim,
-                                         num_blocks  = num_blocks,
-                                         shortcut    = False,
-                                         act_type    = act_type,
-                                         norm_type   = norm_type,
-                                         depthwise   = depthwise,
-                                         )
-        
-        # ---------------- Bottom up PAN----------------
-        ## P3 -> P4
-        self.bottom_up_layer_1 = RTCBlock(in_dim      = self.out_dim * 2,
-                                          out_dim     = self.out_dim,
-                                          num_blocks  = num_blocks,
-                                          shortcut    = False,
-                                          act_type    = act_type,
-                                          norm_type   = norm_type,
-                                          depthwise   = depthwise,
-                                          )
-        ## P4 -> P5
-        self.bottom_up_layer_2 = RTCBlock(in_dim      = self.out_dim * 2,
-                                          out_dim     = self.out_dim,
-                                          num_blocks  = num_blocks,
-                                          shortcut    = False,
-                                          act_type    = act_type,
-                                          norm_type   = norm_type,
-                                          depthwise   = depthwise,
-                                          )
-
-        self.init_weights()
-  
-    def init_weights(self):
-        """Initialize the parameters."""
-        for m in self.modules():
-            if isinstance(m, torch.nn.Conv2d):
-                # In order to be consistent with the source code,
-                # reset the Conv2d initialization parameters
-                m.reset_parameters()
-
-    def forward(self, features):
-        c3, c4, c5 = features
-
-        # -------- Input projs --------
-        p5 = self.reduce_layer_1(c5)
-        p4 = self.reduce_layer_2(c4)
-        p3 = self.reduce_layer_3(c3)
-
-        # -------- Transformer encoder --------
-        p5 = self.transformer_encoder(p5)
-
-        # -------- Top down FPN --------
-        p5_up = F.interpolate(p5, scale_factor=2.0)
-        p4 = self.top_down_layer_1(torch.cat([p4, p5_up], dim=1))
-
-        p4_up = F.interpolate(p4, scale_factor=2.0)
-        p3 = self.top_down_layer_2(torch.cat([p3, p4_up], dim=1))
-
-        # -------- Bottom up PAN --------
-        p3_ds = self.downsample_layer_1(p3)
-        p4 = self.bottom_up_layer_1(torch.cat([p4, p3_ds], dim=1))
-
-        p4_ds = self.downsample_layer_2(p4)
-        p5 = self.bottom_up_layer_2(torch.cat([p5, p4_ds], dim=1))
-
-        out_feats = [p3, p4, p5]
-        
-        return out_feats
-
-
-if __name__ == '__main__':
-    import time
-    from thop import profile
-    cfg = {
-        'fpn': 'hybrid_encoder',
-        'fpn_act': 'silu',
-        'fpn_norm': 'BN',
-        'fpn_depthwise': False,
-        'fpn_num_blocks': 3,
-        'fpn_expansion': 0.5,
-        'en_num_heads': 8,
-        'en_num_layers': 1,
-        'en_ffn_dim': 1024,
-        'en_dropout': 0.0,
-        'pe_temperature': 10000.,
-        'en_act': 'gelu',
-    }
-    fpn_dims = [256, 512, 1024]
-    out_dim = 256
-    pyramid_feats = [torch.randn(1, fpn_dims[0], 80, 80), torch.randn(1, fpn_dims[1], 40, 40), torch.randn(1, fpn_dims[2], 20, 20)]
-    model = build_fpn(cfg, fpn_dims, out_dim)
-
-    t0 = time.time()
-    outputs = model(pyramid_feats)
-    t1 = time.time()
-    print('Time: ', t1 - t0)
-    for out in outputs:
-        print(out.shape)
-
-    print('==============================')
-    flops, params = profile(model, inputs=(pyramid_feats, ), verbose=False)
-    print('==============================')
-    print('GFLOPs : {:.2f}'.format(flops / 1e9 * 2))
-    print('Params : {:.2f} M'.format(params / 1e6))
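One design point about the removed HybridEncoder is easy to miss in the diff: self-attention runs only on the stride-32 map (p5), after the 1x1 input projections and before the top-down/bottom-up convolutional fusion, so the quadratic attention cost covers just (H/32) x (W/32) tokens. A minimal shape sketch under the same assumptions as the __main__ block above (640x640 input, 256-dim features):

    import torch

    # Hypothetical stride-32 feature for a 640x640 image: 640 / 32 = 20.
    p5 = torch.randn(1, 256, 20, 20)
    tokens = p5.flatten(2).permute(0, 2, 1)   # [B, C, H, W] -> [B, HW, C]
    print(tokens.shape)                       # torch.Size([1, 400, 256]); only 400 tokens see self-attention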

+ 0 - 462
models/detectors/rtdetr/basic_modules/transformer.py

@@ -1,462 +0,0 @@
-import math
-import copy
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-try:
-    from .basic import FFN
-except:
-    from  basic import FFN
-
-
-def get_clones(module, N):
-    if N <= 0:
-        return None
-    else:
-        return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
-
-def inverse_sigmoid(x, eps=1e-5):
-    x = x.clamp(min=0., max=1.)
-    return torch.log(x.clamp(min=eps) / (1 - x).clamp(min=eps))
-
-
-# ----------------- Basic Transformer Ops -----------------
-def multi_scale_deformable_attn_pytorch(
-    value: torch.Tensor,
-    value_spatial_shapes: torch.Tensor,
-    sampling_locations: torch.Tensor,
-    attention_weights: torch.Tensor,
-) -> torch.Tensor:
-
-    bs, _, num_heads, embed_dims = value.shape
-    _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
-    
-    value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
-    sampling_grids = 2 * sampling_locations - 1
-    sampling_value_list = []
-    for level, (H_, W_) in enumerate(value_spatial_shapes):
-        # bs, H_*W_, num_heads, embed_dims ->
-        # bs, H_*W_, num_heads*embed_dims ->
-        # bs, num_heads*embed_dims, H_*W_ ->
-        # bs*num_heads, embed_dims, H_, W_
-        value_l_ = (
-            value_list[level].flatten(2).transpose(1, 2).reshape(bs * num_heads, embed_dims, H_, W_)
-        )
-        # bs, num_queries, num_heads, num_points, 2 ->
-        # bs, num_heads, num_queries, num_points, 2 ->
-        # bs*num_heads, num_queries, num_points, 2
-        sampling_grid_l_ = sampling_grids[:, :, :, level].transpose(1, 2).flatten(0, 1)
-        # bs*num_heads, embed_dims, num_queries, num_points
-        sampling_value_l_ = F.grid_sample(
-            value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False
-        )
-        sampling_value_list.append(sampling_value_l_)
-    # (bs, num_queries, num_heads, num_levels, num_points) ->
-    # (bs, num_heads, num_queries, num_levels, num_points) ->
-    # (bs, num_heads, 1, num_queries, num_levels*num_points)
-    attention_weights = attention_weights.transpose(1, 2).reshape(
-        bs * num_heads, 1, num_queries, num_levels * num_points
-    )
-    output = (
-        (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights)
-        .sum(-1)
-        .view(bs, num_heads * embed_dims, num_queries)
-    )
-    return output.transpose(1, 2).contiguous()
-
-class MSDeformableAttention(nn.Module):
-    def __init__(self,
-                 embed_dim=256,
-                 num_heads=8,
-                 num_levels=4,
-                 num_points=4):
-        """
-        Multi-Scale Deformable Attention Module
-        """
-        super(MSDeformableAttention, self).__init__()
-        self.embed_dim = embed_dim
-        self.num_heads = num_heads
-        self.num_levels = num_levels
-        self.num_points = num_points
-        self.total_points = num_heads * num_levels * num_points
-
-        self.head_dim = embed_dim // num_heads
-        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
-
-        self.sampling_offsets = nn.Linear(embed_dim, self.total_points * 2)
-        self.attention_weights = nn.Linear(embed_dim, self.total_points)
-        self.value_proj = nn.Linear(embed_dim, embed_dim)
-        self.output_proj = nn.Linear(embed_dim, embed_dim)
-        
-        try:
-            # use cuda op
-            from deformable_detr_ops import ms_deformable_attn
-            self.ms_deformable_attn_core = ms_deformable_attn
-        except:
-            # use torch func
-            self.ms_deformable_attn_core = multi_scale_deformable_attn_pytorch
-
-        self._reset_parameters()
-
-    def _reset_parameters(self):
-        """
-        Default initialization for Parameters of Module.
-        """
-        nn.init.constant_(self.sampling_offsets.weight.data, 0.0)
-        thetas = torch.arange(self.num_heads, dtype=torch.float32) * (
-            2.0 * math.pi / self.num_heads
-        )
-        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
-        grid_init = (
-            (grid_init / grid_init.abs().max(-1, keepdim=True)[0])
-            .view(self.num_heads, 1, 1, 2)
-            .repeat(1, self.num_levels, self.num_points, 1)
-        )
-        for i in range(self.num_points):
-            grid_init[:, :, i, :] *= i + 1
-        with torch.no_grad():
-            self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
-
-        # attention weight
-        nn.init.constant_(self.attention_weights.weight, 0.0)
-        nn.init.constant_(self.attention_weights.bias, 0.0)
-
-        # proj
-        nn.init.xavier_uniform_(self.value_proj.weight)
-        nn.init.constant_(self.value_proj.bias, 0.0)
-        nn.init.xavier_uniform_(self.output_proj.weight)
-        nn.init.constant_(self.output_proj.bias, 0.0)
-
-    def forward(self,
-                query,
-                reference_points,
-                value,
-                value_spatial_shapes,
-                value_mask=None):
-        """
-        Args:
-            query (Tensor): [bs, query_length, C]
-            reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0),
-                bottom-right (1, 1), including padding area
-            value (Tensor): [bs, value_length, C]
-            value_spatial_shapes (Tensor): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
-            value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements
-
-        Returns:
-            output (Tensor): [bs, Length_{query}, C]
-        """
-        bs, num_query = query.shape[:2]
-        num_value = value.shape[1]
-        assert sum([s[0] * s[1] for s in value_spatial_shapes]) == num_value
-
-        # Value projection
-        value = self.value_proj(value)
-        # fill "0" for the padding part
-        if value_mask is not None:
-            value_mask = value_mask.to(value.dtype).unsqueeze(-1)
-            value *= value_mask
-        # [bs, all_hw, 256] -> [bs, all_hw, num_head, head_dim]
-        value = value.reshape([bs, num_value, self.num_heads, -1])
-
-        # [bs, all_hw, num_head, num_level, num_sample_point, num_offset]
-        sampling_offsets = self.sampling_offsets(query).reshape(
-            [bs, num_query, self.num_heads, self.num_levels, self.num_points, 2])
-        # [bs, all_hw, num_head, num_level*num_sample_point]
-        attention_weights = self.attention_weights(query).reshape(
-            [bs, num_query, self.num_heads, self.num_levels * self.num_points])
-        # [bs, all_hw, num_head, num_level, num_sample_point]
-        attention_weights = attention_weights.softmax(-1).reshape(
-            [bs, num_query, self.num_heads, self.num_levels, self.num_points])
-
-        # [bs, num_query, num_heads, num_levels, num_points, 2]
-        if reference_points.shape[-1] == 2:
-            # reference_points   [bs, all_hw, num_level, 2] -> [bs, all_hw, 1, num_level, 1, 2]
-            # sampling_offsets   [bs, all_hw, num_head, num_level, num_sample_point, 2]
-            # offset_normalizer  [num_level, 2] -> [1, 1, 1, num_level, 1, 2]
-            # reference_points + sampling_offsets
-            offset_normalizer = value_spatial_shapes.flip([1]).reshape(
-                [1, 1, 1, self.num_levels, 1, 2])
-            sampling_locations = (
-                reference_points[:, :, None, :, None, :]
-                + sampling_offsets / offset_normalizer
-            )
-        elif reference_points.shape[-1] == 4:
-            sampling_locations = (
-                reference_points[:, :, None, :, None, :2]
-                + sampling_offsets
-                / self.num_points
-                * reference_points[:, :, None, :, None, 2:]
-                * 0.5)
-        else:
-            raise ValueError(
-                "Last dim of reference_points must be 2 or 4, but get {} instead.".
-                format(reference_points.shape[-1]))
-
-        # Multi-scale Deformable attention
-        output = self.ms_deformable_attn_core(
-            value, value_spatial_shapes, sampling_locations, attention_weights)
-        
-        # Output project
-        output = self.output_proj(output)
-
-        return output
-
-
-# ----------------- Transformer modules -----------------
-## Transformer Encoder layer
-class TransformerEncoderLayer(nn.Module):
-    def __init__(self,
-                 d_model         :int   = 256,
-                 num_heads       :int   = 8,
-                 ffn_dim         :int   = 1024,
-                 dropout         :float = 0.1,
-                 act_type        :str   = "relu",
-                 ):
-        super().__init__()
-        # ----------- Basic parameters -----------
-        self.d_model = d_model
-        self.num_heads = num_heads
-        self.ffn_dim = ffn_dim
-        self.dropout = dropout
-        self.act_type = act_type
-        # ----------- Basic parameters -----------
-        # Multi-head Self-Attn
-        self.self_attn = nn.MultiheadAttention(d_model, num_heads, dropout=dropout, batch_first=True)
-        self.dropout = nn.Dropout(dropout)
-        self.norm = nn.LayerNorm(d_model)
-
-        # Feedforward Network
-        self.ffn = FFN(d_model, ffn_dim, dropout, act_type)
-
-    def with_pos_embed(self, tensor, pos):
-        return tensor if pos is None else tensor + pos
-
-    def forward(self, src, pos_embed):
-        """
-        Input:
-            src:       [torch.Tensor] -> [B, N, C]
-            pos_embed: [torch.Tensor] -> [B, N, C]
-        Output:
-            src:       [torch.Tensor] -> [B, N, C]
-        """
-        q = k = self.with_pos_embed(src, pos_embed)
-
-        # -------------- MHSA --------------
-        src2 = self.self_attn(q, k, value=src)[0]
-        src = src + self.dropout(src2)
-        src = self.norm(src)
-
-        # -------------- FFN --------------
-        src = self.ffn(src)
-        
-        return src
-
-## Transformer Encoder
-class TransformerEncoder(nn.Module):
-    def __init__(self,
-                 d_model        :int   = 256,
-                 num_heads      :int   = 8,
-                 num_layers     :int   = 1,
-                 ffn_dim        :int   = 1024,
-                 pe_temperature : float = 10000.,
-                 dropout        :float = 0.1,
-                 act_type       :str   = "relu",
-                 ):
-        super().__init__()
-        # ----------- Basic parameters -----------
-        self.d_model = d_model
-        self.num_heads = num_heads
-        self.num_layers = num_layers
-        self.ffn_dim = ffn_dim
-        self.dropout = dropout
-        self.act_type = act_type
-        self.pe_temperature = pe_temperature
-        self.pos_embed = None
-        # ----------- Basic parameters -----------
-        self.encoder_layers = get_clones(
-            TransformerEncoderLayer(d_model, num_heads, ffn_dim, dropout, act_type), num_layers)
-
-    def build_2d_sincos_position_embedding(self, device, w, h, embed_dim=256, temperature=10000.):
-        assert embed_dim % 4 == 0, \
-            'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
-        
-        # ----------- Check cached pos_embed -----------
-        if self.pos_embed is not None and \
-            self.pos_embed.shape[2:] == [h, w]:
-            return self.pos_embed
-        
-        # ----------- Generate grid coords -----------
-        grid_w = torch.arange(int(w), dtype=torch.float32)
-        grid_h = torch.arange(int(h), dtype=torch.float32)
-        grid_w, grid_h = torch.meshgrid([grid_w, grid_h])  # shape: [H, W]
-
-        pos_dim = embed_dim // 4
-        omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
-        omega = 1. / (temperature**omega)
-
-        out_w = grid_w.flatten()[..., None] @ omega[None] # shape: [N, C]
-        out_h = grid_h.flatten()[..., None] @ omega[None] # shape: [N, C]
-
-        # shape: [1, N, C]
-        pos_embed = torch.cat([torch.sin(out_w), torch.cos(out_w), torch.sin(out_h),torch.cos(out_h)], dim=1)[None, :, :]
-        pos_embed = pos_embed.to(device)
-        self.pos_embed = pos_embed
-
-        return pos_embed
-
-    def forward(self, src):
-        """
-        Input:
-            src:  [torch.Tensor] -> [B, C, H, W]
-        Output:
-            src:  [torch.Tensor] -> [B, C, H, W]
-        """
-        # -------- Transformer encoder --------
-        channels, fmp_h, fmp_w = src.shape[1:]
-        # [B, C, H, W] -> [B, N, C], N=HxW
-        src_flatten = src.flatten(2).permute(0, 2, 1).contiguous()
-        memory = src_flatten
-
-        # PosEmbed: [1, N, C]
-        pos_embed = self.build_2d_sincos_position_embedding(
-            src.device, fmp_w, fmp_h, channels, self.pe_temperature)
-        
-        # Transformer Encoder layer
-        for encoder in self.encoder_layers:
-            memory = encoder(memory, pos_embed=pos_embed)
-
-        # Output: [B, N, C] -> [B, C, N] -> [B, C, H, W]
-        src = memory.permute(0, 2, 1).contiguous()
-        src = src.view([-1, channels, fmp_h, fmp_w])
-
-        return src
-
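For reference, a minimal standalone sketch of the 2D sine-cosine position embedding built above, using the same defaults (embed_dim divisible by 4, temperature 10000); the function name is illustrative:

import torch

def sincos_pos_embed_2d(w, h, embed_dim=256, temperature=10000.0):
    # Each spatial axis receives embed_dim // 4 sine and embed_dim // 4 cosine channels.
    assert embed_dim % 4 == 0
    grid_w, grid_h = torch.meshgrid(torch.arange(w, dtype=torch.float32),
                                    torch.arange(h, dtype=torch.float32), indexing="ij")
    pos_dim = embed_dim // 4
    omega = 1.0 / temperature ** (torch.arange(pos_dim, dtype=torch.float32) / pos_dim)
    out_w = grid_w.flatten()[:, None] * omega[None]   # [N, pos_dim], N = H * W
    out_h = grid_h.flatten()[:, None] * omega[None]
    # [1, N, embed_dim]
    return torch.cat([out_w.sin(), out_w.cos(), out_h.sin(), out_h.cos()], dim=1)[None]

print(sincos_pos_embed_2d(20, 20).shape)  # torch.Size([1, 400, 256])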
-## Transformer Decoder layer
-class DeformableTransformerDecoderLayer(nn.Module):
-    def __init__(self,
-                 d_model     :int   = 256,
-                 num_heads   :int   = 8,
-                 num_levels  :int   = 3,
-                 num_points  :int   = 4,
-                 ffn_dim     :int   = 1024,
-                 dropout     :float = 0.1,
-                 act_type    :str   = "relu",
-                 ):
-        super().__init__()
-        # ----------- Basic parameters -----------
-        self.d_model = d_model
-        self.num_heads = num_heads
-        self.num_levels = num_levels
-        self.num_points = num_points
-        self.ffn_dim = ffn_dim
-        self.dropout = dropout
-        self.act_type = act_type
-        # ---------------- Network parameters ----------------
-        ## Multi-head Self-Attn
-        self.self_attn  = nn.MultiheadAttention(d_model, num_heads, dropout=dropout, batch_first=True)
-        self.dropout1 = nn.Dropout(dropout)
-        self.norm1 = nn.LayerNorm(d_model)
-        ## CrossAttention
-        self.cross_attn = MSDeformableAttention(d_model, num_heads, num_levels, num_points)
-        self.dropout2 = nn.Dropout(dropout)
-        self.norm2 = nn.LayerNorm(d_model)
-        ## FFN
-        self.ffn = FFN(d_model, ffn_dim, dropout, act_type)
-
-    def with_pos_embed(self, tensor, pos):
-        return tensor if pos is None else tensor + pos
-
-    def forward(self,
-                tgt,
-                reference_points,
-                memory,
-                memory_spatial_shapes,
-                attn_mask=None,
-                memory_mask=None,
-                query_pos_embed=None):
-        # ---------------- MHSA for Object Query -----------------
-        q = k = self.with_pos_embed(tgt, query_pos_embed)
-        tgt2 = self.self_attn(q, k, value=tgt, attn_mask=attn_mask)[0]
-        tgt = tgt + self.dropout1(tgt2)
-        tgt = self.norm1(tgt)
-
-        # ---------------- CMHA for Object Query and Image-feature -----------------
-        tgt2 = self.cross_attn(self.with_pos_embed(tgt, query_pos_embed),
-                               reference_points,
-                               memory,
-                               memory_spatial_shapes,
-                               memory_mask)
-        tgt = tgt + self.dropout2(tgt2)
-        tgt = self.norm2(tgt)
-
-        # ---------------- FeedForward Network -----------------
-        tgt = self.ffn(tgt)
-
-        return tgt
-
-## Transformer Decoder
-class DeformableTransformerDecoder(nn.Module):
-    def __init__(self,
-                 d_model        :int   = 256,
-                 num_heads      :int   = 8,
-                 num_layers     :int   = 1,
-                 num_levels     :int   = 3,
-                 num_points     :int   = 4,
-                 ffn_dim        :int   = 1024,
-                 dropout        :float = 0.1,
-                 act_type       :str   = "relu",
-                 return_intermediate :bool = False,
-                 ):
-        super().__init__()
-        # ----------- Basic parameters -----------
-        self.d_model = d_model
-        self.num_heads = num_heads
-        self.num_layers = num_layers
-        self.ffn_dim = ffn_dim
-        self.dropout = dropout
-        self.act_type = act_type
-        self.pos_embed = None
-        # ----------- Network parameters -----------
-        self.decoder_layers = get_clones(
-            DeformableTransformerDecoderLayer(d_model, num_heads, num_levels, num_points, ffn_dim, dropout, act_type), num_layers)
-        self.num_layers = num_layers
-        self.return_intermediate = return_intermediate
-
-    def forward(self,
-                tgt,
-                ref_points_unact,
-                memory,
-                memory_spatial_shapes,
-                bbox_head,
-                score_head,
-                query_pos_head,
-                attn_mask=None,
-                memory_mask=None):
-        output = tgt
-        dec_out_bboxes = []
-        dec_out_logits = []
-        ref_points_detach = F.sigmoid(ref_points_unact)
-        for i, layer in enumerate(self.decoder_layers):
-            ref_points_input = ref_points_detach.unsqueeze(2)
-            query_pos_embed = query_pos_head(ref_points_detach)
-
-            output = layer(output, ref_points_input, memory,
-                           memory_spatial_shapes, attn_mask,
-                           memory_mask, query_pos_embed)
-
-            inter_ref_bbox = F.sigmoid(bbox_head[i](output) + inverse_sigmoid(ref_points_detach))
-
-            dec_out_logits.append(score_head[i](output))
-            if i == 0:
-                dec_out_bboxes.append(inter_ref_bbox)
-            else:
-                dec_out_bboxes.append(
-                    F.sigmoid(bbox_head[i](output) + inverse_sigmoid(ref_points)))
-
-            ref_points = inter_ref_bbox
-            ref_points_detach = inter_ref_bbox.detach() if self.training else inter_ref_bbox
-
-        return torch.stack(dec_out_bboxes), torch.stack(dec_out_logits)
-
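The decoder loop above refines boxes in logit space: each layer's bbox head predicts an additive delta on the inverse-sigmoid of the previous reference points. A toy single-step sketch, assuming only torch (the delta values are made up):

import torch

def inverse_sigmoid(x, eps=1e-5):
    x = x.clamp(min=0, max=1)
    return torch.log(x.clamp(min=eps) / (1 - x).clamp(min=eps))

ref_points = torch.tensor([[0.50, 0.50, 0.20, 0.20]])   # cxcywh in [0, 1]
delta      = torch.tensor([[0.40, -0.40, 0.00, 0.10]])  # stand-in for bbox_head[i](output)
refined = torch.sigmoid(delta + inverse_sigmoid(ref_points))
print(refined)  # approx. [[0.5987, 0.4013, 0.2000, 0.2165]]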

+ 0 - 37
models/detectors/rtdetr/build.py

@@ -1,37 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding:utf-8 -*-
-
-import torch
-import torch.nn as nn
-
-from .loss import build_criterion
-from .rtdetr import RT_DETR
-
-
-# build object detector
-def build_rtdetr(args, cfg, num_classes=80, trainable=False, deploy=False):
-    print('==============================')
-    print('Build {} ...'.format(args.model.upper()))
-    
-    print('==============================')
-    print('Model Configuration: \n', cfg)
-    
-    # -------------- Build RT-DETR --------------
-    model = RT_DETR(cfg             = cfg,
-                    num_classes     = num_classes,
-                    nms_thresh      = args.nms_thresh,
-                    conf_thresh     = args.conf_thresh,
-                    topk            = 300,
-                    onnx_deploy     = deploy,
-                    no_multi_labels = args.no_multi_labels,
-                    use_nms         = True,   # NMS is beneficial 
-                    nms_class_agnostic = args.nms_class_agnostic
-                    )
-            
-    # -------------- Build criterion --------------
-    criterion = None
-    if trainable:
-        # build criterion for training
-        criterion = build_criterion(cfg, num_classes)
-        
-    return model, criterion
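A hedged usage sketch of the builder above: the argparse fields mirror those read inside build_rtdetr, and cfg is assumed to be the matching entry from the RT-DETR model config (both imports commented since the modules live elsewhere in the repo):

from types import SimpleNamespace

args = SimpleNamespace(model='rtdetr_r18', nms_thresh=0.7, conf_thresh=0.001,
                       no_multi_labels=False, nms_class_agnostic=False)
# cfg = rtdetr_cfg[args.model]   # from config/model_config/rtdetr_config.py
# model, criterion = build_rtdetr(args, cfg, num_classes=80, trainable=True)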

+ 0 - 201
models/detectors/rtdetr/loss.py

@@ -1,201 +0,0 @@
-"""
-reference: 
-https://github.com/facebookresearch/detr/blob/main/models/detr.py
-
-by lyuwenyu
-"""
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-try:
-    from .loss_utils import box_cxcywh_to_xyxy, box_iou, generalized_box_iou
-    from .loss_utils import is_dist_avail_and_initialized, get_world_size
-    from .matcher import HungarianMatcher
-except:
-    from loss_utils import box_cxcywh_to_xyxy, box_iou, generalized_box_iou
-    from loss_utils import is_dist_avail_and_initialized, get_world_size
-    from matcher import HungarianMatcher
-
-
-# --------------- Criterion for RT-DETR ---------------
-def build_criterion(cfg, num_classes=80):
-    matcher = HungarianMatcher(cfg['matcher_hpy'], alpha=0.25, gamma=2.0)
-    weight_dict = {'loss_cls':  cfg['loss_coeff']['class'],
-                   'loss_box':  cfg['loss_coeff']['bbox'],
-                   'loss_giou': cfg['loss_coeff']['giou']}
-    criterion = Criterion(matcher, weight_dict, num_classes=num_classes)
-
-    return criterion
-
-
-class Criterion(nn.Module):
-    """ This class computes the loss for DETR.
-    The process happens in two steps:
-        1) we compute hungarian assignment between ground truth boxes and the outputs of the model
-        2) we supervise each pair of matched ground-truth / prediction (supervise class and box)
-    """
-    def __init__(self, matcher, weight_dict, num_classes=80):
-        """ Create the criterion.
-        Parameters:
-            num_classes: number of object categories, omitting the special no-object category
-            matcher: module able to compute a matching between targets and proposals
-            weight_dict: dict containing as key the names of the losses and as values their relative weight.
-            eos_coef: relative classification weight applied to the no-object category
-            losses: list of all the losses to be applied. See get_loss for list of available losses.
-        """
-        super().__init__()
-        self.num_classes = num_classes
-        self.matcher = matcher
-        self.weight_dict = weight_dict
-        self.losses = ['labels', 'boxes']
-
-        self.alpha = 0.75  # For VFL
-        self.gamma = 2.0
-
-    def loss_labels(self, outputs, targets, indices, num_boxes):
-        "Compute variable focal loss"
-        assert 'pred_boxes' in outputs
-        idx = self._get_src_permutation_idx(indices)
-        # Compute IoU between pred and target
-        src_boxes = outputs['pred_boxes'][idx]
-        target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0)
-        ious, _ = box_iou(box_cxcywh_to_xyxy(src_boxes), box_cxcywh_to_xyxy(target_boxes))
-        ious = torch.diag(ious).detach()
-
-        # One-hot class label
-        src_logits = outputs['pred_logits']
-        target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)])
-        target_classes = torch.full(src_logits.shape[:2], self.num_classes,
-                                    dtype=torch.int64, device=src_logits.device)
-        target_classes[idx] = target_classes_o
-        target = F.one_hot(target_classes, num_classes=self.num_classes + 1)[..., :-1]
-
-        # Iou-aware class label
-        target_score_o = torch.zeros_like(target_classes, dtype=src_logits.dtype)
-        target_score_o[idx] = ious.to(target_score_o.dtype)
-        target_score = target_score_o.unsqueeze(-1) * target
-
-        # Compute VFL
-        pred_score = F.sigmoid(src_logits).detach()
-        weight = self.alpha * pred_score.pow(self.gamma) * (1 - target) + target_score
-        
-        loss = F.binary_cross_entropy_with_logits(src_logits, target_score, weight=weight, reduction='none')
-        loss = loss.mean(1).sum() * src_logits.shape[1] / num_boxes
-
-        return {'loss_cls': loss}
-
-    def loss_boxes(self, outputs, targets, indices, num_boxes):
-        """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss
-           targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
-           The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
-        """
-        assert 'pred_boxes' in outputs
-        idx = self._get_src_permutation_idx(indices)
-        src_boxes = outputs['pred_boxes'][idx]
-        target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0)
-
-        losses = {}
-
-        loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none')
-        losses['loss_box'] = loss_bbox.sum() / num_boxes
-
-        loss_giou = 1 - torch.diag(generalized_box_iou(
-                box_cxcywh_to_xyxy(src_boxes),
-                box_cxcywh_to_xyxy(target_boxes)))
-        losses['loss_giou'] = loss_giou.sum() / num_boxes
-        return losses
-
-    def _get_src_permutation_idx(self, indices):
-        # permute predictions following indices
-        batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
-        src_idx = torch.cat([src for (src, _) in indices])
-        return batch_idx, src_idx
-
-    def _get_tgt_permutation_idx(self, indices):
-        # permute targets following indices
-        batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
-        tgt_idx = torch.cat([tgt for (_, tgt) in indices])
-        return batch_idx, tgt_idx
-
-    def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs):
-        loss_map = {
-            'boxes': self.loss_boxes,
-            'labels': self.loss_labels,
-        }
-        assert loss in loss_map, f'do you really want to compute {loss} loss?'
-        return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs)
-
-    def forward(self, outputs, targets):
-        """ This performs the loss computation.
-        Parameters:
-             outputs: dict of tensors, see the output specification of the model for the format
-             targets: list of dicts, such that len(targets) == batch_size.
-                      The expected keys in each dict depends on the losses applied, see each loss' doc
-        """
-        outputs_without_aux = {k: v for k, v in outputs.items() if 'aux' not in k}
-
-        # Retrieve the matching between the outputs of the last layer and the targets
-        indices = self.matcher(outputs_without_aux, targets)
-
-        # Compute the average number of target boxes across all nodes, for normalization purposes
-        num_boxes = sum(len(t["labels"]) for t in targets)
-        num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
-        if is_dist_avail_and_initialized():
-            torch.distributed.all_reduce(num_boxes)
-        num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item()
-
-        # Compute all the requested losses
-        losses = {}
-        for loss in self.losses:
-            l_dict = self.get_loss(loss, outputs, targets, indices, num_boxes)
-            l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
-            losses.update(l_dict)
-
-        # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
-        if 'aux_outputs' in outputs:
-            for i, aux_outputs in enumerate(outputs['aux_outputs']):
-                indices = self.matcher(aux_outputs, targets)
-                for loss in self.losses:
-                    l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_boxes)
-                    l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
-                    l_dict = {k + f'_aux_{i}': v for k, v in l_dict.items()}
-                    losses.update(l_dict)
-
-        # In case of CDN (contrastive denoising) auxiliary losses, used by RT-DETR
-        if 'dn_aux_outputs' in outputs:
-            assert 'dn_meta' in outputs, ''
-            indices = self.get_cdn_matched_indices(outputs['dn_meta'], targets)
-            num_boxes = num_boxes * outputs['dn_meta']['dn_num_group']
-
-            for i, aux_outputs in enumerate(outputs['dn_aux_outputs']):
-                # indices = self.matcher(aux_outputs, targets)
-                for loss in self.losses:
-                    l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_boxes)
-                    l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
-                    l_dict = {k + f'_dn_{i}': v for k, v in l_dict.items()}
-                    losses.update(l_dict)
-
-        return losses
-
-    @staticmethod
-    def get_cdn_matched_indices(dn_meta, targets):
-        '''get_cdn_matched_indices
-        '''
-        dn_positive_idx, dn_num_group = dn_meta["dn_positive_idx"], dn_meta["dn_num_group"]
-        num_gts = [len(t['labels']) for t in targets]
-        device = targets[0]['labels'].device
-        
-        dn_match_indices = []
-        for i, num_gt in enumerate(num_gts):
-            if num_gt > 0:
-                gt_idx = torch.arange(num_gt, dtype=torch.int64, device=device)
-                gt_idx = gt_idx.tile(dn_num_group)
-                assert len(dn_positive_idx[i]) == len(gt_idx)
-                dn_match_indices.append((dn_positive_idx[i], gt_idx))
-            else:
-                dn_match_indices.append((torch.zeros(0, dtype=torch.int64, device=device), \
-                    torch.zeros(0, dtype=torch.int64,  device=device)))
-        
-        return dn_match_indices
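A small numeric sketch of the IoU-aware target built in loss_labels above: the one-hot class target of each matched query is scaled by the IoU of its matched box, so the BCE target becomes a soft score rather than a hard 1. Here, 3 queries, 4 classes, query 1 matched to class 2 with IoU 0.8 (numbers are illustrative):

import torch
import torch.nn.functional as F

num_classes = 4
target_classes = torch.tensor([[4, 2, 4]])                               # 4 == background slot
target = F.one_hot(target_classes, num_classes + 1)[..., :-1].float()    # [1, 3, 4]
target_score = torch.tensor([[0.0, 0.8, 0.0]]).unsqueeze(-1) * target
print(target_score[0, 1])  # tensor([0.0000, 0.0000, 0.8000, 0.0000])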

+ 0 - 240
models/detectors/rtdetr/loss_utils.py

@@ -1,240 +0,0 @@
-import math
-import torch
-import torch.nn.functional as F
-import torch.distributed as dist
-from torchvision.ops.boxes import box_area
-
-
-# ------------------------- For loss -------------------------
-## FocalLoss
-def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2):
-    """
-    Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
-    Args:
-        inputs: A float tensor of arbitrary shape.
-                The predictions for each example.
-        targets: A float tensor with the same shape as inputs. Stores the binary
-                 classification label for each element in inputs
-                (0 for the negative class and 1 for the positive class).
-        alpha: (optional) Weighting factor in range (0,1) to balance
-                positive vs negative examples. Default = 0.25.
-        gamma: Exponent of the modulating factor (1 - p_t) to
-               balance easy vs hard examples.
-    Returns:
-        Loss tensor
-    """
-    prob = inputs.sigmoid()
-    ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
-    p_t = prob * targets + (1 - prob) * (1 - targets)
-    loss = ce_loss * ((1 - p_t) ** gamma)
-
-    if alpha >= 0:
-        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
-        loss = alpha_t * loss
-
-    return loss.mean(1).sum() / num_boxes
-
-## Varifocal Loss (VFL)
-def varifocal_loss_with_logits(pred_logits,
-                               gt_score,
-                               label,
-                               normalizer=1.0,
-                               alpha=0.75,
-                               gamma=2.0):
-    pred_score = F.sigmoid(pred_logits)
-    weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label
-    loss = F.binary_cross_entropy_with_logits(pred_logits, gt_score, reduction='none')
-    loss = loss * weight
-
-    return loss.mean(1).sum() / normalizer
-
-## InverseSigmoid
-def inverse_sigmoid(x, eps=1e-5):
-    x = x.clamp(min=0, max=1)
-    x1 = x.clamp(min=eps)
-    x2 = (1 - x).clamp(min=eps)
-    return torch.log(x1/x2)
-
-## GIoU loss
-class GIoULoss(object):
-    """ Modified GIoULoss from Paddle-Paddle"""
-    def __init__(self, eps=1e-10, reduction='none'):
-        self.eps = eps
-        self.reduction = reduction
-        assert reduction in ('none', 'mean', 'sum')
-
-    def bbox_overlap(self, box1, box2, eps=1e-10):
-        """calculate the iou of box1 and box2
-        Args:
-            box1 (Tensor): box1 with the shape (..., 4)
-            box2 (Tensor): box2 with the shape (..., 4)
-            eps (float): epsilon to avoid divide by zero
-        Return:
-            iou (Tensor): iou of box1 and box2
-            overlap (Tensor): overlap of box1 and box2
-            union (Tensor): union of box1 and box2
-        """
-        x1, y1, x2, y2 = box1
-        x1g, y1g, x2g, y2g = box2
-
-        xkis1 = torch.max(x1, x1g)
-        ykis1 = torch.max(y1, y1g)
-        xkis2 = torch.min(x2, x2g)
-        ykis2 = torch.min(y2, y2g)
-        w_inter = (xkis2 - xkis1).clip(0)
-        h_inter = (ykis2 - ykis1).clip(0)
-        overlap = w_inter * h_inter
-
-        area1 = (x2 - x1) * (y2 - y1)
-        area2 = (x2g - x1g) * (y2g - y1g)
-        union = area1 + area2 - overlap + eps
-        iou = overlap / union
-
-        return iou, overlap, union
-
-    def __call__(self, pbox, gbox):
-        # x1, y1, x2, y2 = torch.split(pbox, 4, dim=-1)
-        # x1g, y1g, x2g, y2g = torch.split(gbox, 4, dim=-1)
-        x1, y1, x2, y2 = torch.chunk(pbox, 4, dim=-1)
-        x1g, y1g, x2g, y2g = torch.chunk(gbox, 4, dim=-1)
-        box1 = [x1, y1, x2, y2]
-        box2 = [x1g, y1g, x2g, y2g]
-        iou, _, union = self.bbox_overlap(box1, box2, self.eps)
-        xc1 = torch.min(x1, x1g)
-        yc1 = torch.min(y1, y1g)
-        xc2 = torch.max(x2, x2g)
-        yc2 = torch.max(y2, y2g)
-
-        area_c = (xc2 - xc1) * (yc2 - yc1) + self.eps
-        miou = iou - ((area_c - union) / area_c)
-        giou = 1 - miou
-
-        if self.reduction == 'none':
-            loss = giou
-        elif self.reduction == 'sum':
-            loss = giou.sum()
-        elif self.reduction == 'mean':
-            loss = giou.mean()
-
-        return loss
-
-
-# ------------------------- For box -------------------------
-def box_cxcywh_to_xyxy(x):
-    x_c, y_c, w, h = x.unbind(-1)
-    b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
-         (x_c + 0.5 * w), (y_c + 0.5 * h)]
-    return torch.stack(b, dim=-1)
-
-def box_xyxy_to_cxcywh(x):
-    x0, y0, x1, y1 = x.unbind(-1)
-    b = [(x0 + x1) / 2, (y0 + y1) / 2,
-         (x1 - x0), (y1 - y0)]
-    return torch.stack(b, dim=-1)
-
-def box_iou(boxes1, boxes2):
-    area1 = box_area(boxes1)
-    area2 = box_area(boxes2)
-
-    lt = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
-    rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]
-
-    wh = (rb - lt).clamp(min=0)  # [N,M,2]
-    inter = wh[:, :, 0] * wh[:, :, 1]  # [N,M]
-
-    union = area1[:, None] + area2 - inter
-
-    iou = inter / union
-    return iou, union
-
-def generalized_box_iou(boxes1, boxes2):
-    """
-    Generalized IoU from https://giou.stanford.edu/
-
-    The boxes should be in [x0, y0, x1, y1] format
-
-    Returns a [N, M] pairwise matrix, where N = len(boxes1)
-    and M = len(boxes2)
-    """
-    # degenerate boxes gives inf / nan results
-    # so do an early check
-    assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
-    assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
-    iou, union = box_iou(boxes1, boxes2)
-
-    lt = torch.min(boxes1[:, None, :2], boxes2[:, :2])
-    rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
-
-    wh = (rb - lt).clamp(min=0)  # [N,M,2]
-    area = wh[:, :, 0] * wh[:, :, 1]
-
-    return iou - (area - union) / area
-
-def bbox_iou(box1, box2, giou=False, diou=False, ciou=False, eps=1e-9):
-    """Modified from Paddle-paddle
-    Args:
-        box1 (list): [x, y, w, h], all have the shape [b, na, h, w, 1]
-        box2 (list): [x, y, w, h], all have the shape [b, na, h, w, 1]
-        giou (bool): whether use giou or not, default False
-        diou (bool): whether use diou or not, default False
-        ciou (bool): whether use ciou or not, default False
-        eps (float): epsilon to avoid divide by zero
-    Return:
-        iou (Tensor): iou of box1 and box2, with the shape [b, na, h, w, 1]
-    """
-    px1, py1, px2, py2 = torch.chunk(box1, 4, -1)
-    gx1, gy1, gx2, gy2 = torch.chunk(box2, 4, -1)
-    x1 = torch.max(px1, gx1)
-    y1 = torch.max(py1, gy1)
-    x2 = torch.min(px2, gx2)
-    y2 = torch.min(py2, gy2)
-
-    overlap = ((x2 - x1).clamp(0)) * ((y2 - y1).clamp(0))
-
-    area1 = (px2 - px1) * (py2 - py1)
-    area1 = area1.clamp(0)
-
-    area2 = (gx2 - gx1) * (gy2 - gy1)
-    area2 = area2.clamp(0)
-
-    union = area1 + area2 - overlap + eps
-    iou = overlap / union
-
-    if giou or ciou or diou:
-        # convex w, h
-        cw = torch.max(px2, gx2) - torch.min(px1, gx1)
-        ch = torch.max(py2, gy2) - torch.min(py1, gy1)
-        if giou:
-            c_area = cw * ch + eps
-            return iou - (c_area - union) / c_area
-        else:
-            # convex diagonal squared
-            c2 = cw**2 + ch**2 + eps
-            # center distance
-            rho2 = ((px1 + px2 - gx1 - gx2)**2 + (py1 + py2 - gy1 - gy2)**2) / 4
-            if diou:
-                return iou - rho2 / c2
-            else:
-                w1, h1 = px2 - px1, py2 - py1 + eps
-                w2, h2 = gx2 - gx1, gy2 - gy1 + eps
-                delta = torch.atan(w1 / h1) - torch.atan(w2 / h2)
-                v = (4 / math.pi**2) * torch.pow(delta, 2)
-                alpha = v / (1 + eps - iou + v)
-                alpha.requires_grad_ = False
-                return iou - (rho2 / c2 + v * alpha)
-    else:
-        return iou
-
-
-# ------------------------- For distributed -------------------------
-def is_dist_avail_and_initialized():
-    if not dist.is_available():
-        return False
-    if not dist.is_initialized():
-        return False
-    return True
-
-def get_world_size():
-    if not is_dist_avail_and_initialized():
-        return 1
-    return dist.get_world_size()
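A quick usage sketch of the box helpers above on two toy boxes, assuming the module is importable as in its own try/except fallback (IoU and GIoU values worked out by hand):

import torch
from loss_utils import box_cxcywh_to_xyxy, box_iou, generalized_box_iou

boxes1 = box_cxcywh_to_xyxy(torch.tensor([[0.5, 0.5, 0.4, 0.4]]))   # [[0.3, 0.3, 0.7, 0.7]]
boxes2 = box_cxcywh_to_xyxy(torch.tensor([[0.6, 0.6, 0.4, 0.4]]))   # [[0.4, 0.4, 0.8, 0.8]]
iou, _ = box_iou(boxes1, boxes2)              # overlap 0.09 / union 0.23
giou   = generalized_box_iou(boxes1, boxes2)  # enclosing area 0.25
print(iou.item(), giou.item())  # approx. 0.3913 0.3113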

+ 0 - 90
models/detectors/rtdetr/matcher.py

@@ -1,90 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from scipy.optimize import linear_sum_assignment
-
-try:
-    from .loss_utils import box_cxcywh_to_xyxy, generalized_box_iou
-except:
-    from  loss_utils import box_cxcywh_to_xyxy, generalized_box_iou
-
-
-class HungarianMatcher(nn.Module):
-    """This class computes an assignment between the targets and the predictions of the network
-
-    For efficiency reasons, the targets don't include the no_object. Because of this, in general,
-    there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
-    while the others are un-matched (and thus treated as non-objects).
-    """
-
-    __share__ = ['use_focal_loss', ]
-
-    def __init__(self, weight_dict, alpha=0.25, gamma=2.0):
-        """Creates the matcher
-
-        Params:
-            weight_dict: dict with keys 'cost_class', 'cost_bbox' and 'cost_giou', giving the relative
-                         weights of the classification error, the L1 box error and the GIoU loss in the matching cost
-            alpha, gamma: focal-style parameters used in the classification cost
-        """
-        super().__init__()
-        self.cost_class = weight_dict['cost_class']
-        self.cost_bbox = weight_dict['cost_bbox']
-        self.cost_giou = weight_dict['cost_giou']
-
-        self.alpha = alpha
-        self.gamma = gamma
-
-        assert self.cost_class != 0 or self.cost_bbox != 0 or self.cost_giou != 0, "all costs can't be 0"
-
-    @torch.no_grad()
-    def forward(self, outputs, targets):
-        """ Performs the matching
-
-        Params:
-            outputs: This is a dict that contains at least these entries:
-                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
-                 "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates
-
-            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
-                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
-                           objects in the target) containing the class labels
-                 "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates
-
-        Returns:
-            A list of size batch_size, containing tuples of (index_i, index_j) where:
-                - index_i is the indices of the selected predictions (in order)
-                - index_j is the indices of the corresponding selected targets (in order)
-            For each batch element, it holds:
-                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
-        """
-        bs, num_queries = outputs["pred_logits"].shape[:2]
-
-        # We flatten to compute the cost matrices in a batch
-        out_prob = F.sigmoid(outputs["pred_logits"].flatten(0, 1))
-        out_bbox = outputs["pred_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]
-
-        # Also concat the target labels and boxes
-        tgt_ids = torch.cat([v["labels"] for v in targets])
-        tgt_bbox = torch.cat([v["boxes"] for v in targets])
-
-        # Compute the classification cost
-        out_prob = out_prob[:, tgt_ids]
-        neg_cost_class = (1 - self.alpha) * (out_prob**self.gamma) * (-(1 - out_prob + 1e-8).log())
-        pos_cost_class = self.alpha * ((1 - out_prob)**self.gamma) * (-(out_prob + 1e-8).log())
-        cost_class = pos_cost_class - neg_cost_class        
-
-        # Compute the L1 cost between boxes
-        cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)
-
-        # Compute the giou cost between boxes
-        cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox))
-        
-        # Final cost matrix
-        C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
-        C = C.view(bs, num_queries, -1).cpu()
-
-        sizes = [len(v["boxes"]) for v in targets]
-        indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
-
-        return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
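For intuition, the last two lines above reduce to scipy's Hungarian solver applied to each image's slice of the cost matrix; a toy 3-query / 2-target case with made-up costs:

import torch
from scipy.optimize import linear_sum_assignment

cost = torch.tensor([[0.9, 0.1],
                     [0.2, 0.8],
                     [0.5, 0.5]])            # [num_queries, num_targets]
row_ind, col_ind = linear_sum_assignment(cost.numpy())
print(row_ind, col_ind)  # [0 1] [1 0] -> query 0 matches target 1, query 1 matches target 0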

+ 0 - 239
models/detectors/rtdetr/rtdetr.py

@@ -1,239 +0,0 @@
-import torch
-import torch.nn as nn
-
-try:
-    from .basic_modules.basic import multiclass_nms
-    from .rtdetr_encoder import build_image_encoder
-    from .rtdetr_decoder import build_transformer
-except:
-    from  basic_modules.basic import multiclass_nms
-    from  rtdetr_encoder import build_image_encoder
-    from  rtdetr_decoder import build_transformer
-
-
-# Real-time DETR
-class RT_DETR(nn.Module):
-    def __init__(self,
-                 cfg,
-                 num_classes = 80,
-                 conf_thresh = 0.1,
-                 nms_thresh  = 0.5,
-                 topk        = 300,
-                 onnx_deploy = False,
-                 no_multi_labels = False,
-                 use_nms     = False,
-                 nms_class_agnostic = False,
-                 ):
-        super().__init__()
-        # ----------- Basic setting -----------
-        self.num_classes = num_classes
-        self.num_topk = topk
-        self.onnx_deploy = onnx_deploy
-        ## Post-process parameters
-        self.use_nms = use_nms
-        self.nms_thresh = nms_thresh
-        self.conf_thresh = conf_thresh
-        self.no_multi_labels = no_multi_labels
-        self.nms_class_agnostic = nms_class_agnostic
-
-        # ----------- Network setting -----------
-        ## Image encoder
-        self.image_encoder = build_image_encoder(cfg)
-        self.fpn_dims = self.image_encoder.fpn_dims
-
-        ## Detect decoder
-        self.detect_decoder = build_transformer(cfg, self.fpn_dims, num_classes, return_intermediate=self.training)
-
-    def deploy(self):
-        assert not self.training
-        for m in self.modules():
-            if hasattr(m, 'convert_to_deploy'):
-                m.convert_to_deploy()
-        return self 
-
-    def post_process(self, box_pred, cls_pred):
-        # xywh -> xyxy
-        box_preds_x1y1 = box_pred[..., :2] - 0.5 * box_pred[..., 2:]
-        box_preds_x2y2 = box_pred[..., :2] + 0.5 * box_pred[..., 2:]
-        box_pred = torch.cat([box_preds_x1y1, box_preds_x2y2], dim=-1)
-        
-        cls_pred = cls_pred[0]
-        box_pred = box_pred[0]
-        if self.no_multi_labels:
-            # [M,]
-            scores, labels = torch.max(cls_pred.sigmoid(), dim=1)
-
-            # Keep top k top scoring indices only.
-            num_topk = min(self.num_topk, box_pred.size(0))
-
-            # Topk candidates
-            predicted_prob, topk_idxs = scores.sort(descending=True)
-            topk_scores = predicted_prob[:num_topk]
-            topk_idxs = topk_idxs[:num_topk]
-
-            # Filter out the proposals with low confidence score
-            keep_idxs = topk_scores > self.conf_thresh
-            topk_idxs = topk_idxs[keep_idxs]
-
-            # Top-k results
-            topk_scores = topk_scores[keep_idxs]
-            topk_labels = labels[topk_idxs]
-            topk_bboxes = box_pred[topk_idxs]
-
-        else:
-            # Top-k select
-            cls_pred = cls_pred.flatten().sigmoid_()
-            box_pred = box_pred
-
-            # Keep top k top scoring indices only.
-            num_topk = min(self.num_topk, box_pred.size(0))
-
-            # Topk candidates
-            predicted_prob, topk_idxs = cls_pred.sort(descending=True)
-            topk_scores = predicted_prob[:num_topk]
-            topk_idxs = topk_idxs[:num_topk]
-
-            # Filter out the proposals with low confidence score
-            keep_idxs = topk_scores > self.conf_thresh
-            topk_scores = topk_scores[keep_idxs]
-            topk_idxs = topk_idxs[keep_idxs]
-            topk_box_idxs = torch.div(topk_idxs, self.num_classes, rounding_mode='floor')
-
-            ## Top-k results
-            topk_labels = topk_idxs % self.num_classes
-            topk_bboxes = box_pred[topk_box_idxs]
-
-        if not self.onnx_deploy:
-            topk_scores = topk_scores.cpu().numpy()
-            topk_labels = topk_labels.cpu().numpy()
-            topk_bboxes = topk_bboxes.cpu().numpy()
-
-            # nms
-            if self.use_nms:
-                topk_scores, topk_labels, topk_bboxes = multiclass_nms(
-                    topk_scores, topk_labels, topk_bboxes, self.nms_thresh, self.num_classes, self.nms_class_agnostic)
-
-        return topk_bboxes, topk_scores, topk_labels
-    
-    def forward(self, x, targets=None):
-        # ----------- Image Encoder -----------
-        pyramid_feats = self.image_encoder(x)
-
-        # ----------- Transformer -----------
-        outputs = self.detect_decoder(pyramid_feats, targets)
-
-        if not self.training:
-            img_h, img_w = x.shape[2:]
-            box_pred = outputs["pred_boxes"]
-            cls_pred = outputs["pred_logits"]
-
-            # rescale bbox
-            box_pred[..., [0, 2]] *= img_w
-            box_pred[..., [1, 3]] *= img_h
-            
-            # post-process
-            bboxes, scores, labels = self.post_process(box_pred, cls_pred)
-
-            outputs = {
-                "scores": scores,
-                "labels": labels,
-                "bboxes": bboxes,
-            }
-
-        return outputs
-        
-
-if __name__ == '__main__':
-    import time
-    from thop import profile
-    from loss import build_criterion
-
-    # Model config
-    cfg = {
-        # Image Encoder - Backbone
-        'backbone': 'resnet101',
-        'backbone_norm': 'BN',
-        'res5_dilation': False,
-        'pretrained': False,
-        'pretrained_weight': 'imagenet1k_v1',
-        'freeze_at': 0,
-        'freeze_stem_only': False,
-        'out_stride': [8, 16, 32],
-        'max_stride': 32,
-        # Image Encoder - FPN
-        'fpn': 'hybrid_encoder',
-        'fpn_num_blocks': 3,
-        'fpn_expansion': 0.5,
-        'fpn_act': 'silu',
-        'fpn_norm': 'BN',
-        'fpn_depthwise': False,
-        'hidden_dim': 384,
-        'en_num_heads': 8,
-        'en_num_layers': 1,
-        'en_ffn_dim': 2048,
-        'en_dropout': 0.0,
-        'pe_temperature': 10000.,
-        'en_act': 'gelu',
-        # Transformer Decoder
-        'transformer': 'rtdetr_transformer',
-        'de_num_heads': 8,
-        'de_num_layers': 6,
-        'de_ffn_dim': 2048,
-        'de_dropout': 0.0,
-        'de_act': 'gelu',
-        'de_num_points': 4,
-        'num_queries': 300,
-        'learnt_init_query': False,
-        'pe_temperature': 10000.,
-        'dn_num_denoising': 100,
-        'dn_label_noise_ratio': 0.5,
-        'dn_box_noise_scale': 1,
-        # Matcher
-        'matcher_hpy': {'cost_class': 2.0,
-                        'cost_bbox': 5.0,
-                        'cost_giou': 2.0,},
-        # Loss
-        'use_vfl': True,
-        'loss_coeff': {'class': 1,
-                       'bbox': 5,
-                       'giou': 2,
-                       'no_object': 0.1,},
-        }
-    bs = 1
-    # Create a batch of images & targets
-    image = torch.randn(bs, 3, 640, 640).cuda()
-    targets = [{
-        'labels': torch.tensor([2, 4, 5, 8]).long().cuda(),
-        'boxes':  torch.tensor([[0, 0, 10, 10], [12, 23, 56, 70], [0, 10, 20, 30], [50, 60, 55, 150]]).float().cuda() / 640.
-    }] * bs
-
-    # Create model
-    model = RT_DETR(cfg, num_classes=20)
-    model.train().cuda()
-
-    # Create criterion
-    criterion = build_criterion(cfg, num_classes=20)
-
-    # Model inference
-    outputs = model(image, targets)
-
-    # Compute loss
-    loss = criterion(outputs, targets)
-    for k in loss.keys():
-        print("{} : {}".format(k, loss[k].item()))
-
-    # Inference
-    with torch.no_grad():
-        model.eval()
-        model.deploy()
-        t0 = time.time()
-        outputs = model(image)
-        t1 = time.time()
-        print('Infer time: ', t1 - t0)
-
-    print('==============================')
-    model.eval()
-    flops, params = profile(model, inputs=(image, ), verbose=False)
-    print('==============================')
-    print('GFLOPs : {:.2f}'.format(flops / 1e9 * 2))
-    print('Params : {:.2f} M'.format(params / 1e6))
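The multi-label branch of post_process above flattens the [num_queries, num_classes] score map, takes the global top-k, then recovers the query index with an integer divide and the class with a modulo; a tiny sketch with random scores, 4 queries and 3 classes:

import torch

num_classes = 3
scores = torch.rand(4, num_classes).flatten()                          # [num_queries * num_classes]
topk_scores, topk_idxs = scores.topk(2)
box_idxs = torch.div(topk_idxs, num_classes, rounding_mode='floor')    # which query
labels   = topk_idxs % num_classes                                     # which class
print(topk_scores, box_idxs, labels)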

+ 0 - 375
models/detectors/rtdetr/rtdetr_decoder.py

@@ -1,375 +0,0 @@
-import math
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from torch.nn.init import constant_, xavier_uniform_, uniform_, normal_
-from typing import List
-
-try:
-    from .basic_modules.basic import BasicConv, MLP
-    from .basic_modules.transformer import DeformableTransformerDecoder
-    from .basic_modules.dn_compoments import get_contrastive_denoising_training_group
-except:
-    from  basic_modules.basic import BasicConv, MLP
-    from  basic_modules.transformer import DeformableTransformerDecoder
-    from  basic_modules.dn_compoments import get_contrastive_denoising_training_group
-
-
-def build_transformer(cfg, in_dims, num_classes, return_intermediate=False):
-    if cfg['transformer'] == 'rtdetr_transformer':
-        return RTDETRTransformer(in_dims             = in_dims,
-                                 hidden_dim          = cfg['hidden_dim'],
-                                 strides             = cfg['out_stride'],
-                                 num_classes         = num_classes,
-                                 num_queries         = cfg['num_queries'],
-                                 pos_embed_type      = 'sine',
-                                 num_heads           = cfg['de_num_heads'],
-                                 num_layers          = cfg['de_num_layers'],
-                                 num_levels          = len(cfg['out_stride']),
-                                 num_points          = cfg['de_num_points'],
-                                 ffn_dim           = cfg['de_ffn_dim'],
-                                 dropout             = cfg['de_dropout'],
-                                 act_type            = cfg['de_act'],
-                                 return_intermediate = return_intermediate,
-                                 num_denoising       = cfg['dn_num_denoising'],
-                                 label_noise_ratio   = cfg['dn_label_noise_ratio'],
-                                 box_noise_scale     = cfg['dn_box_noise_scale'],
-                                 learnt_init_query   = cfg['learnt_init_query'],
-                                 )
-
-
-# ----------------- Decoder for Detection task -----------------
-## RTDETR's Transformer for Detection task
-class RTDETRTransformer(nn.Module):
-    def __init__(self,
-                 # basic parameters
-                 in_dims        :List = [256, 512, 1024],
-                 hidden_dim     :int  = 256,
-                 strides        :List = [8, 16, 32],
-                 num_classes    :int  = 80,
-                 num_queries    :int  = 300,
-                 pos_embed_type :str  = 'sine',
-                 # transformer parameters
-                 num_heads      :int   = 8,
-                 num_layers     :int   = 1,
-                 num_levels     :int   = 3,
-                 num_points     :int   = 4,
-                 ffn_dim        :int   = 1024,
-                 dropout        :float = 0.1,
-                 act_type       :str   = "relu",
-                 return_intermediate :bool = False,
-                 # Denoising parameters
-                 num_denoising       :int  = 100,
-                 label_noise_ratio   :float = 0.5,
-                 box_noise_scale     :float = 1.0,
-                 learnt_init_query   :bool  = False,
-                 aux_loss            :bool  = True
-                 ):
-        super().__init__()
-        # --------------- Basic setting ---------------
-        ## Basic parameters
-        self.in_dims = in_dims
-        self.strides = strides
-        self.num_queries = num_queries
-        self.pos_embed_type = pos_embed_type
-        self.num_classes = num_classes
-        self.eps = 1e-2
-        self.aux_loss = aux_loss
-        ## Transformer parameters
-        self.num_heads  = num_heads
-        self.num_layers = num_layers
-        self.num_levels = num_levels
-        self.num_points = num_points
-        self.ffn_dim  = ffn_dim
-        self.dropout    = dropout
-        self.act_type   = act_type
-        self.return_intermediate = return_intermediate
-        ## Denoising parameters
-        self.num_denoising = num_denoising
-        self.label_noise_ratio = label_noise_ratio
-        self.box_noise_scale = box_noise_scale
-        self.learnt_init_query = learnt_init_query
-
-        # --------------- Network setting ---------------
-        ## Input proj layers
-        self.input_proj_layers = nn.ModuleList(
-            BasicConv(in_dims[i], hidden_dim, kernel_size=1, act_type=None, norm_type="BN")
-            for i in range(num_levels)
-        )
-
-        ## Deformable transformer decoder
-        self.decoder = DeformableTransformerDecoder(
-                                    d_model    = hidden_dim,
-                                    num_heads  = num_heads,
-                                    num_layers = num_layers,
-                                    num_levels = num_levels,
-                                    num_points = num_points,
-                                    ffn_dim  = ffn_dim,
-                                    dropout    = dropout,
-                                    act_type   = act_type,
-                                    return_intermediate = return_intermediate
-                                    )
-        
-        ## Detection head for Encoder
-        self.enc_output = nn.Sequential(
-            nn.Linear(hidden_dim, hidden_dim),
-            nn.LayerNorm(hidden_dim)
-            )
-        self.enc_class_head = nn.Linear(hidden_dim, num_classes)
-        self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3)
-
-        ## Detection head for Decoder
-        self.dec_class_head = nn.ModuleList([
-            nn.Linear(hidden_dim, num_classes)
-            for _ in range(num_layers)
-        ])
-        self.dec_bbox_head = nn.ModuleList([
-            MLP(hidden_dim, hidden_dim, 4, num_layers=3)
-            for _ in range(num_layers)
-        ])
-
-        ## Object query
-        if learnt_init_query:
-            self.tgt_embed = nn.Embedding(num_queries, hidden_dim)
-        self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, num_layers=2)
-
-        ## Denoising part
-        if num_denoising > 0: 
-            self.denoising_class_embed = nn.Embedding(num_classes+1, hidden_dim, padding_idx=num_classes)
-
-        self._reset_parameters()
-
-    def _reset_parameters(self):
-        # class and bbox head init
-        prior_prob = 0.01
-        cls_bias_init = float(-math.log((1 - prior_prob) / prior_prob))
-
-        nn.init.constant_(self.enc_class_head.bias, cls_bias_init)
-        nn.init.constant_(self.enc_bbox_head.layers[-1].weight, 0.)
-        nn.init.constant_(self.enc_bbox_head.layers[-1].bias, 0.)
-        for cls_, reg_ in zip(self.dec_class_head, self.dec_bbox_head):
-            nn.init.constant_(cls_.bias, cls_bias_init)
-            nn.init.constant_(reg_.layers[-1].weight, 0.)
-            nn.init.constant_(reg_.layers[-1].bias, 0.)
-
-        nn.init.xavier_uniform_(self.enc_output[0].weight)
-        if self.learnt_init_query:
-            nn.init.xavier_uniform_(self.tgt_embed.weight)
-        nn.init.xavier_uniform_(self.query_pos_head.layers[0].weight)
-        nn.init.xavier_uniform_(self.query_pos_head.layers[1].weight)
-
-    @torch.jit.unused
-    def _set_aux_loss(self, outputs_class, outputs_coord):
-        # this is a workaround to make torchscript happy, as torchscript
-        # doesn't support dictionary with non-homogeneous values, such
-        # as a dict having both a Tensor and a list.
-        return [{'pred_logits': a, 'pred_boxes': b}
-                for a, b in zip(outputs_class, outputs_coord)]
-
-    def generate_anchors(self, spatial_shapes, grid_size=0.05):
-        anchors = []
-        for lvl, (h, w) in enumerate(spatial_shapes):
-            grid_y, grid_x = torch.meshgrid(torch.arange(h), torch.arange(w))
-            # [H, W, 2]
-            grid_xy = torch.stack([grid_x, grid_y], dim=-1).float()
-
-            valid_WH = torch.as_tensor([w, h]).float()
-            grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH
-            wh = torch.ones_like(grid_xy) * grid_size * (2.0**lvl)
-            # [H, W, 4] -> [1, N, 4], N=HxW
-            anchors.append(torch.cat([grid_xy, wh], dim=-1).reshape(-1, h * w, 4))
-        # List[L, 1, N_i, 4] -> [1, N, 4], N=N_0 + N_1 + N_2 + ...
-        anchors = torch.cat(anchors, dim=1)
-        valid_mask = ((anchors > self.eps) * (anchors < 1 - self.eps)).all(-1, keepdim=True)
-        anchors = torch.log(anchors / (1 - anchors))
-        # Equal to operation: anchors = torch.masked_fill(anchors, ~valid_mask, torch.as_tensor(float("inf")))
-        anchors = torch.where(valid_mask, anchors, torch.inf)
-        
-        return anchors, valid_mask
-    
-    def get_encoder_input(self, feats):
-        # get projection features
-        proj_feats = [self.input_proj_layers[i](feat) for i, feat in enumerate(feats)]
-
-        # get encoder inputs
-        feat_flatten = []
-        spatial_shapes = []
-        level_start_index = [0, ]
-        for i, feat in enumerate(proj_feats):
-            _, _, h, w = feat.shape
-            spatial_shapes.append([h, w])
-            # [l], start index of each level
-            level_start_index.append(h * w + level_start_index[-1])
-            # [B, C, H, W] -> [B, N, C], N=HxW
-            feat_flatten.append(feat.flatten(2).permute(0, 2, 1).contiguous())
-
-        # [B, N, C], N = N_0 + N_1 + ...
-        feat_flatten = torch.cat(feat_flatten, dim=1)
-        level_start_index.pop()
-
-        return (feat_flatten, spatial_shapes, level_start_index)
-
-    def get_decoder_input(self,
-                          memory,
-                          spatial_shapes,
-                          denoising_class=None,
-                          denoising_bbox_unact=None):
-        bs, _, _ = memory.shape
-        # Prepare input for decoder
-        anchors, valid_mask = self.generate_anchors(spatial_shapes)
-        anchors = anchors.to(memory.device)
-        valid_mask = valid_mask.to(memory.device)
-        
-        # Process encoder's output
-        memory = torch.where(valid_mask, memory, torch.as_tensor(0., device=memory.device))
-        output_memory = self.enc_output(memory)
-
-        # Head for encoder's output : [bs, num_queries, c]
-        enc_outputs_class = self.enc_class_head(output_memory)
-        enc_outputs_coord_unact = self.enc_bbox_head(output_memory) + anchors
-
-        # Topk proposals from encoder's output
-        topk = self.num_queries
-        topk_ind = torch.topk(enc_outputs_class.max(-1)[0], topk, dim=1)[1]  # [bs, num_queries]
-        enc_topk_logits = torch.gather(
-            enc_outputs_class, 1, topk_ind.unsqueeze(-1).repeat(1, 1, self.num_classes))  # [bs, num_queries, nc]
-        reference_points_unact = torch.gather(
-            enc_outputs_coord_unact, 1, topk_ind.unsqueeze(-1).repeat(1, 1, 4))    # [bs, num_queries, 4]
-        enc_topk_bboxes = F.sigmoid(reference_points_unact)
-
-        if denoising_bbox_unact is not None:
-            reference_points_unact = torch.cat(
-                [denoising_bbox_unact, reference_points_unact], dim=1)
-
-        # Extract region features
-        if self.learnt_init_query:
-            # [num_queries, c] -> [b, num_queries, c]
-            target = self.tgt_embed.weight.unsqueeze(0).repeat(bs, 1, 1)
-        else:
-            # [num_queries, c] -> [b, num_queries, c]
-            target = torch.gather(output_memory, 1, topk_ind.unsqueeze(-1).repeat(1, 1, output_memory.shape[-1]))
-            target = target.detach()
-        
-        if denoising_class is not None:
-            target = torch.cat([denoising_class, target], dim=1)
-
-        return target, reference_points_unact.detach(), enc_topk_bboxes, enc_topk_logits
-    
-    def forward(self, feats, targets=None):
-        # input projection and embedding
-        memory, spatial_shapes, _ = self.get_encoder_input(feats)
-
-        # prepare denoising training
-        if self.training and self.num_denoising > 0:
-            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \
-                get_contrastive_denoising_training_group(targets, \
-                                                         self.num_classes, 
-                                                         self.num_queries, 
-                                                         self.denoising_class_embed, 
-                                                         num_denoising=self.num_denoising, 
-                                                         label_noise_ratio=self.label_noise_ratio, 
-                                                         box_noise_scale=self.box_noise_scale, )
-        else:
-            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None
-
-        target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \
-            self.get_decoder_input(
-            memory, spatial_shapes, denoising_class, denoising_bbox_unact)
-
-        # decoder
-        out_bboxes, out_logits = self.decoder(target,
-                                              init_ref_points_unact,
-                                              memory,
-                                              spatial_shapes,
-                                              self.dec_bbox_head,
-                                              self.dec_class_head,
-                                              self.query_pos_head,
-                                              attn_mask)
-
-        if self.training and dn_meta is not None:
-            dn_out_bboxes, out_bboxes = torch.split(out_bboxes, dn_meta['dn_num_split'], dim=2)
-            dn_out_logits, out_logits = torch.split(out_logits, dn_meta['dn_num_split'], dim=2)
-
-        out = {'pred_logits': out_logits[-1], 'pred_boxes': out_bboxes[-1]}
-
-        if self.training and self.aux_loss:
-            out['aux_outputs'] = self._set_aux_loss(out_logits[:-1], out_bboxes[:-1])
-            out['aux_outputs'].extend(self._set_aux_loss([enc_topk_logits], [enc_topk_bboxes]))
-            
-            if self.training and dn_meta is not None:
-                out['dn_aux_outputs'] = self._set_aux_loss(dn_out_logits, dn_out_bboxes)
-                out['dn_meta'] = dn_meta
-
-        return out
-
-
-# ----------------- Decoder for Segmentation task -----------------
-## RTDETR's Transformer for Segmentation task
-class SegTransformerDecoder(nn.Module):
-    def __init__(self, ):
-        super().__init__()
-        # TODO: design seg-decoder
-
-    def forward(self, x):
-        return
-
-
-# ----------------- Decoder for Pose estimation task -----------------
-## RTDETR's Transformer for Pose estimation task
-class PosTransformerDecoder(nn.Module):
-    def __init__(self, ):
-        super().__init__()
-        # TODO: design pose-decoder
-
-    def forward(self, x):
-        return
-
-
-if __name__ == '__main__':
-    import time
-    from thop import profile
-    cfg = {
-        'out_stride': [8, 16, 32],
-        # Transformer Decoder
-        'transformer': 'rtdetr_transformer',
-        'hidden_dim': 256,
-        'de_num_heads': 8,
-        'de_num_layers': 6,
-        'de_ffn_dim': 1024,
-        'de_dropout': 0.1,
-        'de_act': 'gelu',
-        'de_num_points': 4,
-        'num_queries': 300,
-        'learnt_init_query': False,
-        'pe_temperature': 10000.,
-        'dn_num_denoising': 100,
-        'dn_label_noise_ratio': 0.5,
-        'dn_box_noise_scale': 1,
-    }
-    bs = 1
-    hidden_dim = cfg['hidden_dim']
-    in_dims = [hidden_dim] * 3
-    targets = [{
-        'labels': torch.tensor([2, 4, 5, 8]).long(),
-        'boxes':  torch.tensor([[0, 0, 10, 10], [12, 23, 56, 70], [0, 10, 20, 30], [50, 60, 55, 150]]).float()
-    }] * bs
-    pyramid_feats = [torch.randn(bs, hidden_dim, 80, 80),
-                     torch.randn(bs, hidden_dim, 40, 40),
-                     torch.randn(bs, hidden_dim, 20, 20)]
-    model = build_transformer(cfg, in_dims, 80, True)
-    model.train()
-
-    t0 = time.time()
-    outputs = model(pyramid_feats, targets)
-    t1 = time.time()
-    print('Time: ', t1 - t0)
-
-    print(outputs["pred_logits"].shape)
-    print(outputs["pred_boxes"].shape)
-
-    print('==============================')
-    model.eval()
-    flops, params = profile(model, inputs=(pyramid_feats, ), verbose=False)
-    print('==============================')
-    print('GFLOPs : {:.2f}'.format(flops / 1e9 * 2))
-    print('Params : {:.2f} M'.format(params / 1e6))
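A condensed sketch of generate_anchors above for a single 2x2 level: cell centers in [0, 1], a base size that doubles per level (grid_size=0.05 at level 0), mapped to logit space so the decoder heads can refine them additively:

import torch

h = w = 2
grid_y, grid_x = torch.meshgrid(torch.arange(h), torch.arange(w), indexing='ij')
xy = (torch.stack([grid_x, grid_y], dim=-1).float() + 0.5) / torch.tensor([w, h]).float()
wh = torch.full_like(xy, 0.05 * 2 ** 0)                     # level-0 anchor size
anchors = torch.cat([xy, wh], dim=-1).reshape(-1, 4)        # [[0.25, 0.25, 0.05, 0.05], ...]
anchors_logit = torch.log(anchors / (1 - anchors))          # inverse-sigmoid space
print(anchors_logit.shape)  # torch.Size([4, 4])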

+ 0 - 86
models/detectors/rtdetr/rtdetr_encoder.py

@@ -1,86 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-try:
-    from .basic_modules.backbone import build_backbone
-    from .basic_modules.fpn      import build_fpn
-except:
-    from  basic_modules.backbone import build_backbone
-    from  basic_modules.fpn      import build_fpn
-
-
-# ----------------- Image Encoder -----------------
-def build_image_encoder(cfg):
-    return ImageEncoder(cfg)
-
-class ImageEncoder(nn.Module):
-    def __init__(self, cfg):
-        super().__init__()
-        # ---------------- Basic settings ----------------
-        ## Basic parameters
-        self.cfg = cfg
-        ## Network parameters
-        self.strides = cfg['out_stride']
-        self.hidden_dim = cfg['hidden_dim']
-        self.num_levels = len(self.strides)
-        
-        # ---------------- Network settings ----------------
-        ## Backbone Network
-        self.backbone, fpn_feat_dims = build_backbone(cfg, pretrained=cfg['pretrained'])
-
-        ## Feature Pyramid Network
-        self.fpn = build_fpn(cfg, fpn_feat_dims, self.hidden_dim)
-        self.fpn_dims = self.fpn.out_dims
-        
-    def forward(self, x):
-        pyramid_feats = self.backbone(x)
-        pyramid_feats = self.fpn(pyramid_feats)
-
-        return pyramid_feats
-
-
-if __name__ == '__main__':
-    import time
-    from thop import profile
-    cfg = {
-        'width': 1.0,
-        'depth': 1.0,
-        'out_stride': [8, 16, 32],
-        # Image Encoder - Backbone
-        'backbone': 'resnet18',
-        'backbone_norm': 'BN',
-        'res5_dilation': False,
-        'pretrained': True,
-        'pretrained_weight': 'imagenet1k_v1',
-        # Image Encoder - FPN
-        'fpn': 'hybrid_encoder',
-        'fpn_act': 'silu',
-        'fpn_norm': 'BN',
-        'fpn_depthwise': False,
-        'hidden_dim': 256,
-        'en_num_heads': 8,
-        'en_num_layers': 1,
-        'en_mlp_ratio': 4.0,
-        'en_dropout': 0.1,
-        'pe_temperature': 10000.,
-        'en_act': 'gelu',
-    }
-    x = torch.rand(2, 3, 640, 640)
-    model = build_image_encoder(cfg)
-    model.train()
-
-    t0 = time.time()
-    outputs = model(x)
-    t1 = time.time()
-    print('Time: ', t1 - t0)
-    for out in outputs:
-        print(out.shape)
-
-    print('==============================')
-    model.eval()
-    x = torch.rand(1, 3, 640, 640)
-    flops, params = profile(model, inputs=(x, ), verbose=False)
-    print('==============================')
-    print('GFLOPs : {:.2f}'.format(flops / 1e9 * 2))
-    print('Params : {:.2f} M'.format(params / 1e6))

+ 1 - 1
models/detectors/yolov8/yolov8_backbone.py

@@ -76,7 +76,7 @@ class Yolov8Backbone(nn.Module):
         c3 = self.layer_3(c2)
         c4 = self.layer_4(c3)
         c5 = self.layer_5(c4)
-
+        print(c3)
         outputs = [c3, c4, c5]
 
         return outputs

+ 1 - 8
train.sh

@@ -10,14 +10,7 @@ RESUME=$7
 # MODEL setting
 IMAGE_SIZE=640
 FIND_UNUSED_PARAMS=False
-if [[ $MODEL == *"rtdetr"* ]]; then
-    # Epoch setting
-    MAX_EPOCH=72
-    WP_EPOCH=-1
-    EVAL_EPOCH=1
-    NO_AUG_EPOCH=-1
-    FIND_UNUSED_PARAMS=True
-elif [[ $MODEL == *"yolov8"* ]]; then
+if [[ $MODEL == *"yolov8"* ]]; then
     # Epoch setting
     MAX_EPOCH=500
     WP_EPOCH=3