1 year ago · 67b8932c1f
--- a/config/__init__.py
+++ b/config/__init__.py
@@ -96,7 +96,6 @@ from .model_config.yolov8_config import yolov8_cfg
 
				 from .model_config.yolox_config  import yolox_cfg
			
 
				 ## Real-time DETR series
			
 
				 from .model_config.rtdetr_config import rtdetr_cfg
			
 
				-from .model_config.rtpdetr_config import rtpdetr_cfg
			
 
				 
			
 
				 def build_model_config(args):
			
 
				     print('==============================')
			
@@ -131,9 +130,6 @@ def build_model_config(args):
 
				     # RT-DETR
			
 
				     elif args.model in ['rtdetr_r18', 'rtdetr_r34', 'rtdetr_r50', 'rtdetr_r101']:
			
 
				         cfg = rtdetr_cfg[args.model]
			
 
				-    # RT-PlainDETR
			
 
				-    elif args.model in ['rtpdetr_r18', 'rtpdetr_r34', 'rtpdetr_r50', 'rtpdetr_r101']:
			
 
				-        cfg = rtpdetr_cfg[args.model]
			
 
				 
			
 
				     return cfg
			
 
				 
			
--- a/config/model_config/rtpdetr_config.py
+++ b/config/model_config/rtpdetr_config.py
@@ -1,57 +0,0 @@
 
				-# Real-time Transformer-based Object Detector
			
 
				-
			
 
				-
			
 
				-# ------------------- Det task --------------------
			
 
				-rtpdetr_cfg = {
			
 
				-    'rtpdetr_r50':{
			
 
				-        # ---------------- Model config ----------------
			
 
				-        ## Model scale
			
 
				-        'width': 1.0,
			
 
				-        'depth': 1.0,
			
 
				-        'max_stride': 32,
			
 
				-        'out_stride': 16,
			
 
				-        # Image Encoder - Backbone
			
 
				-        'backbone': 'resnet50',
			
 
				-        'backbone_norm': 'FrozeBN',
			
 
				-        'pretrained': True,
			
 
				-        'freeze_at': 0,
			
 
				-        'freeze_stem_only': False,
			
 
				-        'hidden_dim': 256,
			
 
				-        'en_num_heads': 8,
			
 
				-        'en_num_layers': 6,
			
 
				-        'en_ffn_dim': 2048,
			
 
				-        'en_dropout': 0.0,
			
 
				-        'en_act': 'gelu',
			
 
				-        # Transformer Decoder
			
 
				-        'transformer': 'plain_detr_transformer',
			
 
				-        'de_num_heads': 8,
			
 
				-        'de_num_layers': 6,
			
 
				-        'de_ffn_dim': 2048,
			
 
				-        'de_dropout': 0.0,
			
 
				-        'de_act': 'gelu',
			
 
				-        'de_pre_norm': True,
			
 
				-        'rpe_hidden_dim': 512,
			
 
				-        'use_checkpoint': False,
			
 
				-        'proposal_feature_levels': 3,
			
 
				-        'proposal_tgt_strides': [8, 16, 32],
			
 
				-        'num_queries_one2one': 300,
			
 
				-        'num_queries_one2many': 1500,
			
 
				-        # ---------------- Assignment config ----------------
			
 
				-        'matcher_hpy': {'cost_class': 2.0,
			
 
				-                        'cost_bbox': 1.0,
			
 
				-                        'cost_giou': 2.0,},
			
 
				-        # ---------------- Loss config ----------------
			
 
				-        'k_one2many': 6,
			
 
				-        'lambda_one2many': 1.0,
			
 
				-        'loss_coeff': {'class': 2,
			
 
				-                       'bbox': 1,
			
 
				-                       'giou': 2,},
			
 
				-        # ---------------- Train config ----------------
			
 
				-        ## input
			
 
				-        'multi_scale': [0.5, 1.25],   # 320 -> 800
			
 
				-        'trans_type': 'rtdetr_l',
			
 
				-        # ---------------- Train config ----------------
			
 
				-        'trainer_type': 'rtpdetr',
			
 
				-    },
			
 
				-
			
 
				-}
			
--- a/engine.py
+++ b/engine.py
@@ -1488,119 +1488,6 @@ class RTDetrTrainer(object):
 
				         
			
 
				         self.train_loader.dataset.transform = self.train_transform
			
 
				 
			
 
				-## Real-time PlainDETR Trainer
			
 
				-class RTPDetrTrainer(RTDetrTrainer):
			
 
				-    def __init__(self, args, data_cfg, model_cfg, trans_cfg, device, model, criterion, world_size):
			
 
				-        super().__init__(args, data_cfg, model_cfg, trans_cfg, device, model, criterion, world_size)
			
 
				-        # ------------------- Basic parameters -------------------
			
 
				-        ## Reset optimzier hyper-parameters
			
 
				-        self.optimizer_dict = {'optimizer': 'adamw', 'momentum': None, 'weight_decay': 0.0001, 'lr0': 0.0001, 'backbone_lr_ratio': 0.1}
			
 
				-        self.warmup_dict = {'warmup': 'linear', 'warmup_iters': 2000, 'warmup_factor': 0.00066667}
			
 
				-        self.lr_schedule_dict = {'lr_scheduler': 'step', 'lr_epoch': [self.args.max_epoch // 12 * 11]}
			
 
				-        self.normalize_bbox = False
			
 
				-
			
 
				-        # ---------------------------- Build Optimizer ----------------------------
			
 
				-        print("- Re-build oprimizer -")
			
 
				-        self.optimizer_dict['lr0'] *= self.args.batch_size / 16.  # auto lr scaling
			
 
				-        self.optimizer, self.start_epoch = build_rtdetr_optimizer(self.optimizer_dict, model, self.args.resume)
			
 
				-
			
 
				-        # ---------------------------- Build LR Scheduler ----------------------------
			
 
				-        print("- Re-build lr scheduler -")
			
 
				-        self.wp_lr_scheduler = build_wp_lr_scheduler(self.warmup_dict, self.optimizer_dict['lr0'])
			
 
				-        self.lr_scheduler    = build_lr_scheduler(self.lr_schedule_dict, self.optimizer, args.resume)
			
 
				-
			
 
				-    def train_one_epoch(self, model):
			
 
				-        metric_logger = MetricLogger(delimiter="  ")
			
 
				-        metric_logger.add_meter('lr', SmoothedValue(window_size=1, fmt='{value:.6f}'))
			
 
				-        metric_logger.add_meter('size', SmoothedValue(window_size=1, fmt='{value:d}'))
			
 
				-        metric_logger.add_meter('grad_norm', SmoothedValue(window_size=1, fmt='{value:.1f}'))
			
 
				-        header = 'Epoch: [{} / {}]'.format(self.epoch, self.args.max_epoch)
			
 
				-        epoch_size = len(self.train_loader)
			
 
				-        print_freq = 10
			
 
				-
			
 
				-        # basic parameters
			
 
				-        epoch_size = len(self.train_loader)
			
 
				-        img_size = self.args.img_size
			
 
				-        nw = self.warmup_dict['warmup_iters']
			
 
				-        lr_warmup_stage = True
			
 
				-
			
 
				-        # Train one epoch
			
 
				-        for iter_i, (images, targets) in enumerate(metric_logger.log_every(self.train_loader, print_freq, header)):
			
 
				-            ni = iter_i + self.epoch * epoch_size
			
 
				-            # WarmUp
			
 
				-            if ni < nw and lr_warmup_stage:
			
 
				-                self.wp_lr_scheduler(ni, self.optimizer)
			
 
				-            elif ni == nw and lr_warmup_stage:
			
 
				-                print('Warmup stage is over.')
			
 
				-                lr_warmup_stage = False
			
 
				-                self.wp_lr_scheduler.set_lr(self.optimizer, self.optimizer_dict['lr0'], self.optimizer_dict['lr0'])
			
 
				-                                            
			
 
				-            # To device
			
 
				-            images = images.to(self.device, non_blocking=True).float()
			
 
				-            for tgt in targets:
			
 
				-                tgt['boxes'] = tgt['boxes'].to(self.device)
			
 
				-                tgt['labels'] = tgt['labels'].to(self.device)
			
 
				-
			
 
				-            # Multi scale
			
 
				-            if self.args.multi_scale:
			
 
				-                images, targets, img_size = self.rescale_image_targets(
			
 
				-                    images, targets, self.model_cfg['max_stride'], self.args.min_box_size, self.model_cfg['multi_scale'])
			
 
				-            else:
			
 
				-                targets = self.refine_targets(img_size, targets, self.args.min_box_size)
			
 
				-
			
 
				-            # xyxy -> cxcywh
			
 
				-            targets = self.box_xyxy_to_cxcywh(targets)
			
 
				-                
			
 
				-            # Visualize train targets
			
 
				-            if self.args.vis_tgt:
			
 
				-                targets = self.box_cxcywh_to_xyxy(targets)
			
 
				-                vis_data(images, targets, pixel_mean=self.trans_cfg['pixel_mean'], pixel_std=self.trans_cfg['pixel_std'])
			
 
				-                targets = self.box_xyxy_to_cxcywh(targets)
			
 
				-
			
 
				-            # Inference
			
 
				-            with torch.cuda.amp.autocast(enabled=self.args.fp16):
			
 
				-                outputs = model(images)
			
 
				-                # Compute loss
			
 
				-                loss_dict = self.criterion(outputs, targets)
			
 
				-                losses = sum(loss_dict.values())
			
 
				-                # Grad Accumulate
			
 
				-                if self.grad_accumulate > 1:
			
 
				-                    losses /= self.grad_accumulate
			
 
				-
			
 
				-                loss_dict_reduced = distributed_utils.reduce_dict(loss_dict)
			
 
				-
			
 
				-            # Backward
			
 
				-            self.scaler.scale(losses).backward()
			
 
				-
			
 
				-            # Optimize
			
 
				-            if ni % self.grad_accumulate == 0:
			
 
				-                grad_norm = None
			
 
				-                if self.clip_grad > 0:
			
 
				-                    # unscale gradients
			
 
				-                    self.scaler.unscale_(self.optimizer)
			
 
				-                    # clip gradients
			
 
				-                    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=self.clip_grad)
			
 
				-                # optimizer.step
			
 
				-                self.scaler.step(self.optimizer)
			
 
				-                self.scaler.update()
			
 
				-                self.optimizer.zero_grad()
			
 
				-                # ema
			
 
				-                if self.model_ema is not None:
			
 
				-                    self.model_ema.update(model)
			
 
				-
			
 
				-            # Update log
			
 
				-            metric_logger.update(loss=losses.item(), **loss_dict_reduced)
			
 
				-            metric_logger.update(lr=self.optimizer.param_groups[0]["lr"])
			
 
				-            metric_logger.update(grad_norm=grad_norm)
			
 
				-            metric_logger.update(size=img_size)
			
 
				-
			
 
				-            if self.args.debug:
			
 
				-                print("For debug mode, we only train 1 iteration")
			
 
				-                break
			
 
				-
			
 
				-        # LR Schedule
			
 
				-        self.lr_scheduler.step()
			
 
				-        
			
 
				 
			
 
				 # Build Trainer
			
 
				 def build_trainer(args, data_cfg, model_cfg, trans_cfg, device, model, criterion, world_size):
			
@@ -1613,9 +1500,6 @@ def build_trainer(args, data_cfg, model_cfg, trans_cfg, device, model, criterion
 
				         return RTCTrainer(args, data_cfg, model_cfg, trans_cfg, device, model, criterion, world_size)
			
 
				     elif model_cfg['trainer_type'] == 'rtdetr':
			
 
				         return RTDetrTrainer(args, data_cfg, model_cfg, trans_cfg, device, model, criterion, world_size)
			
 
				-    elif model_cfg['trainer_type'] == 'rtpdetr':
			
 
				-        return RTPDetrTrainer(args, data_cfg, model_cfg, trans_cfg, device, model, criterion, world_size)
			
 
				-    
			
 
				     else:
			
 
				         raise NotImplementedError(model_cfg['trainer_type'])
			
 
				     
			
--- a/models/detectors/__init__.py
+++ b/models/detectors/__init__.py
@@ -13,7 +13,6 @@ from .yolov8.build import build_yolov8
 
				 from .yolox.build import build_yolox
			
 
				 # Real-time DETR series
			
 
				 from .rtdetr.build import build_rtdetr
			
 
				-from .rtpdetr.build import build_rtpdetr
			
 
				 
			
 
				 
			
 
				 # build object detector
			
@@ -67,10 +66,6 @@ def build_model(args,
 
				     elif args.model in ['rtdetr_r18', 'rtdetr_r34', 'rtdetr_r50', 'rtdetr_r101']:
			
 
				         model, criterion = build_rtdetr(
			
 
				             args, model_cfg, num_classes, trainable, deploy)
			
 
				-    # RT-PlainDETR
			
 
				-    elif args.model in ['rtpdetr_r18', 'rtpdetr_r34', 'rtpdetr_r50', 'rtpdetr_r101']:
			
 
				-        model, criterion = build_rtpdetr(
			
 
				-            args, model_cfg, num_classes, trainable, deploy)
			
 
				 
			
 
				     if trainable:
			
 
				         # Load pretrained weight
			
--- a/models/detectors/rtpdetr/basic_modules/backbone.py
+++ b/models/detectors/rtpdetr/basic_modules/backbone.py
@@ -1,145 +0,0 @@
 
				-import torch
			
 
				-import torchvision
			
 
				-from torch import nn
			
 
				-from torchvision.models._utils import IntermediateLayerGetter
			
 
				-
			
 
				-try:
			
 
				-    from .basic import FrozenBatchNorm2d
			
 
				-except:
			
 
				-    from basic  import FrozenBatchNorm2d
			
 
				-   
			
 
				-
			
 
				-# IN1K MIM pretrained weights (from SparK: https://github.com/keyu-tian/SparK)
			
 
				-pretrained_urls = {
			
 
				-    # ResNet series
			
 
				-    'resnet18':  None,
			
 
				-    'resnet34':  None,
			
 
				-    'resnet50':  "https://github.com/yjh0410/RT-ODLab/releases/download/backbone_weight/resnet50_in1k_spark_pretrained_timm_style.pth",
			
 
				-    'resnet101': None,
			
 
				-    # ShuffleNet series
			
 
				-}
			
 
				-
			
 
				-
			
 
				-# ----------------- Model functions -----------------
			
 
				-## Build backbone network
			
 
				-def build_backbone(cfg, pretrained=False):
			
 
				-    print('==============================')
			
 
				-    print('Backbone: {}'.format(cfg['backbone']))
			
 
				-    # ResNet
			
 
				-    if 'resnet' in cfg['backbone']:
			
 
				-        model, feats = build_resnet(cfg, pretrained)
			
 
				-    elif 'svnetv2' in cfg['backbone']:
			
 
				-        pretrained_weight = cfg['pretrained_weight'] if pretrained else None
			
 
				-        model, feats = build_scnetv2(cfg, pretrained_weight)
			
 
				-    else:
			
 
				-        raise NotImplementedError("Unknown backbone: <>.".format(cfg['backbone']))
			
 
				-    
			
 
				-    return model, feats
			
 
				-
			
 
				-
			
 
				-# ----------------- ResNet Backbone -----------------
			
 
				-class ResNet(nn.Module):
			
 
				-    """ResNet backbone with frozen BatchNorm."""
			
 
				-    def __init__(self,
			
 
				-                 name: str,
			
 
				-                 norm_type: str,
			
 
				-                 pretrained: bool = False,
			
 
				-                 freeze_at: int = -1,
			
 
				-                 freeze_stem_only: bool = False):
			
 
				-        super().__init__()
			
 
				-        # Pretrained
			
 
				-        # Norm layer
			
 
				-        if norm_type == 'BN':
			
 
				-            norm_layer = nn.BatchNorm2d
			
 
				-        elif norm_type == 'FrozeBN':
			
 
				-            norm_layer = FrozenBatchNorm2d
			
 
				-        # Backbone
			
 
				-        backbone = getattr(torchvision.models, name)(norm_layer=norm_layer,)
			
 
				-        return_layers = {"layer2": "0", "layer3": "1", "layer4": "2"}
			
 
				-        self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
			
 
				-        self.feat_dims = [128, 256, 512] if name in ('resnet18', 'resnet34') else [512, 1024, 2048]
			
 
				-        
			
 
				-        # Load pretrained
			
 
				-        if pretrained:
			
 
				-            self.load_pretrained(name)
			
 
				-
			
 
				-        # Freeze
			
 
				-        if freeze_at >= 0:
			
 
				-            for name, parameter in backbone.named_parameters():
			
 
				-                if freeze_stem_only:
			
 
				-                    if 'layer1' not in name and 'layer2' not in name and 'layer3' not in name and 'layer4' not in name:
			
 
				-                        parameter.requires_grad_(False)
			
 
				-                else:
			
 
				-                    if 'layer2' not in name and 'layer3' not in name and 'layer4' not in name:
			
 
				-                        parameter.requires_grad_(False)
			
 
				-
			
 
				-    def load_pretrained(self, name):
			
 
				-        url = pretrained_urls[name]
			
 
				-        if url is not None:
			
 
				-            print('Loading pretrained weight from : {}'.format(url))
			
 
				-            # checkpoint state dict
			
 
				-            checkpoint_state_dict = torch.hub.load_state_dict_from_url(
			
 
				-                url=url, map_location="cpu", check_hash=True)
			
 
				-            # model state dict
			
 
				-            model_state_dict = self.body.state_dict()
			
 
				-            # check
			
 
				-            for k in list(checkpoint_state_dict.keys()):
			
 
				-                if k in model_state_dict:
			
 
				-                    shape_model = tuple(model_state_dict[k].shape)
			
 
				-                    shape_checkpoint = tuple(checkpoint_state_dict[k].shape)
			
 
				-                    if shape_model != shape_checkpoint:
			
 
				-                        checkpoint_state_dict.pop(k)
			
 
				-                else:
			
 
				-                    checkpoint_state_dict.pop(k)
			
 
				-                    print('Unused key: ', k)
			
 
				-            # load the weight
			
 
				-            self.body.load_state_dict(checkpoint_state_dict)
			
 
				-        else:
			
 
				-            print('No backbone pretrained for {}.'.format(name))
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        xs = self.body(x)
			
 
				-        fmp_list = []
			
 
				-        for name, fmp in xs.items():
			
 
				-            fmp_list.append(fmp)
			
 
				-
			
 
				-        return fmp_list
			
 
				-
			
 
				-def build_resnet(cfg, pretrained=False):
			
 
				-    # ResNet series
			
 
				-    backbone = ResNet(cfg['backbone'],
			
 
				-                      cfg['backbone_norm'],
			
 
				-                      pretrained,
			
 
				-                      cfg['freeze_at'],
			
 
				-                      cfg['freeze_stem_only'])
			
 
				-
			
 
				-    return backbone, backbone.feat_dims
			
 
				-
			
 
				-
			
 
				-# ----------------- ShuffleNet Backbone -----------------
			
 
				-## TODO: Add shufflenet-v2
			
 
				-class ShuffleNetv2:
			
 
				-    pass
			
 
				-
			
 
				-def build_scnetv2(cfg, pretrained_weight=None):
			
 
				-    return
			
 
				-
			
 
				-
			
 
				-if __name__ == '__main__':
			
 
				-    cfg = {
			
 
				-        'backbone': 'resnet50',
			
 
				-        'backbone_norm': 'FrozeBN',
			
 
				-        'pretrained': True,
			
 
				-        'freeze_at': 0,
			
 
				-        'freeze_stem_only': False,
			
 
				-    }
			
 
				-    model, feat_dim = build_backbone(cfg, cfg['pretrained'])
			
 
				-    model.eval()
			
 
				-    print(feat_dim)
			
 
				-
			
 
				-    x = torch.ones(2, 3, 320, 320)
			
 
				-    output = model(x)
			
 
				-    for y in output:
			
 
				-        print(y.size())
			
 
				-    print(output[-1])
			
 
				-
			
--- a/models/detectors/rtpdetr/basic_modules/basic.py
+++ b/models/detectors/rtpdetr/basic_modules/basic.py
@@ -1,402 +0,0 @@
 
				-import math
			
 
				-import warnings
			
 
				-import numpy as np
			
 
				-import torch
			
 
				-import torch.nn as nn
			
 
				-
			
 
				-
			
 
				-def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
			
 
				-    """Copy from timm"""
			
 
				-    with torch.no_grad():
			
 
				-        """Copy from timm"""
			
 
				-        def norm_cdf(x):
			
 
				-            return (1. + math.erf(x / math.sqrt(2.))) / 2.
			
 
				-
			
 
				-        if (mean < a - 2 * std) or (mean > b + 2 * std):
			
 
				-            warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
			
 
				-                        "The distribution of values may be incorrect.",
			
 
				-                        stacklevel=2)
			
 
				-
			
 
				-        l = norm_cdf((a - mean) / std)
			
 
				-        u = norm_cdf((b - mean) / std)
			
 
				-
			
 
				-        tensor.uniform_(2 * l - 1, 2 * u - 1)
			
 
				-        tensor.erfinv_()
			
 
				-
			
 
				-        tensor.mul_(std * math.sqrt(2.))
			
 
				-        tensor.add_(mean)
			
 
				-
			
 
				-        tensor.clamp_(min=a, max=b)
			
 
				-
			
 
				-        return tensor
			
 
				-
			
 
				-def box_xyxy_to_cxcywh(x):
			
 
				-    x0, y0, x1, y1 = x.unbind(-1)
			
 
				-    b = [(x0 + x1) / 2, (y0 + y1) / 2, (x1 - x0), (y1 - y0)]
			
 
				-    
			
 
				-    return torch.stack(b, dim=-1)
			
 
				-
			
 
				-def delta2bbox(proposals,
			
 
				-               deltas,
			
 
				-               max_shape=None,
			
 
				-               wh_ratio_clip=16 / 1000,
			
 
				-               clip_border=True,
			
 
				-               add_ctr_clamp=False,
			
 
				-               ctr_clamp=32):
			
 
				-
			
 
				-    dxy = deltas[..., :2]
			
 
				-    dwh = deltas[..., 2:]
			
 
				-
			
 
				-    # Compute width/height of each roi
			
 
				-    pxy = proposals[..., :2]
			
 
				-    pwh = proposals[..., 2:]
			
 
				-
			
 
				-    dxy_wh = pwh * dxy
			
 
				-    wh_ratio_clip = torch.as_tensor(wh_ratio_clip)
			
 
				-    max_ratio = torch.abs(torch.log(wh_ratio_clip)).item()
			
 
				-    
			
 
				-    if add_ctr_clamp:
			
 
				-        dxy_wh = torch.clamp(dxy_wh, max=ctr_clamp, min=-ctr_clamp)
			
 
				-        dwh = torch.clamp(dwh, max=max_ratio)
			
 
				-    else:
			
 
				-        dwh = dwh.clamp(min=-max_ratio, max=max_ratio)
			
 
				-
			
 
				-    gxy = pxy + dxy_wh
			
 
				-    gwh = pwh * dwh.exp()
			
 
				-    x1y1 = gxy - (gwh * 0.5)
			
 
				-    x2y2 = gxy + (gwh * 0.5)
			
 
				-    bboxes = torch.cat([x1y1, x2y2], dim=-1)
			
 
				-    if clip_border and max_shape is not None:
			
 
				-        bboxes[..., 0::2].clamp_(min=0).clamp_(max=max_shape[1])
			
 
				-        bboxes[..., 1::2].clamp_(min=0).clamp_(max=max_shape[0])
			
 
				-
			
 
				-    return bboxes
			
 
				-
			
 
				-
			
 
				-# ---------------------------- NMS ----------------------------
			
 
				-## basic NMS
			
 
				-def nms(bboxes, scores, nms_thresh):
			
 
				-    """"Pure Python NMS."""
			
 
				-    x1 = bboxes[:, 0]  #xmin
			
 
				-    y1 = bboxes[:, 1]  #ymin
			
 
				-    x2 = bboxes[:, 2]  #xmax
			
 
				-    y2 = bboxes[:, 3]  #ymax
			
 
				-
			
 
				-    areas = (x2 - x1) * (y2 - y1)
			
 
				-    order = scores.argsort()[::-1]
			
 
				-
			
 
				-    keep = []
			
 
				-    while order.size > 0:
			
 
				-        i = order[0]
			
 
				-        keep.append(i)
			
 
				-        # compute iou
			
 
				-        xx1 = np.maximum(x1[i], x1[order[1:]])
			
 
				-        yy1 = np.maximum(y1[i], y1[order[1:]])
			
 
				-        xx2 = np.minimum(x2[i], x2[order[1:]])
			
 
				-        yy2 = np.minimum(y2[i], y2[order[1:]])
			
 
				-
			
 
				-        w = np.maximum(1e-10, xx2 - xx1)
			
 
				-        h = np.maximum(1e-10, yy2 - yy1)
			
 
				-        inter = w * h
			
 
				-
			
 
				-        iou = inter / (areas[i] + areas[order[1:]] - inter + 1e-14)
			
 
				-        #reserve all the boundingbox whose ovr less than thresh
			
 
				-        inds = np.where(iou <= nms_thresh)[0]
			
 
				-        order = order[inds + 1]
			
 
				-
			
 
				-    return keep
			
 
				-
			
 
				-## class-agnostic NMS 
			
 
				-def multiclass_nms_class_agnostic(scores, labels, bboxes, nms_thresh):
			
 
				-    # nms
			
 
				-    keep = nms(bboxes, scores, nms_thresh)
			
 
				-    scores = scores[keep]
			
 
				-    labels = labels[keep]
			
 
				-    bboxes = bboxes[keep]
			
 
				-
			
 
				-    return scores, labels, bboxes
			
 
				-
			
 
				-## class-aware NMS 
			
 
				-def multiclass_nms_class_aware(scores, labels, bboxes, nms_thresh, num_classes):
			
 
				-    # nms
			
 
				-    keep = np.zeros(len(bboxes), dtype=np.int32)
			
 
				-    for i in range(num_classes):
			
 
				-        inds = np.where(labels == i)[0]
			
 
				-        if len(inds) == 0:
			
 
				-            continue
			
 
				-        c_bboxes = bboxes[inds]
			
 
				-        c_scores = scores[inds]
			
 
				-        c_keep = nms(c_bboxes, c_scores, nms_thresh)
			
 
				-        keep[inds[c_keep]] = 1
			
 
				-    keep = np.where(keep > 0)
			
 
				-    scores = scores[keep]
			
 
				-    labels = labels[keep]
			
 
				-    bboxes = bboxes[keep]
			
 
				-
			
 
				-    return scores, labels, bboxes
			
 
				-
			
 
				-## multi-class NMS 
			
 
				-def multiclass_nms(scores, labels, bboxes, nms_thresh, num_classes, class_agnostic=False):
			
 
				-    if class_agnostic:
			
 
				-        return multiclass_nms_class_agnostic(scores, labels, bboxes, nms_thresh)
			
 
				-    else:
			
 
				-        return multiclass_nms_class_aware(scores, labels, bboxes, nms_thresh, num_classes)
			
 
				-
			
 
				-
			
 
				-# ----------------- Customed NormLayer Ops -----------------
			
 
				-class FrozenBatchNorm2d(torch.nn.Module):
			
 
				-    def __init__(self, n):
			
 
				-        super(FrozenBatchNorm2d, self).__init__()
			
 
				-        self.register_buffer("weight", torch.ones(n))
			
 
				-        self.register_buffer("bias", torch.zeros(n))
			
 
				-        self.register_buffer("running_mean", torch.zeros(n))
			
 
				-        self.register_buffer("running_var", torch.ones(n))
			
 
				-
			
 
				-    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
			
 
				-                              missing_keys, unexpected_keys, error_msgs):
			
 
				-        num_batches_tracked_key = prefix + 'num_batches_tracked'
			
 
				-        if num_batches_tracked_key in state_dict:
			
 
				-            del state_dict[num_batches_tracked_key]
			
 
				-
			
 
				-        super(FrozenBatchNorm2d, self)._load_from_state_dict(
			
 
				-            state_dict, prefix, local_metadata, strict,
			
 
				-            missing_keys, unexpected_keys, error_msgs)
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        # move reshapes to the beginning
			
 
				-        # to make it fuser-friendly
			
 
				-        w = self.weight.reshape(1, -1, 1, 1)
			
 
				-        b = self.bias.reshape(1, -1, 1, 1)
			
 
				-        rv = self.running_var.reshape(1, -1, 1, 1)
			
 
				-        rm = self.running_mean.reshape(1, -1, 1, 1)
			
 
				-        eps = 1e-5
			
 
				-        scale = w * (rv + eps).rsqrt()
			
 
				-        bias = b - rm * scale
			
 
				-        return x * scale + bias
			
 
				-
			
 
				-class LayerNorm2D(nn.Module):
			
 
				-    def __init__(self, normalized_shape, norm_layer=nn.LayerNorm):
			
 
				-        super().__init__()
			
 
				-        self.ln = norm_layer(normalized_shape) if norm_layer is not None else nn.Identity()
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        """
			
 
				-        x: N C H W
			
 
				-        """
			
 
				-        x = x.permute(0, 2, 3, 1)
			
 
				-        x = self.ln(x)
			
 
				-        x = x.permute(0, 3, 1, 2)
			
 
				-        return x
			
 
				-
			
 
				-
			
 
				-# ----------------- Basic CNN Ops -----------------
			
 
				-def get_conv2d(c1, c2, k, p, s, g, bias=False):
			
 
				-    conv = nn.Conv2d(c1, c2, k, stride=s, padding=p, groups=g, bias=bias)
			
 
				-
			
 
				-    return conv
			
 
				-
			
 
				-def get_activation(act_type=None):
			
 
				-    if act_type == 'relu':
			
 
				-        return nn.ReLU(inplace=True)
			
 
				-    elif act_type == 'lrelu':
			
 
				-        return nn.LeakyReLU(0.1, inplace=True)
			
 
				-    elif act_type == 'mish':
			
 
				-        return nn.Mish(inplace=True)
			
 
				-    elif act_type == 'silu':
			
 
				-        return nn.SiLU(inplace=True)
			
 
				-    elif act_type == 'gelu':
			
 
				-        return nn.GELU()
			
 
				-    elif act_type is None:
			
 
				-        return nn.Identity()
			
 
				-    else:
			
 
				-        raise NotImplementedError
			
 
				-        
			
 
				-def get_norm(norm_type, dim):
			
 
				-    if norm_type == 'BN':
			
 
				-        return nn.BatchNorm2d(dim)
			
 
				-    elif norm_type == 'GN':
			
 
				-        return nn.GroupNorm(num_groups=32, num_channels=dim)
			
 
				-    elif norm_type is None:
			
 
				-        return nn.Identity()
			
 
				-    else:
			
 
				-        raise NotImplementedError
			
 
				-
			
 
				-class BasicConv(nn.Module):
			
 
				-    def __init__(self, 
			
 
				-                 in_dim,                   # in channels
			
 
				-                 out_dim,                  # out channels 
			
 
				-                 kernel_size=1,            # kernel size 
			
 
				-                 padding=0,                # padding
			
 
				-                 stride=1,                 # padding
			
 
				-                 act_type  :str = 'lrelu', # activation
			
 
				-                 norm_type :str = 'BN',    # normalization
			
 
				-                ):
			
 
				-        super(BasicConv, self).__init__()
			
 
				-        add_bias = False if norm_type else True
			
 
				-        self.conv = get_conv2d(in_dim, out_dim, k=kernel_size, p=padding, s=stride, g=1, bias=add_bias)
			
 
				-        self.norm = get_norm(norm_type, out_dim)
			
 
				-        self.act  = get_activation(act_type)
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        return self.act(self.norm(self.conv(x)))
			
 
				-
			
 
				-class UpSampleWrapper(nn.Module):
			
 
				-    """Upsample last feat map to specific stride."""
			
 
				-    def __init__(self, in_dim, upsample_factor):
			
 
				-        super(UpSampleWrapper, self).__init__()
			
 
				-        # ---------- Basic parameters ----------
			
 
				-        self.upsample_factor = upsample_factor
			
 
				-
			
 
				-        # ---------- Network parameters ----------
			
 
				-        if upsample_factor == 1:
			
 
				-            self.upsample = nn.Identity()
			
 
				-        else:
			
 
				-            scale = int(math.log2(upsample_factor))
			
 
				-            dim = in_dim
			
 
				-            layers = []
			
 
				-            for _ in range(scale-1):
			
 
				-                layers += [
			
 
				-                    nn.ConvTranspose2d(dim, dim, kernel_size=2, stride=2),
			
 
				-                    LayerNorm2D(dim),
			
 
				-                    nn.GELU()
			
 
				-                ]
			
 
				-            layers += [nn.ConvTranspose2d(dim, dim, kernel_size=2, stride=2)]
			
 
				-            self.upsample = nn.Sequential(*layers)
			
 
				-            self.out_dim = dim
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        x = self.upsample(x)
			
 
				-
			
 
				-        return x
			
 
				-
			
 
				-
			
 
				-# ----------------- MLP modules -----------------
			
 
				-class MLP(nn.Module):
			
 
				-    def __init__(self, in_dim, hidden_dim, out_dim, num_layers):
			
 
				-        super().__init__()
			
 
				-        self.num_layers = num_layers
			
 
				-        h = [hidden_dim] * (num_layers - 1)
			
 
				-        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([in_dim] + h, h + [out_dim]))
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        for i, layer in enumerate(self.layers):
			
 
				-            x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
			
 
				-        return x
			
 
				-
			
 
				-class FFN(nn.Module):
			
 
				-    def __init__(self, d_model=256, ffn_dim=1024, dropout=0., act_type='relu', pre_norm=False):
			
 
				-        super().__init__()
			
 
				-        # ----------- Basic parameters -----------
			
 
				-        self.pre_norm = pre_norm
			
 
				-        self.ffn_dim = ffn_dim
			
 
				-        # ----------- Network parameters -----------
			
 
				-        self.linear1 = nn.Linear(d_model, self.ffn_dim)
			
 
				-        self.activation = get_activation(act_type)
			
 
				-        self.dropout2 = nn.Dropout(dropout)
			
 
				-        self.linear2 = nn.Linear(self.ffn_dim, d_model)
			
 
				-        self.dropout3 = nn.Dropout(dropout)
			
 
				-        self.norm = nn.LayerNorm(d_model)
			
 
				-
			
 
				-    def forward(self, src):
			
 
				-        if self.pre_norm:
			
 
				-            src = self.norm(src)
			
 
				-            src2 = self.linear2(self.dropout2(self.activation(self.linear1(src))))
			
 
				-            src = src + self.dropout3(src2)
			
 
				-        else:
			
 
				-            src2 = self.linear2(self.dropout2(self.activation(self.linear1(src))))
			
 
				-            src = src + self.dropout3(src2)
			
 
				-            src = self.norm(src)
			
 
				-        
			
 
				-        return src
			
 
				-    
			
 
				-
			
 
				-# ----------------- Attention Ops -----------------
			
 
				-class GlobalCrossAttention(nn.Module):
			
 
				-    def __init__(
			
 
				-        self,
			
 
				-        dim            :int   = 256,
			
 
				-        num_heads      :int   = 8,
			
 
				-        qkv_bias       :bool  = True,
			
 
				-        qk_scale       :float = None,
			
 
				-        attn_drop      :float = 0.0,
			
 
				-        proj_drop      :float = 0.0,
			
 
				-        rpe_hidden_dim :int   = 512,
			
 
				-        feature_stride :int   = 16,
			
 
				-    ):
			
 
				-        super().__init__()
			
 
				-        # --------- Basic parameters ---------
			
 
				-        self.dim = dim
			
 
				-        self.num_heads = num_heads
			
 
				-        head_dim = dim // num_heads
			
 
				-        self.scale = qk_scale or head_dim ** -0.5
			
 
				-        self.feature_stride = feature_stride
			
 
				-
			
 
				-        # --------- Network parameters ---------
			
 
				-        self.cpb_mlp1 = self.build_cpb_mlp(2, rpe_hidden_dim, num_heads)
			
 
				-        self.cpb_mlp2 = self.build_cpb_mlp(2, rpe_hidden_dim, num_heads)
			
 
				-        self.q = nn.Linear(dim, dim, bias=qkv_bias)
			
 
				-        self.k = nn.Linear(dim, dim, bias=qkv_bias)
			
 
				-        self.v = nn.Linear(dim, dim, bias=qkv_bias)
			
 
				-        self.attn_drop = nn.Dropout(attn_drop)
			
 
				-        self.proj = nn.Linear(dim, dim)
			
 
				-        self.proj_drop = nn.Dropout(proj_drop)
			
 
				-        self.softmax = nn.Softmax(dim=-1)
			
 
				-
			
 
				-    def build_cpb_mlp(self, in_dim, hidden_dim, out_dim):
			
 
				-        cpb_mlp = nn.Sequential(nn.Linear(in_dim, hidden_dim, bias=True),
			
 
				-                                nn.ReLU(inplace=True),
			
 
				-                                nn.Linear(hidden_dim, out_dim, bias=False))
			
 
				-        return cpb_mlp
			
 
				-
			
 
				-    def forward(
			
 
				-        self,
			
 
				-        query,
			
 
				-        reference_points,
			
 
				-        k_input_flatten,
			
 
				-        v_input_flatten,
			
 
				-        input_spatial_shapes,
			
 
				-        input_padding_mask=None,
			
 
				-    ):
			
 
				-        assert input_spatial_shapes.size(0) == 1, 'This is designed for single-scale decoder.'
			
 
				-        h, w = input_spatial_shapes[0]
			
 
				-        stride = self.feature_stride
			
 
				-
			
 
				-        ref_pts = torch.cat([
			
 
				-            reference_points[:, :, :, :2] - reference_points[:, :, :, 2:] / 2,
			
 
				-            reference_points[:, :, :, :2] + reference_points[:, :, :, 2:] / 2,
			
 
				-        ], dim=-1)  # B, nQ, 1, 4
			
 
				-
			
 
				-        pos_x = torch.linspace(0.5, w - 0.5, w, dtype=torch.float32, device=w.device)[None, None, :, None] * stride  # 1, 1, w, 1
			
 
				-        pos_y = torch.linspace(0.5, h - 0.5, h, dtype=torch.float32, device=h.device)[None, None, :, None] * stride  # 1, 1, h, 1
			
 
				-
			
 
				-        delta_x = ref_pts[..., 0::2] - pos_x  # B, nQ, w, 2
			
 
				-        delta_y = ref_pts[..., 1::2] - pos_y  # B, nQ, h, 2
			
 
				-
			
 
				-        rpe_x, rpe_y = self.cpb_mlp1(delta_x), self.cpb_mlp2(delta_y)  # B, nQ, w/h, nheads
			
 
				-        rpe = (rpe_x[:, :, None] + rpe_y[:, :, :, None]).flatten(2, 3) # B, nQ, h, w, nheads ->  B, nQ, h*w, nheads
			
 
				-        rpe = rpe.permute(0, 3, 1, 2)
			
 
				-
			
 
				-        B_, N, C = k_input_flatten.shape
			
 
				-        k = self.k(k_input_flatten).reshape(B_, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
			
 
				-        v = self.v(v_input_flatten).reshape(B_, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
			
 
				-        B_, N, C = query.shape
			
 
				-        q = self.q(query).reshape(B_, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
			
 
				-        q = q * self.scale
			
 
				-
			
 
				-        attn = q @ k.transpose(-2, -1)
			
 
				-        attn += rpe
			
 
				-        if input_padding_mask is not None:
			
 
				-            attn += input_padding_mask[:, None, None] * -100
			
 
				-
			
 
				-        fmin, fmax = torch.finfo(attn.dtype).min, torch.finfo(attn.dtype).max
			
 
				-        torch.clip_(attn, min=fmin, max=fmax)
			
 
				-
			
 
				-        attn = self.softmax(attn)
			
 
				-        attn = self.attn_drop(attn)
			
 
				-        x = attn @ v
			
 
				-
			
 
				-        x = x.transpose(1, 2).reshape(B_, N, C)
			
 
				-        x = self.proj(x)
			
 
				-        x = self.proj_drop(x)
			
 
				-
			
 
				-        return x
			
--- a/models/detectors/rtpdetr/basic_modules/transformer.py
+++ b/models/detectors/rtpdetr/basic_modules/transformer.py
@@ -1,447 +0,0 @@
 
				-import math
			
 
				-import copy
			
 
				-
			
 
				-import torch
			
 
				-import torch.nn as nn
			
 
				-import torch.nn.functional as F
			
 
				-import torch.utils.checkpoint as checkpoint
			
 
				-
			
 
				-try:
			
 
				-    from .basic import FFN, GlobalCrossAttention
			
 
				-    from .basic import trunc_normal_
			
 
				-except:
			
 
				-    from  basic import FFN, GlobalCrossAttention
			
 
				-    from  basic import trunc_normal_
			
 
				-
			
 
				-
			
 
				-def get_clones(module, N):
			
 
				-    if N <= 0:
			
 
				-        return None
			
 
				-    else:
			
 
				-        return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
			
 
				-
			
 
				-def inverse_sigmoid(x, eps=1e-5):
			
 
				-    x = x.clamp(min=0., max=1.)
			
 
				-    return torch.log(x.clamp(min=eps) / (1 - x).clamp(min=eps))
			
 
				-
			
 
				-
			
 
				-# ----------------- Transformer modules -----------------
			
 
				-## Transformer Encoder layer
			
 
				-class TransformerEncoderLayer(nn.Module):
			
 
				-    def __init__(self,
			
 
				-                 d_model         :int   = 256,
			
 
				-                 num_heads       :int   = 8,
			
 
				-                 ffn_dim         :int = 1024,
			
 
				-                 dropout         :float = 0.1,
			
 
				-                 act_type        :str   = "relu",
			
 
				-                 ):
			
 
				-        super().__init__()
			
 
				-        # ----------- Basic parameters -----------
			
 
				-        self.d_model = d_model
			
 
				-        self.num_heads = num_heads
			
 
				-        self.ffn_dim = ffn_dim
			
 
				-        self.dropout = dropout
			
 
				-        self.act_type = act_type
			
 
				-        # ----------- Basic parameters -----------
			
 
				-        # Multi-head Self-Attn
			
 
				-        self.self_attn = nn.MultiheadAttention(d_model, num_heads, dropout=dropout, batch_first=True)
			
 
				-        self.dropout = nn.Dropout(dropout)
			
 
				-        self.norm = nn.LayerNorm(d_model)
			
 
				-
			
 
				-        # Feedforwaed Network
			
 
				-        self.ffn = FFN(d_model, ffn_dim, dropout, act_type)
			
 
				-
			
 
				-    def with_pos_embed(self, tensor, pos):
			
 
				-        return tensor if pos is None else tensor + pos
			
 
				-
			
 
				-
			
 
				-    def forward(self, src, pos_embed):
			
 
				-        """
			
 
				-        Input:
			
 
				-            src:       [torch.Tensor] -> [B, N, C]
			
 
				-            pos_embed: [torch.Tensor] -> [B, N, C]
			
 
				-        Output:
			
 
				-            src:       [torch.Tensor] -> [B, N, C]
			
 
				-        """
			
 
				-        q = k = self.with_pos_embed(src, pos_embed)
			
 
				-
			
 
				-        # -------------- MHSA --------------
			
 
				-        src2 = self.self_attn(q, k, value=src)[0]
			
 
				-        src = src + self.dropout(src2)
			
 
				-        src = self.norm(src)
			
 
				-
			
 
				-        # -------------- FFN --------------
			
 
				-        src = self.ffn(src)
			
 
				-        
			
 
				-        return src
			
 
				-
			
 
				-## Transformer Encoder
			
 
				-class TransformerEncoder(nn.Module):
			
 
				-    def __init__(self,
			
 
				-                 d_model        :int   = 256,
			
 
				-                 num_heads      :int   = 8,
			
 
				-                 num_layers     :int   = 1,
			
 
				-                 ffn_dim        :int = 1024,
			
 
				-                 pe_temperature : float = 10000.,
			
 
				-                 dropout        :float = 0.1,
			
 
				-                 act_type       :str   = "relu",
			
 
				-                 ):
			
 
				-        super().__init__()
			
 
				-        # ----------- Basic parameters -----------
			
 
				-        self.d_model = d_model
			
 
				-        self.num_heads = num_heads
			
 
				-        self.num_layers = num_layers
			
 
				-        self.ffn_dim = ffn_dim
			
 
				-        self.dropout = dropout
			
 
				-        self.act_type = act_type
			
 
				-        self.pe_temperature = pe_temperature
			
 
				-        self.pos_embed = None
			
 
				-        # ----------- Basic parameters -----------
			
 
				-        self.encoder_layers = get_clones(
			
 
				-            TransformerEncoderLayer(d_model, num_heads, ffn_dim, dropout, act_type), num_layers)
			
 
				-
			
 
				-    def build_2d_sincos_position_embedding(self, device, w, h, embed_dim=256, temperature=10000.):
			
 
				-        assert embed_dim % 4 == 0, \
			
 
				-            'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
			
 
				-        
			
 
				-        # ----------- Check cahed pos_embed -----------
			
 
				-        if self.pos_embed is not None and \
			
 
				-            self.pos_embed.shape[2:] == [h, w]:
			
 
				-            return self.pos_embed
			
 
				-        
			
 
				-        # ----------- Generate grid coords -----------
			
 
				-        grid_w = torch.arange(int(w), dtype=torch.float32)
			
 
				-        grid_h = torch.arange(int(h), dtype=torch.float32)
			
 
				-        grid_w, grid_h = torch.meshgrid([grid_w, grid_h])  # shape: [H, W]
			
 
				-
			
 
				-        pos_dim = embed_dim // 4
			
 
				-        omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
			
 
				-        omega = 1. / (temperature**omega)
			
 
				-
			
 
				-        out_w = grid_w.flatten()[..., None] @ omega[None] # shape: [N, C]
			
 
				-        out_h = grid_h.flatten()[..., None] @ omega[None] # shape: [N, C]
			
 
				-
			
 
				-        # shape: [1, N, C]
			
 
				-        pos_embed = torch.cat([torch.sin(out_w), torch.cos(out_w), torch.sin(out_h),torch.cos(out_h)], dim=1)[None, :, :]
			
 
				-        pos_embed = pos_embed.to(device)
			
 
				-        self.pos_embed = pos_embed
			
 
				-
			
 
				-        return pos_embed
			
 
				-
			
 
				-    def forward(self, src):
			
 
				-        """
			
 
				-        Input:
			
 
				-            src:  [torch.Tensor] -> [B, C, H, W]
			
 
				-        Output:
			
 
				-            src:  [torch.Tensor] -> [B, C, H, W]
			
 
				-        """
			
 
				-        # -------- Transformer encoder --------
			
 
				-        channels, fmp_h, fmp_w = src.shape[1:]
			
 
				-        # [B, C, H, W] -> [B, N, C], N=HxW
			
 
				-        src_flatten = src.flatten(2).permute(0, 2, 1)
			
 
				-        memory = src_flatten
			
 
				-
			
 
				-        # PosEmbed: [1, N, C]
			
 
				-        pos_embed = self.build_2d_sincos_position_embedding(
			
 
				-            src.device, fmp_w, fmp_h, channels, self.pe_temperature)
			
 
				-        
			
 
				-        # Transformer Encoder layer
			
 
				-        for encoder in self.encoder_layers:
			
 
				-            memory = encoder(memory, pos_embed=pos_embed)
			
 
				-
			
 
				-        # Output: [B, N, C] -> [B, C, N] -> [B, C, H, W]
			
 
				-        src = memory.permute(0, 2, 1).reshape([-1, channels, fmp_h, fmp_w])
			
 
				-
			
 
				-        return src
			
 
				-
			
 
				-## PlainDETR's Decoder layer
			
 
				-class GlobalDecoderLayer(nn.Module):
			
 
				-    def __init__(self,
			
 
				-                 d_model    :int   = 256,
			
 
				-                 num_heads  :int   = 8,
			
 
				-                 ffn_dim    :int = 1024,
			
 
				-                 dropout    :float = 0.1,
			
 
				-                 act_type   :str   = "relu",
			
 
				-                 pre_norm   :bool  = False,
			
 
				-                 rpe_hidden_dim :int = 512,
			
 
				-                 feature_stride :int = 16,
			
 
				-                 ) -> None:
			
 
				-        super().__init__()
			
 
				-        # ------------ Basic parameters ------------
			
 
				-        self.d_model = d_model
			
 
				-        self.num_heads = num_heads
			
 
				-        self.rpe_hidden_dim = rpe_hidden_dim
			
 
				-        self.ffn_dim = ffn_dim
			
 
				-        self.act_type = act_type
			
 
				-        self.pre_norm = pre_norm
			
 
				-
			
 
				-        # ------------ Network parameters ------------
			
 
				-        ## Multi-head Self-Attn
			
 
				-        self.self_attn = nn.MultiheadAttention(d_model, num_heads, dropout=dropout)
			
 
				-        self.dropout1 = nn.Dropout(dropout)
			
 
				-        self.norm1 = nn.LayerNorm(d_model)
			
 
				-
			
 
				-        ## Box-reparam Global Cross-Attn
			
 
				-        self.cross_attn = GlobalCrossAttention(d_model, num_heads, rpe_hidden_dim=rpe_hidden_dim, feature_stride=feature_stride)
			
 
				-        self.dropout2 = nn.Dropout(dropout)
			
 
				-        self.norm2 = nn.LayerNorm(d_model)
			
 
				-
			
 
				-        ## FFN
			
 
				-        self.ffn = FFN(d_model, ffn_dim, dropout, act_type, pre_norm)
			
 
				-
			
 
				-    @staticmethod
			
 
				-    def with_pos_embed(tensor, pos):
			
 
				-        return tensor if pos is None else tensor + pos
			
 
				-
			
 
				-    def forward_pre_norm(self,
			
 
				-                         tgt,
			
 
				-                         query_pos,
			
 
				-                         reference_points,
			
 
				-                         src,
			
 
				-                         src_pos_embed,
			
 
				-                         src_spatial_shapes,
			
 
				-                         src_padding_mask=None,
			
 
				-                         self_attn_mask=None,
			
 
				-                         ):
			
 
				-        # ----------- Multi-head self attention -----------
			
 
				-        tgt1 = self.norm1(tgt)
			
 
				-        q = k = self.with_pos_embed(tgt1, query_pos)
			
 
				-        tgt1 = self.self_attn(q.transpose(0, 1),        # [B, N, C] -> [N, B, C], batch_first = False
			
 
				-                              k.transpose(0, 1),        # [B, N, C] -> [N, B, C], batch_first = False
			
 
				-                              tgt1.transpose(0, 1),     # [B, N, C] -> [N, B, C], batch_first = False
			
 
				-                              attn_mask=self_attn_mask,
			
 
				-                              )[0].transpose(0, 1)      # [N, B, C] -> [B, N, C]
			
 
				-        tgt = tgt + self.dropout1(tgt1)
			
 
				-
			
 
				-        # ----------- Global corss attention -----------
			
 
				-        tgt1 = self.norm2(tgt)
			
 
				-        tgt1 = self.cross_attn(self.with_pos_embed(tgt1, query_pos),
			
 
				-                               reference_points,
			
 
				-                               self.with_pos_embed(src, src_pos_embed),
			
 
				-                               src,
			
 
				-                               src_spatial_shapes,
			
 
				-                               src_padding_mask,
			
 
				-                               )
			
 
				-        tgt = tgt + self.dropout2(tgt1)
			
 
				-
			
 
				-        # ----------- FeedForward Network -----------
			
 
				-        tgt = self.ffn(tgt)
			
 
				-
			
 
				-        return tgt
			
 
				-
			
 
				-    def forward_post_norm(self,
			
 
				-                          tgt,
			
 
				-                          query_pos,
			
 
				-                          reference_points,
			
 
				-                          src,
			
 
				-                          src_pos_embed,
			
 
				-                          src_spatial_shapes,
			
 
				-                          src_padding_mask=None,
			
 
				-                          self_attn_mask=None,
			
 
				-                          ):
			
 
				-        # ----------- Multi-head self attention -----------
			
 
				-        q = k = self.with_pos_embed(tgt, query_pos)
			
 
				-        tgt1 = self.self_attn(q.transpose(0, 1),        # [B, N, C] -> [N, B, C], batch_first = False
			
 
				-                              k.transpose(0, 1),        # [B, N, C] -> [N, B, C], batch_first = False
			
 
				-                              tgt.transpose(0, 1),     # [B, N, C] -> [N, B, C], batch_first = False
			
 
				-                              attn_mask=self_attn_mask,
			
 
				-                              )[0].transpose(0, 1)      # [N, B, C] -> [B, N, C]
			
 
				-        tgt = tgt + self.dropout1(tgt1)
			
 
				-        tgt = self.norm1(tgt)
			
 
				-
			
 
				-        # ----------- Global corss attention -----------
			
 
				-        tgt1 = self.cross_attn(self.with_pos_embed(tgt, query_pos),
			
 
				-                               reference_points,
			
 
				-                               self.with_pos_embed(src, src_pos_embed),
			
 
				-                               src,
			
 
				-                               src_spatial_shapes,
			
 
				-                               src_padding_mask,
			
 
				-                               )
			
 
				-        tgt = tgt + self.dropout2(tgt1)
			
 
				-        tgt = self.norm2(tgt)
			
 
				-
			
 
				-        # ----------- FeedForward Network -----------
			
 
				-        tgt = self.ffn(tgt)
			
 
				-
			
 
				-        return tgt
			
 
				-
			
 
				-    def forward(self,
			
 
				-                tgt,
			
 
				-                query_pos,
			
 
				-                reference_points,
			
 
				-                src,
			
 
				-                src_pos_embed,
			
 
				-                src_spatial_shapes,
			
 
				-                src_padding_mask=None,
			
 
				-                self_attn_mask=None,
			
 
				-                ):
			
 
				-        if self.pre_norm:
			
 
				-            return self.forward_pre_norm(tgt, query_pos, reference_points, src, src_pos_embed, src_spatial_shapes,
			
 
				-                                         src_padding_mask, self_attn_mask)
			
 
				-        else:
			
 
				-            return self.forward_post_norm(tgt, query_pos, reference_points, src, src_pos_embed, src_spatial_shapes,
			
 
				-                                          src_padding_mask, self_attn_mask)
			
 
				-
			
 
				-## PlainDETR's Decoder
			
 
				-class GlobalDecoder(nn.Module):
			
 
				-    def __init__(self,
			
 
				-                 # Decoder layer params
			
 
				-                 d_model    :int   = 256,
			
 
				-                 num_heads  :int   = 8,
			
 
				-                 ffn_dim    :int = 1024,
			
 
				-                 dropout    :float = 0.1,
			
 
				-                 act_type   :str   = "relu",
			
 
				-                 pre_norm   :bool  = False,
			
 
				-                 rpe_hidden_dim :int = 512,
			
 
				-                 feature_stride :int = 16,
			
 
				-                 num_layers     :int = 6,
			
 
				-                 # Decoder params
			
 
				-                 return_intermediate :bool = False,
			
 
				-                 use_checkpoint      :bool = False,
			
 
				-                 ):
			
 
				-        super().__init__()
			
 
				-        # ------------ Basic parameters ------------
			
 
				-        self.d_model = d_model
			
 
				-        self.num_heads = num_heads
			
 
				-        self.rpe_hidden_dim = rpe_hidden_dim
			
 
				-        self.ffn_dim = ffn_dim
			
 
				-        self.act_type = act_type
			
 
				-        self.num_layers = num_layers
			
 
				-        self.return_intermediate = return_intermediate
			
 
				-        self.use_checkpoint = use_checkpoint
			
 
				-
			
 
				-        # ------------ Network parameters ------------
			
 
				-        decoder_layer = GlobalDecoderLayer(
			
 
				-            d_model, num_heads, ffn_dim, dropout, act_type, pre_norm, rpe_hidden_dim, feature_stride,)
			
 
				-        self.layers = get_clones(decoder_layer, num_layers)
			
 
				-        self.bbox_embed = None
			
 
				-        self.class_embed = None
			
 
				-
			
 
				-        if pre_norm:
			
 
				-            self.final_layer_norm = nn.LayerNorm(d_model)
			
 
				-        else:
			
 
				-            self.final_layer_norm = None
			
 
				-
			
 
				-    def _reset_parameters(self):            
			
 
				-        # stolen from Swin Transformer
			
 
				-        def _init_weights(m):
			
 
				-            if isinstance(m, nn.Linear):
			
 
				-                trunc_normal_(m.weight, std=0.02)
			
 
				-                if isinstance(m, nn.Linear) and m.bias is not None:
			
 
				-                    nn.init.constant_(m.bias, 0)
			
 
				-            elif isinstance(m, nn.LayerNorm):
			
 
				-                nn.init.constant_(m.bias, 0)
			
 
				-                nn.init.constant_(m.weight, 1.0)
			
 
				-
			
 
				-        self.apply(_init_weights)
			
 
				-
			
 
				-    def inverse_sigmoid(self, x, eps=1e-5):
			
 
				-        x = x.clamp(min=0, max=1)
			
 
				-        x1 = x.clamp(min=eps)
			
 
				-        x2 = (1 - x).clamp(min=eps)
			
 
				-
			
 
				-        return torch.log(x1 / x2)
			
 
				-
			
 
				-    def box_xyxy_to_cxcywh(self, x):
			
 
				-        x0, y0, x1, y1 = x.unbind(-1)
			
 
				-        b = [(x0 + x1) / 2, (y0 + y1) / 2, (x1 - x0), (y1 - y0)]
			
 
				-        
			
 
				-        return torch.stack(b, dim=-1)
			
 
				-
			
 
				-    def delta2bbox(self, proposals,
			
 
				-                   deltas,
			
 
				-                   max_shape=None,
			
 
				-                   wh_ratio_clip=16 / 1000,
			
 
				-                   clip_border=True,
			
 
				-                   add_ctr_clamp=False,
			
 
				-                   ctr_clamp=32):
			
 
				-
			
 
				-        dxy = deltas[..., :2]
			
 
				-        dwh = deltas[..., 2:]
			
 
				-
			
 
				-        # Compute width/height of each roi
			
 
				-        pxy = proposals[..., :2]
			
 
				-        pwh = proposals[..., 2:]
			
 
				-
			
 
				-        dxy_wh = pwh * dxy
			
 
				-        wh_ratio_clip = torch.as_tensor(wh_ratio_clip)
			
 
				-        max_ratio = torch.abs(torch.log(wh_ratio_clip)).item()
			
 
				-        
			
 
				-        if add_ctr_clamp:
			
 
				-            dxy_wh = torch.clamp(dxy_wh, max=ctr_clamp, min=-ctr_clamp)
			
 
				-            dwh = torch.clamp(dwh, max=max_ratio)
			
 
				-        else:
			
 
				-            dwh = dwh.clamp(min=-max_ratio, max=max_ratio)
			
 
				-
			
 
				-        gxy = pxy + dxy_wh
			
 
				-        gwh = pwh * dwh.exp()
			
 
				-        x1y1 = gxy - (gwh * 0.5)
			
 
				-        x2y2 = gxy + (gwh * 0.5)
			
 
				-        bboxes = torch.cat([x1y1, x2y2], dim=-1)
			
 
				-        if clip_border and max_shape is not None:
			
 
				-            bboxes[..., 0::2].clamp_(min=0).clamp_(max=max_shape[1])
			
 
				-            bboxes[..., 1::2].clamp_(min=0).clamp_(max=max_shape[0])
			
 
				-
			
 
				-        return bboxes
			
 
				-
			
 
				-    def forward(self,
			
 
				-                tgt,
			
 
				-                reference_points,
			
 
				-                src,
			
 
				-                src_pos_embed,
			
 
				-                src_spatial_shapes,
			
 
				-                query_pos=None,
			
 
				-                src_padding_mask=None,
			
 
				-                self_attn_mask=None,
			
 
				-                max_shape=None,
			
 
				-                ):
			
 
				-        output = tgt
			
 
				-
			
 
				-        intermediate = []
			
 
				-        intermediate_reference_points = []
			
 
				-        for lid, layer in enumerate(self.layers):
			
 
				-            reference_points_input = reference_points[:, :, None]
			
 
				-            if self.use_checkpoint:
			
 
				-                output = checkpoint.checkpoint(
			
 
				-                    layer,
			
 
				-                    output,
			
 
				-                    query_pos,
			
 
				-                    reference_points_input,
			
 
				-                    src,
			
 
				-                    src_pos_embed,
			
 
				-                    src_spatial_shapes,
			
 
				-                    src_padding_mask,
			
 
				-                    self_attn_mask,
			
 
				-                )
			
 
				-            else:
			
 
				-                output = layer(
			
 
				-                    output,
			
 
				-                    query_pos,
			
 
				-                    reference_points_input,
			
 
				-                    src,
			
 
				-                    src_pos_embed,
			
 
				-                    src_spatial_shapes,
			
 
				-                    src_padding_mask,
			
 
				-                    self_attn_mask,
			
 
				-                )
			
 
				-
			
 
				-            if self.final_layer_norm is not None:
			
 
				-                output_after_norm = self.final_layer_norm(output)
			
 
				-            else:
			
 
				-                output_after_norm = output
			
 
				-
			
 
				-            # hack implementation for iterative bounding box refinement
			
 
				-            if self.bbox_embed is not None:
			
 
				-                tmp = self.bbox_embed[lid](output_after_norm)
			
 
				-                new_reference_points = self.box_xyxy_to_cxcywh(
			
 
				-                    self.delta2bbox(reference_points, tmp, max_shape)) 
			
 
				-                reference_points = new_reference_points.detach()
			
 
				-
			
 
				-            if self.return_intermediate:
			
 
				-                intermediate.append(output_after_norm)
			
 
				-                intermediate_reference_points.append(new_reference_points)
			
 
				-
			
 
				-        if self.return_intermediate:
			
 
				-            return torch.stack(intermediate), torch.stack(intermediate_reference_points)
			
 
				-
			
 
				-        return output_after_norm, reference_points
			
--- a/models/detectors/rtpdetr/build.py
+++ b/models/detectors/rtpdetr/build.py
@@ -1,36 +0,0 @@
 
				-#!/usr/bin/env python3
			
 
				-# -*- coding:utf-8 -*-
			
 
				-
			
 
				-import torch
			
 
				-import torch.nn as nn
			
 
				-
			
 
				-from .loss import build_criterion
			
 
				-from .rtpdetr import RT_PDETR
			
 
				-
			
 
				-
			
 
				-# build object detector
			
 
				-def build_rtpdetr(args, cfg, num_classes=80, trainable=False, deploy=False):
			
 
				-    print('==============================')
			
 
				-    print('Build {} ...'.format(args.model.upper()))
			
 
				-    
			
 
				-    print('==============================')
			
 
				-    print('Model Configuration: \n', cfg)
			
 
				-    
			
 
				-    # -------------- Build RT-DETR --------------
			
 
				-    model = RT_PDETR(cfg             = cfg,
			
 
				-                     num_classes     = num_classes,
			
 
				-                     conf_thresh     = args.conf_thresh,
			
 
				-                     topk            = 300,
			
 
				-                     deploy          = deploy,
			
 
				-                     no_multi_labels = args.no_multi_labels,
			
 
				-                     use_nms         = True,   # NMS is beneficial 
			
 
				-                     nms_class_agnostic = args.nms_class_agnostic
			
 
				-                     )
			
 
				-            
			
 
				-    # -------------- Build criterion --------------
			
 
				-    criterion = None
			
 
				-    if trainable:
			
 
				-        # build criterion for training
			
 
				-        criterion = build_criterion(cfg, num_classes, aux_loss=True)
			
 
				-        
			
 
				-    return model, criterion
			
--- a/models/detectors/rtpdetr/loss.py
+++ b/models/detectors/rtpdetr/loss.py
@@ -1,214 +0,0 @@
 
				-import copy
			
 
				-import torch
			
 
				-import torch.nn as nn
			
 
				-import torch.nn.functional as F
			
 
				-
			
 
				-try:
			
 
				-    from .loss_utils import sigmoid_focal_loss
			
 
				-    from .loss_utils import box_cxcywh_to_xyxy, box_xyxy_to_cxcywh, generalized_box_iou, bbox2delta
			
 
				-    from .loss_utils import is_dist_avail_and_initialized, get_world_size
			
 
				-    from .matcher import HungarianMatcher
			
 
				-except:
			
 
				-    from loss_utils import sigmoid_focal_loss
			
 
				-    from loss_utils import box_cxcywh_to_xyxy, box_xyxy_to_cxcywh, generalized_box_iou, bbox2delta
			
 
				-    from loss_utils import is_dist_avail_and_initialized, get_world_size
			
 
				-    from matcher import HungarianMatcher
			
 
				-
			
 
				-
			
 
				-class Criterion(nn.Module):
			
 
				-    """ This class computes the loss for DETR.
			
 
				-    The process happens in two steps:
			
 
				-        1) we compute hungarian assignment between ground truth boxes and the outputs of the model
			
 
				-        2) we supervise each pair of matched ground-truth / prediction (supervise class and box)
			
 
				-    """
			
 
				-    def __init__(self, cfg, num_classes=80, aux_loss=False):
			
 
				-        super().__init__()
			
 
				-        # ------------ Basic parameters ------------
			
 
				-        self.cfg = cfg
			
 
				-        self.num_classes = num_classes
			
 
				-        self.k_one2many = cfg['k_one2many']
			
 
				-        self.lambda_one2many = cfg['lambda_one2many']
			
 
				-        self.aux_loss = aux_loss
			
 
				-        self.losses = ['labels', 'boxes']
			
 
				-        # ------------- Focal loss -------------
			
 
				-        self.alpha = 0.25
			
 
				-        self.gamma = 2.0
			
 
				-        # ------------ Matcher ------------
			
 
				-        self.matcher = HungarianMatcher(cost_class = cfg['matcher_hpy']['cost_class'],
			
 
				-                                        cost_bbox  = cfg['matcher_hpy']['cost_bbox'],
			
 
				-                                        cost_giou  = cfg['matcher_hpy']['cost_giou']
			
 
				-                                        )
			
 
				-        # ------------- Loss weight -------------
			
 
				-        self.weight_dict = {'loss_cls':  cfg['loss_coeff']['class'],
			
 
				-                            'loss_box':  cfg['loss_coeff']['bbox'],
			
 
				-                            'loss_giou': cfg['loss_coeff']['giou']}
			
 
				-
			
 
				-    def _get_src_permutation_idx(self, indices):
			
 
				-        # permute predictions following indices
			
 
				-        batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
			
 
				-        src_idx = torch.cat([src for (src, _) in indices])
			
 
				-        return batch_idx, src_idx
			
 
				-
			
 
				-    def _get_tgt_permutation_idx(self, indices):
			
 
				-        # permute targets following indices
			
 
				-        batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
			
 
				-        tgt_idx = torch.cat([tgt for (_, tgt) in indices])
			
 
				-        return batch_idx, tgt_idx
			
 
				-
			
 
				-    def loss_labels(self, outputs, targets, indices, num_boxes):
			
 
				-        """Classification loss (NLL)
			
 
				-        targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes]
			
 
				-        """
			
 
				-        assert 'pred_logits' in outputs
			
 
				-        src_logits = outputs['pred_logits']
			
 
				-        # prepare class targets
			
 
				-        idx = self._get_src_permutation_idx(indices)
			
 
				-        target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]).to(src_logits.device)
			
 
				-        target_classes = torch.full(src_logits.shape[:2],
			
 
				-                                    self.num_classes,
			
 
				-                                    dtype=torch.int64,
			
 
				-                                    device=src_logits.device)
			
 
				-        target_classes[idx] = target_classes_o
			
 
				-
			
 
				-        # to one-hot labels
			
 
				-        target_classes_onehot = torch.zeros([*src_logits.shape[:2], self.num_classes + 1],
			
 
				-                                            dtype=src_logits.dtype,
			
 
				-                                            layout=src_logits.layout,
			
 
				-                                            device=src_logits.device)
			
 
				-        target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1)
			
 
				-        target_classes_onehot = target_classes_onehot[..., :-1]
			
 
				-
			
 
				-        # focal loss
			
 
				-        loss_cls = sigmoid_focal_loss(src_logits, target_classes_onehot, num_boxes, self.alpha, self.gamma)
			
 
				-
			
 
				-        losses = {}
			
 
				-        losses['loss_cls'] = loss_cls * src_logits.shape[1]
			
 
				-
			
 
				-        return losses
			
 
				-
			
 
				-    def loss_boxes(self, outputs, targets, indices, num_boxes):
			
 
				-        """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss
			
 
				-           targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
			
 
				-           The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
			
 
				-        """
			
 
				-        assert 'pred_boxes' in outputs
			
 
				-        # prepare bbox targets
			
 
				-        idx = self._get_src_permutation_idx(indices)
			
 
				-        src_boxes = outputs['pred_boxes'][idx]
			
 
				-        target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0).to(src_boxes.device)
			
 
				-        
			
 
				-        # compute L1 loss
			
 
				-        loss_bbox = F.l1_loss(src_boxes, box_xyxy_to_cxcywh(target_boxes), reduction='none')
			
 
				-        src_deltas = outputs["pred_deltas"][idx]
			
 
				-        src_boxes_old = outputs["pred_boxes_old"][idx]
			
 
				-        target_deltas = bbox2delta(src_boxes_old, target_boxes)
			
 
				-        loss_bbox = F.l1_loss(src_deltas, target_deltas, reduction="none")
			
 
				-
			
 
				-        # compute GIoU loss
			
 
				-        bbox_giou = generalized_box_iou(box_cxcywh_to_xyxy(src_boxes),
			
 
				-                                        box_cxcywh_to_xyxy(target_boxes))
			
 
				-        loss_giou = 1 - torch.diag(bbox_giou)
			
 
				-        
			
 
				-        losses = {}
			
 
				-        losses['loss_box'] = loss_bbox.sum() / num_boxes
			
 
				-        losses['loss_giou'] = loss_giou.sum() / num_boxes
			
 
				-
			
 
				-        return losses
			
 
				-
			
 
				-    def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs):
			
 
				-        loss_map = {
			
 
				-            'labels': self.loss_labels,
			
 
				-            'boxes': self.loss_boxes,
			
 
				-        }
			
 
				-        assert loss in loss_map, f'do you really want to compute {loss} loss?'
			
 
				-        return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs)
			
 
				-
			
 
				-    def compute_loss(self, outputs, targets):
			
 
				-        """ This performs the loss computation.
			
 
				-        Parameters:
			
 
				-             outputs: dict of tensors, see the output specification of the model for the format
			
 
				-             targets: list of dicts, such that len(targets) == batch_size.
			
 
				-                      The expected keys in each dict depends on the losses applied, see each loss' doc
			
 
				-        """
			
 
				-        outputs_without_aux = {
			
 
				-            k: v
			
 
				-            for k, v in outputs.items()
			
 
				-            if k != "aux_outputs" and k != "enc_outputs"
			
 
				-        }
			
 
				-
			
 
				-        # Retrieve the matching between the outputs of the last layer and the targets
			
 
				-        indices = self.matcher(outputs_without_aux, targets)
			
 
				-
			
 
				-        # Compute the average number of target boxes accross all nodes, for normalization purposes
			
 
				-        num_boxes = sum(len(t["labels"]) for t in targets)
			
 
				-        num_boxes = torch.as_tensor(
			
 
				-            [num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device
			
 
				-        )
			
 
				-        if is_dist_avail_and_initialized():
			
 
				-            torch.distributed.all_reduce(num_boxes)
			
 
				-        num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item()
			
 
				-
			
 
				-        # Compute all the requested losses
			
 
				-        losses = {}
			
 
				-        for loss in self.losses:
			
 
				-            kwargs = {}
			
 
				-            l_dict = self.get_loss(loss, outputs, targets, indices, num_boxes, **kwargs)
			
 
				-            l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
			
 
				-            losses.update(l_dict)
			
 
				-
			
 
				-        # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
			
 
				-        if "aux_outputs" in outputs:
			
 
				-            for i, aux_outputs in enumerate(outputs["aux_outputs"]):
			
 
				-                indices = self.matcher(aux_outputs, targets)
			
 
				-                for loss in self.losses:
			
 
				-                    kwargs = {}
			
 
				-                    l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_boxes, **kwargs)
			
 
				-                    l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
			
 
				-                    l_dict = {k + f"_{i}": v for k, v in l_dict.items()}
			
 
				-                    losses.update(l_dict)
			
 
				-
			
 
				-        if "enc_outputs" in outputs:
			
 
				-            enc_outputs = outputs["enc_outputs"]
			
 
				-            bin_targets = copy.deepcopy(targets)
			
 
				-            for bt in bin_targets:
			
 
				-                bt["labels"] = torch.zeros_like(bt["labels"])
			
 
				-            indices = self.matcher(enc_outputs, bin_targets)
			
 
				-            for loss in self.losses:
			
 
				-                kwargs = {}
			
 
				-                l_dict = self.get_loss(loss, enc_outputs, bin_targets, indices, num_boxes, **kwargs)
			
 
				-                l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
			
 
				-                l_dict = {k + "_enc": v for k, v in l_dict.items()}
			
 
				-                losses.update(l_dict)
			
 
				-
			
 
				-        return losses
			
 
				-
			
 
				-    def forward(self, outputs, targets):
			
 
				-        # --------------------- One-to-one losses ---------------------
			
 
				-        outputs_one2one = {k: v for k, v in outputs.items() if "one2many" not in k}
			
 
				-        loss_dict = self.compute_loss(outputs_one2one, targets)
			
 
				-
			
 
				-        # --------------------- One-to-many losses ---------------------
			
 
				-        outputs_one2many = {k[:-9]: v for k, v in outputs.items() if "one2many" in k}
			
 
				-        if len(outputs_one2many) > 0:
			
 
				-            # Copy targets
			
 
				-            multi_targets = copy.deepcopy(targets)
			
 
				-            for target in multi_targets:
			
 
				-                target["boxes"] = target["boxes"].repeat(self.k_one2many, 1)
			
 
				-                target["labels"] = target["labels"].repeat(self.k_one2many)
			
 
				-            # Compute one-to-many losses
			
 
				-            one2many_loss_dict = self.compute_loss(outputs_one2many, multi_targets)
			
 
				-            # add one2many losses in to the final loss_dict
			
 
				-            for k, v in one2many_loss_dict.items():
			
 
				-                if k + "_one2many" in loss_dict.keys():
			
 
				-                    loss_dict[k + "_one2many"] += v * self.lambda_one2many
			
 
				-                else:
			
 
				-                    loss_dict[k + "_one2many"] = v * self.lambda_one2many
			
 
				-
			
 
				-        return loss_dict
			
 
				-    
			
 
				-# build criterion
			
 
				-def build_criterion(cfg, num_classes, aux_loss=True):
			
 
				-    criterion = Criterion(cfg, num_classes, aux_loss)
			
 
				-
			
 
				-    return criterion
			
 
				-    
			
--- a/models/detectors/rtpdetr/loss_utils.py
+++ b/models/detectors/rtpdetr/loss_utils.py
@@ -1,124 +0,0 @@
 
				-import math
			
 
				-import torch
			
 
				-import torch.nn.functional as F
			
 
				-import torch.distributed as dist
			
 
				-from torchvision.ops.boxes import box_area
			
 
				-
			
 
				-
			
 
				-# ------------------------- For loss -------------------------
			
 
				-## FocalLoss
			
 
				-def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2):
			
 
				-    """
			
 
				-    Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
			
 
				-    Args:
			
 
				-        inputs: A float tensor of arbitrary shape.
			
 
				-                The predictions for each example.
			
 
				-        targets: A float tensor with the same shape as inputs. Stores the binary
			
 
				-                 classification label for each element in inputs
			
 
				-                (0 for the negative class and 1 for the positive class).
			
 
				-        alpha: (optional) Weighting factor in range (0,1) to balance
			
 
				-                positive vs negative examples. Default = -1 (no weighting).
			
 
				-        gamma: Exponent of the modulating factor (1 - p_t) to
			
 
				-               balance easy vs hard examples.
			
 
				-    Returns:
			
 
				-        Loss tensor
			
 
				-    """
			
 
				-    prob = inputs.sigmoid()
			
 
				-    ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
			
 
				-    p_t = prob * targets + (1 - prob) * (1 - targets)
			
 
				-    loss = ce_loss * ((1 - p_t) ** gamma)
			
 
				-
			
 
				-    if alpha >= 0:
			
 
				-        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
			
 
				-        loss = alpha_t * loss
			
 
				-
			
 
				-    return loss.mean(1).sum() / num_boxes
			
 
				-
			
 
				-
			
 
				-# ------------------------- For box -------------------------
			
 
				-def box_cxcywh_to_xyxy(x):
			
 
				-    x_c, y_c, w, h = x.unbind(-1)
			
 
				-    b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
			
 
				-         (x_c + 0.5 * w), (y_c + 0.5 * h)]
			
 
				-    return torch.stack(b, dim=-1)
			
 
				-
			
 
				-def box_xyxy_to_cxcywh(x):
			
 
				-    x0, y0, x1, y1 = x.unbind(-1)
			
 
				-    b = [(x0 + x1) / 2, (y0 + y1) / 2,
			
 
				-         (x1 - x0), (y1 - y0)]
			
 
				-    return torch.stack(b, dim=-1)
			
 
				-
			
 
				-def bbox2delta(proposals, gt, means=(0., 0., 0., 0.), stds=(1., 1., 1., 1.)):
			
 
				-    # hack for matcher
			
 
				-    if proposals.size() != gt.size():
			
 
				-        proposals = proposals[:, None]
			
 
				-        gt = gt[None]
			
 
				-
			
 
				-    proposals = proposals.float()
			
 
				-    gt = gt.float()
			
 
				-    px, py, pw, ph = proposals.unbind(-1)
			
 
				-    gx, gy, gw, gh = gt.unbind(-1)
			
 
				-
			
 
				-    dx = (gx - px) / (pw + 0.1)
			
 
				-    dy = (gy - py) / (ph + 0.1)
			
 
				-    dw = torch.log(gw / (pw + 0.1))
			
 
				-    dh = torch.log(gh / (ph + 0.1))
			
 
				-    deltas = torch.stack([dx, dy, dw, dh], dim=-1)
			
 
				-
			
 
				-    means = deltas.new_tensor(means).unsqueeze(0)
			
 
				-    stds = deltas.new_tensor(stds).unsqueeze(0)
			
 
				-    deltas = deltas.sub_(means).div_(stds)
			
 
				-
			
 
				-    return deltas
			
 
				-
			
 
				-def box_iou(boxes1, boxes2):
			
 
				-    area1 = box_area(boxes1)
			
 
				-    area2 = box_area(boxes2)
			
 
				-
			
 
				-    lt = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
			
 
				-    rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]
			
 
				-
			
 
				-    wh = (rb - lt).clamp(min=0)  # [N,M,2]
			
 
				-    inter = wh[:, :, 0] * wh[:, :, 1]  # [N,M]
			
 
				-
			
 
				-    union = area1[:, None] + area2 - inter
			
 
				-
			
 
				-    iou = inter / union
			
 
				-    return iou, union
			
 
				-
			
 
				-def generalized_box_iou(boxes1, boxes2):
			
 
				-    """
			
 
				-    Generalized IoU from https://giou.stanford.edu/
			
 
				-
			
 
				-    The boxes should be in [x0, y0, x1, y1] format
			
 
				-
			
 
				-    Returns a [N, M] pairwise matrix, where N = len(boxes1)
			
 
				-    and M = len(boxes2)
			
 
				-    """
			
 
				-    # degenerate boxes gives inf / nan results
			
 
				-    # so do an early check
			
 
				-    assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
			
 
				-    assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
			
 
				-    iou, union = box_iou(boxes1, boxes2)
			
 
				-
			
 
				-    lt = torch.min(boxes1[:, None, :2], boxes2[:, :2])
			
 
				-    rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
			
 
				-
			
 
				-    wh = (rb - lt).clamp(min=0)  # [N,M,2]
			
 
				-    area = wh[:, :, 0] * wh[:, :, 1]
			
 
				-
			
 
				-    return iou - (area - union) / area
			
 
				-
			
 
				-
			
 
				-# ------------------------- For distributed -------------------------
			
 
				-def is_dist_avail_and_initialized():
			
 
				-    if not dist.is_available():
			
 
				-        return False
			
 
				-    if not dist.is_initialized():
			
 
				-        return False
			
 
				-    return True
			
 
				-
			
 
				-def get_world_size():
			
 
				-    if not is_dist_avail_and_initialized():
			
 
				-        return 1
			
 
				-    return dist.get_world_size()
			
--- a/models/detectors/rtpdetr/matcher.py
+++ b/models/detectors/rtpdetr/matcher.py
@@ -1,115 +0,0 @@
 
				-# ------------------------------------------------------------------------
			
 
				-# Plain-DETR
			
 
				-# Copyright (c) 2023 Xi'an Jiaotong University & Microsoft Research Asia.
			
 
				-# Licensed under The MIT License [see LICENSE for details]
			
 
				-# ------------------------------------------------------------------------
			
 
				-# Deformable DETR
			
 
				-# Copyright (c) 2020 SenseTime. All Rights Reserved.
			
 
				-# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
			
 
				-# ------------------------------------------------------------------------
			
 
				-# Modified from DETR (https://github.com/facebookresearch/detr)
			
 
				-# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
			
 
				-# ------------------------------------------------------------------------
			
 
				-
			
 
				-"""
			
 
				-Modules to compute the matching cost and solve the corresponding LSAP.
			
 
				-"""
			
 
				-import torch
			
 
				-from scipy.optimize import linear_sum_assignment
			
 
				-from torch import nn
			
 
				-
			
 
				-try:
			
 
				-    from .loss_utils import box_cxcywh_to_xyxy, generalized_box_iou, bbox2delta
			
 
				-except:
			
 
				-    from loss_utils import box_cxcywh_to_xyxy, generalized_box_iou, bbox2delta
			
 
				-
			
 
				-
			
 
				-class HungarianMatcher(nn.Module):
			
 
				-    """This class computes an assignment between the targets and the predictions of the network
			
 
				-
			
 
				-    For efficiency reasons, the targets don't include the no_object. Because of this, in general,
			
 
				-    there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
			
 
				-    while the others are un-matched (and thus treated as non-objects).
			
 
				-    """
			
 
				-
			
 
				-    def __init__(self,
			
 
				-                 cost_class: float = 1,
			
 
				-                 cost_bbox:  float = 1,
			
 
				-                 cost_giou:  float = 1,
			
 
				-                 ):
			
 
				-        """Creates the matcher
			
 
				-
			
 
				-        Params:
			
 
				-            cost_class: This is the relative weight of the classification error in the matching cost
			
 
				-            cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost
			
 
				-            cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost
			
 
				-        """
			
 
				-        super().__init__()
			
 
				-        self.cost_class = cost_class
			
 
				-        self.cost_bbox = cost_bbox
			
 
				-        self.cost_giou = cost_giou
			
 
				-        assert (
			
 
				-            cost_class != 0 or cost_bbox != 0 or cost_giou != 0
			
 
				-        ), "all costs cant be 0"
			
 
				-
			
 
				-    def forward(self, outputs, targets):
			
 
				-        """ Performs the matching
			
 
				-
			
 
				-        Params:
			
 
				-            outputs: This is a dict that contains at least these entries:
			
 
				-                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
			
 
				-                 "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates
			
 
				-
			
 
				-            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
			
 
				-                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
			
 
				-                           objects in the target) containing the class labels
			
 
				-                 "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates
			
 
				-
			
 
				-        Returns:
			
 
				-            A list of size batch_size, containing tuples of (index_i, index_j) where:
			
 
				-                - index_i is the indices of the selected predictions (in order)
			
 
				-                - index_j is the indices of the corresponding selected targets (in order)
			
 
				-            For each batch element, it holds:
			
 
				-                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
			
 
				-        """
			
 
				-        with torch.no_grad():
			
 
				-            bs, num_queries = outputs["pred_logits"].shape[:2]
			
 
				-
			
 
				-            # We flatten to compute the cost matrices in a batch
			
 
				-            out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid()
			
 
				-            out_bbox = outputs["pred_boxes"].flatten(0, 1)
			
 
				-
			
 
				-            # Also concat the target labels and boxes
			
 
				-            tgt_ids = torch.cat([v["labels"] for v in targets]).to(out_prob.device)
			
 
				-            tgt_bbox = torch.cat([v["boxes"] for v in targets]).to(out_prob.device)
			
 
				-
			
 
				-            # Compute the classification cost.
			
 
				-            alpha = 0.25
			
 
				-            gamma = 2.0
			
 
				-            neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log())
			
 
				-            pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
			
 
				-            cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids]
			
 
				-
			
 
				-            # Compute the L1 cost between boxes
			
 
				-            out_delta = outputs["pred_deltas"].flatten(0, 1)
			
 
				-            out_bbox_old = outputs["pred_boxes_old"].flatten(0, 1)
			
 
				-            tgt_delta = bbox2delta(out_bbox_old, tgt_bbox)
			
 
				-            cost_bbox = torch.cdist(out_delta[:, None], tgt_delta, p=1).squeeze(1)
			
 
				-
			
 
				-            # Compute the giou cost betwen boxes
			
 
				-            cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox),
			
 
				-                                             box_cxcywh_to_xyxy(tgt_bbox)
			
 
				-            )
			
 
				-
			
 
				-            # Final cost matrix
			
 
				-            C = self.cost_bbox  * cost_bbox + \
			
 
				-                self.cost_class * cost_class + \
			
 
				-                self.cost_giou  * cost_giou
			
 
				-            C = C.view(bs, num_queries, -1).cpu()
			
 
				-
			
 
				-            sizes = [len(v["boxes"]) for v in targets]
			
 
				-            indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
			
 
				-            
			
 
				-            return [(torch.as_tensor(i, dtype=torch.int64),  # batch index
			
 
				-                     torch.as_tensor(j, dtype=torch.int64))  # query index
			
 
				-                     for i, j in indices]
			
--- a/models/detectors/rtpdetr/rtpdetr.py
+++ b/models/detectors/rtpdetr/rtpdetr.py
@@ -1,433 +0,0 @@
 
				-import math
			
 
				-import torch
			
 
				-import torch.nn as nn
			
 
				-
			
 
				-try:
			
 
				-    from .basic_modules.basic import MLP, multiclass_nms
			
 
				-    from .basic_modules.transformer import get_clones
			
 
				-    from .rtpdetr_encoder import build_image_encoder
			
 
				-    from .rtpdetr_decoder import build_transformer
			
 
				-except:
			
 
				-    from  basic_modules.basic import MLP, multiclass_nms
			
 
				-    from  basic_modules.transformer import get_clones
			
 
				-    from  rtpdetr_encoder import build_image_encoder
			
 
				-    from  rtpdetr_decoder import build_transformer
			
 
				-
			
 
				-
			
 
				-# Real-time PlainDETR
			
 
				-class RT_PDETR(nn.Module):
			
 
				-    def __init__(self,
			
 
				-                 cfg,
			
 
				-                 num_classes = 80,
			
 
				-                 conf_thresh = 0.1,
			
 
				-                 nms_thresh  = 0.5,
			
 
				-                 topk        = 300,
			
 
				-                 deploy      = False,
			
 
				-                 no_multi_labels = False,
			
 
				-                 use_nms     = False,
			
 
				-                 nms_class_agnostic = False,
			
 
				-                 aux_loss    = False,
			
 
				-                 ):
			
 
				-        super().__init__()
			
 
				-        # ----------- Basic setting -----------
			
 
				-        self.num_queries_one2one = cfg['num_queries_one2one']
			
 
				-        self.num_queries_one2many = cfg['num_queries_one2many']
			
 
				-        self.num_queries = self.num_queries_one2one + self.num_queries_one2many
			
 
				-        self.num_classes = num_classes
			
 
				-        self.num_topk = topk
			
 
				-        self.aux_loss = aux_loss
			
 
				-        self.deploy = deploy
			
 
				-        # scale hidden channels by width_factor
			
 
				-        cfg['hidden_dim'] = round(cfg['hidden_dim'] * cfg['width'])
			
 
				-        ## Post-process parameters
			
 
				-        self.use_nms = use_nms
			
 
				-        self.nms_thresh = nms_thresh
			
 
				-        self.conf_thresh = conf_thresh
			
 
				-        self.no_multi_labels = no_multi_labels
			
 
				-        self.nms_class_agnostic = nms_class_agnostic
			
 
				-
			
 
				-        # ----------- Network setting -----------
			
 
				-        ## Image encoder
			
 
				-        self.image_encoder = build_image_encoder(cfg)
			
 
				-
			
 
				-        ## Transformer Decoder
			
 
				-        self.transformer = build_transformer(cfg, return_intermediate=self.training)
			
 
				-        self.query_embed = nn.Embedding(self.num_queries, cfg['hidden_dim'])
			
 
				-
			
 
				-        ## Detect Head
			
 
				-        class_embed = nn.Linear(cfg['hidden_dim'], num_classes)
			
 
				-        bbox_embed = MLP(cfg['hidden_dim'], cfg['hidden_dim'], 4, 3)
			
 
				-
			
 
				-        prior_prob = 0.01
			
 
				-        bias_value = -math.log((1 - prior_prob) / prior_prob)
			
 
				-        class_embed.bias.data = torch.ones(num_classes) * bias_value
			
 
				-        nn.init.constant_(bbox_embed.layers[-1].weight.data, 0)
			
 
				-        nn.init.constant_(bbox_embed.layers[-1].bias.data, 0)
			
 
				-
			
 
				-        self.class_embed = get_clones(class_embed, cfg['de_num_layers'] + 1)
			
 
				-        self.bbox_embed  = get_clones(bbox_embed, cfg['de_num_layers'] + 1)
			
 
				-        nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:], -2.0)
			
 
				-
			
 
				-        self.transformer.decoder.bbox_embed = self.bbox_embed
			
 
				-        self.transformer.decoder.class_embed = self.class_embed
			
 
				-
			
 
				-    def pos2posembed(self, d_model, pos, temperature=10000):
			
 
				-        scale = 2 * torch.pi
			
 
				-        num_pos_feats = d_model // 2
			
 
				-
			
 
				-        dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos.device)
			
 
				-        dim_t_ = torch.div(dim_t, 2, rounding_mode='floor') / num_pos_feats
			
 
				-        dim_t = temperature ** (2 * dim_t_)
			
 
				-
			
 
				-        # Position embedding for XY
			
 
				-        x_embed = pos[..., 0] * scale
			
 
				-        y_embed = pos[..., 1] * scale
			
 
				-        pos_x = x_embed[..., None] / dim_t
			
 
				-        pos_y = y_embed[..., None] / dim_t
			
 
				-        pos_x = torch.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()), dim=-1).flatten(-2)
			
 
				-        pos_y = torch.stack((pos_y[..., 0::2].sin(), pos_y[..., 1::2].cos()), dim=-1).flatten(-2)
			
 
				-        posemb = torch.cat((pos_y, pos_x), dim=-1)
			
 
				-        
			
 
				-        # Position embedding for WH
			
 
				-        if pos.size(-1) == 4:
			
 
				-            w_embed = pos[..., 2] * scale
			
 
				-            h_embed = pos[..., 3] * scale
			
 
				-            pos_w = w_embed[..., None] / dim_t
			
 
				-            pos_h = h_embed[..., None] / dim_t
			
 
				-            pos_w = torch.stack((pos_w[..., 0::2].sin(), pos_w[..., 1::2].cos()), dim=-1).flatten(-2)
			
 
				-            pos_h = torch.stack((pos_h[..., 0::2].sin(), pos_h[..., 1::2].cos()), dim=-1).flatten(-2)
			
 
				-            posemb = torch.cat((posemb, pos_w, pos_h), dim=-1)
			
 
				-        
			
 
				-        return posemb
			
 
				-
			
 
				-    def get_posembed(self, d_model, mask, temperature=10000, normalize=False):
			
 
				-        not_mask = ~mask
			
 
				-        # [B, H, W]
			
 
				-        y_embed = not_mask.cumsum(1, dtype=torch.float32)
			
 
				-        x_embed = not_mask.cumsum(2, dtype=torch.float32)
			
 
				-
			
 
				-        if normalize:
			
 
				-            y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + 1e-6)
			
 
				-            x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + 1e-6)
			
 
				-        else:
			
 
				-            y_embed = y_embed - 0.5
			
 
				-            x_embed = x_embed - 0.5
			
 
				-    
			
 
				-        # [H, W] -> [B, H, W, 2]
			
 
				-        pos = torch.stack([x_embed, y_embed], dim=-1)
			
 
				-
			
 
				-        # [B, H, W, C]
			
 
				-        pos_embed = self.pos2posembed(d_model, pos, temperature)
			
 
				-        pos_embed = pos_embed.permute(0, 3, 1, 2)
			
 
				-        
			
 
				-        return pos_embed
			
 
				-
			
 
				-    def post_process(self, box_pred, cls_pred):
			
 
				-        # xywh -> xyxy
			
 
				-        box_preds_x1y1 = box_pred[..., :2] - 0.5 * box_pred[..., 2:]
			
 
				-        box_preds_x2y2 = box_pred[..., :2] + 0.5 * box_pred[..., 2:]
			
 
				-        box_pred = torch.cat([box_preds_x1y1, box_preds_x2y2], dim=-1)
			
 
				-
			
 
				-        cls_pred = cls_pred[0]
			
 
				-        box_pred = box_pred[0]
			
 
				-        if self.no_multi_labels:
			
 
				-            # [M,]
			
 
				-            scores, labels = torch.max(cls_pred.sigmoid(), dim=1)
			
 
				-
			
 
				-            # Keep top k top scoring indices only.
			
 
				-            num_topk = min(self.num_topk, box_pred.size(0))
			
 
				-
			
 
				-            # Topk candidates
			
 
				-            predicted_prob, topk_idxs = scores.sort(descending=True)
			
 
				-            topk_scores = predicted_prob[:num_topk]
			
 
				-            topk_idxs = topk_idxs[:num_topk]
			
 
				-
			
 
				-            # Filter out the proposals with low confidence score
			
 
				-            keep_idxs = topk_scores > self.conf_thresh
			
 
				-            topk_idxs = topk_idxs[keep_idxs]
			
 
				-
			
 
				-            # Top-k results
			
 
				-            topk_scores = topk_scores[keep_idxs]
			
 
				-            topk_labels = labels[topk_idxs]
			
 
				-            topk_bboxes = box_pred[topk_idxs]
			
 
				-
			
 
				-        else:
			
 
				-            # Top-k select
			
 
				-            cls_pred = cls_pred.flatten().sigmoid_()
			
 
				-            box_pred = box_pred
			
 
				-
			
 
				-            # Keep top k top scoring indices only.
			
 
				-            num_topk = min(self.num_topk, box_pred.size(0))
			
 
				-
			
 
				-            # Topk candidates
			
 
				-            predicted_prob, topk_idxs = cls_pred.sort(descending=True)
			
 
				-            topk_scores = predicted_prob[:num_topk]
			
 
				-            topk_idxs = topk_idxs[:self.num_topk]
			
 
				-
			
 
				-            # Filter out the proposals with low confidence score
			
 
				-            keep_idxs = topk_scores > self.conf_thresh
			
 
				-            topk_scores = topk_scores[keep_idxs]
			
 
				-            topk_idxs = topk_idxs[keep_idxs]
			
 
				-            topk_box_idxs = torch.div(topk_idxs, self.num_classes, rounding_mode='floor')
			
 
				-
			
 
				-            ## Top-k results
			
 
				-            topk_labels = topk_idxs % self.num_classes
			
 
				-            topk_bboxes = box_pred[topk_box_idxs]
			
 
				-
			
 
				-        topk_scores = topk_scores.cpu().numpy()
			
 
				-        topk_labels = topk_labels.cpu().numpy()
			
 
				-        topk_bboxes = topk_bboxes.cpu().numpy()
			
 
				-
			
 
				-        # nms
			
 
				-        if self.use_nms:
			
 
				-            topk_scores, topk_labels, topk_bboxes = multiclass_nms(
			
 
				-                topk_scores, topk_labels, topk_bboxes, self.nms_thresh, self.num_classes, self.nms_class_agnostic)
			
 
				-
			
 
				-        return topk_bboxes, topk_scores, topk_labels
			
 
				-    
			
 
				-    @torch.jit.unused
			
 
				-    def _set_aux_loss(self, outputs_class, outputs_coord, outputs_coord_old, outputs_deltas):
			
 
				-        # this is a workaround to make torchscript happy, as torchscript
			
 
				-        # doesn't support dictionary with non-homogeneous values, such
			
 
				-        # as a dict having both a Tensor and a list.
			
 
				-        return [
			
 
				-            {"pred_logits": a, "pred_boxes": b, "pred_boxes_old": c, "pred_deltas": d, }
			
 
				-            for a, b, c, d in zip(outputs_class[:-1], outputs_coord[:-1], outputs_coord_old[:-1], outputs_deltas[:-1])
			
 
				-        ]
			
 
				-
			
 
				-    def inference_single_image(self, x):
			
 
				-        # ----------- Image Encoder -----------
			
 
				-        src = self.image_encoder(x)
			
 
				-
			
 
				-        # ----------- Prepare inputs for Transformer -----------
			
 
				-        mask = torch.zeros([src.shape[0], src.shape[2], src.shape[3]]).bool().to(src.device)
			
 
				-        pos_embed = self.get_posembed(src.shape[1], mask, normalize=False)
			
 
				-        self_attn_mask = None
			
 
				-        query_embeds = self.query_embed.weight[:self.num_queries_one2one]
			
 
				-
			
 
				-        # -----------Transformer -----------
			
 
				-        (
			
 
				-            hs,
			
 
				-            init_reference,
			
 
				-            inter_references,
			
 
				-            _,
			
 
				-            _,
			
 
				-            _,
			
 
				-            _,
			
 
				-            max_shape
			
 
				-        ) = self.transformer(src, mask, pos_embed, query_embeds, self_attn_mask)
			
 
				-
			
 
				-        # ----------- Process outputs -----------
			
 
				-        outputs_classes_one2one = []
			
 
				-        outputs_coords_one2one = []
			
 
				-        outputs_deltas_one2one = []
			
 
				-
			
 
				-        for lid in range(hs.shape[0]):
			
 
				-            if lid == 0:
			
 
				-                reference = init_reference
			
 
				-            else:
			
 
				-                reference = inter_references[lid - 1]
			
 
				-            outputs_class = self.class_embed[lid](hs[lid])
			
 
				-            tmp = self.bbox_embed[lid](hs[lid])
			
 
				-            outputs_coord = self.transformer.decoder.delta2bbox(reference, tmp, max_shape)  # xyxy
			
 
				-
			
 
				-            outputs_classes_one2one.append(outputs_class[:, :self.num_queries_one2one])
			
 
				-            outputs_coords_one2one.append(outputs_coord[:, :self.num_queries_one2one])
			
 
				-            outputs_deltas_one2one.append(tmp[:, :self.num_queries_one2one])
			
 
				-
			
 
				-        outputs_classes_one2one = torch.stack(outputs_classes_one2one)
			
 
				-        outputs_coords_one2one = torch.stack(outputs_coords_one2one)
			
 
				-
			
 
				-        # ------------ Post process ------------
			
 
				-        cls_pred = outputs_classes_one2one[-1]
			
 
				-        box_pred = outputs_coords_one2one[-1]
			
 
				-        
			
 
				-        # post-process
			
 
				-        bboxes, scores, labels = self.post_process(box_pred, cls_pred)
			
 
				-
			
 
				-        outputs = {
			
 
				-            "scores": scores,
			
 
				-            "labels": labels,
			
 
				-            "bboxes": bboxes,
			
 
				-        }
			
 
				-
			
 
				-        return outputs
			
 
				-        
			
 
				-    def forward(self, x):
			
 
				-        if not self.training:
			
 
				-            return self.inference_single_image(x)
			
 
				-
			
 
				-        # ----------- Image Encoder -----------
			
 
				-        src = self.image_encoder(x)
			
 
				-
			
 
				-        # ----------- Prepare inputs for Transformer -----------
			
 
				-        mask = torch.zeros([src.shape[0], src.shape[2], src.shape[3]]).bool().to(src.device)
			
 
				-        pos_embed = self.get_posembed(src.shape[1], mask, normalize=False)
			
 
				-        self_attn_mask = torch.zeros(
			
 
				-            [self.num_queries, self.num_queries, ]).bool().to(src.device)
			
 
				-        self_attn_mask[self.num_queries_one2one:, 0: self.num_queries_one2one, ] = True
			
 
				-        self_attn_mask[0: self.num_queries_one2one, self.num_queries_one2one:, ] = True
			
 
				-        query_embeds = self.query_embed.weight
			
 
				-
			
 
				-        # -----------Transformer -----------
			
 
				-        (
			
 
				-            hs,
			
 
				-            init_reference,
			
 
				-            inter_references,
			
 
				-            enc_outputs_class,
			
 
				-            enc_outputs_coord_unact,
			
 
				-            enc_outputs_delta,
			
 
				-            output_proposals,
			
 
				-            max_shape
			
 
				-        ) = self.transformer(src, mask, pos_embed, query_embeds, self_attn_mask)
			
 
				-
			
 
				-        # ----------- Process outputs -----------
			
 
				-        outputs_classes_one2one = []
			
 
				-        outputs_coords_one2one = []
			
 
				-        outputs_classes_one2many = []
			
 
				-        outputs_coords_one2many = []
			
 
				-
			
 
				-        outputs_coords_old_one2one = []
			
 
				-        outputs_deltas_one2one = []
			
 
				-        outputs_coords_old_one2many = []
			
 
				-        outputs_deltas_one2many = []
			
 
				-
			
 
				-        for lid in range(hs.shape[0]):
			
 
				-            if lid == 0:
			
 
				-                reference = init_reference
			
 
				-            else:
			
 
				-                reference = inter_references[lid - 1]
			
 
				-            outputs_class = self.class_embed[lid](hs[lid])
			
 
				-            tmp = self.bbox_embed[lid](hs[lid])
			
 
				-            outputs_coord = self.transformer.decoder.box_xyxy_to_cxcywh(
			
 
				-                self.transformer.decoder.delta2bbox(reference, tmp, max_shape))
			
 
				-
			
 
				-            outputs_classes_one2one.append(outputs_class[:, 0: self.num_queries_one2one])
			
 
				-            outputs_classes_one2many.append(outputs_class[:, self.num_queries_one2one:])
			
 
				-
			
 
				-            outputs_coords_one2one.append(outputs_coord[:, 0: self.num_queries_one2one])
			
 
				-            outputs_coords_one2many.append(outputs_coord[:, self.num_queries_one2one:])
			
 
				-
			
 
				-            outputs_coords_old_one2one.append(reference[:, :self.num_queries_one2one])
			
 
				-            outputs_coords_old_one2many.append(reference[:, self.num_queries_one2one:])
			
 
				-            outputs_deltas_one2one.append(tmp[:, :self.num_queries_one2one])
			
 
				-            outputs_deltas_one2many.append(tmp[:, self.num_queries_one2one:])
			
 
				-
			
 
				-        outputs_classes_one2one = torch.stack(outputs_classes_one2one)
			
 
				-        outputs_coords_one2one = torch.stack(outputs_coords_one2one)
			
 
				-
			
 
				-        outputs_classes_one2many = torch.stack(outputs_classes_one2many)
			
 
				-        outputs_coords_one2many = torch.stack(outputs_coords_one2many)
			
 
				-
			
 
				-        out = {
			
 
				-            "pred_logits": outputs_classes_one2one[-1],
			
 
				-            "pred_boxes": outputs_coords_one2one[-1],
			
 
				-            "pred_logits_one2many": outputs_classes_one2many[-1],
			
 
				-            "pred_boxes_one2many": outputs_coords_one2many[-1],
			
 
				-
			
 
				-            "pred_boxes_old": outputs_coords_old_one2one[-1],
			
 
				-            "pred_deltas": outputs_deltas_one2one[-1],
			
 
				-            "pred_boxes_old_one2many": outputs_coords_old_one2many[-1],
			
 
				-            "pred_deltas_one2many": outputs_deltas_one2many[-1],
			
 
				-        }
			
 
				-
			
 
				-        out["aux_outputs"] = self._set_aux_loss(
			
 
				-            outputs_classes_one2one, outputs_coords_one2one, outputs_coords_old_one2one, outputs_deltas_one2one
			
 
				-        )
			
 
				-        out["aux_outputs_one2many"] = self._set_aux_loss(
			
 
				-            outputs_classes_one2many, outputs_coords_one2many, outputs_coords_old_one2many, outputs_deltas_one2many
			
 
				-        )
			
 
				-
			
 
				-        out["enc_outputs"] = {
			
 
				-            "pred_logits": enc_outputs_class,
			
 
				-            "pred_boxes": enc_outputs_coord_unact,
			
 
				-            "pred_boxes_old": output_proposals,
			
 
				-            "pred_deltas": enc_outputs_delta,
			
 
				-        }
			
 
				-
			
 
				-        return out
			
 
				-                
			
 
				-
			
 
				-if __name__ == '__main__':
			
 
				-    import time
			
 
				-    from thop import profile
			
 
				-    from loss import build_criterion
			
 
				-
			
 
				-    # Model config
			
 
				-    cfg = {
			
 
				-        'width': 1.0,
			
 
				-        'depth': 1.0,
			
 
				-        'max_stride': 32,
			
 
				-        'out_stride': 16,
			
 
				-        # Image Encoder - Backbone
			
 
				-        'backbone': 'resnet50',
			
 
				-        'backbone_norm': 'FrozeBN',
			
 
				-        'pretrained': True,
			
 
				-        'freeze_at': 0,
			
 
				-        'freeze_stem_only': False,
			
 
				-        'hidden_dim': 256,
			
 
				-        'en_num_heads': 8,
			
 
				-        'en_num_layers': 6,
			
 
				-        'en_ffn_dim': 2048,
			
 
				-        'en_dropout': 0.0,
			
 
				-        'en_act': 'gelu',
			
 
				-        # Transformer Decoder
			
 
				-        'transformer': 'plain_detr_transformer',
			
 
				-        'hidden_dim': 256,
			
 
				-        'de_num_heads': 8,
			
 
				-        'de_num_layers': 6,
			
 
				-        'de_ffn_dim': 2048,
			
 
				-        'de_dropout': 0.0,
			
 
				-        'de_act': 'gelu',
			
 
				-        'de_pre_norm': True,
			
 
				-        'rpe_hidden_dim': 512,
			
 
				-        'use_checkpoint': False,
			
 
				-        'proposal_feature_levels': 3,
			
 
				-        'proposal_tgt_strides': [8, 16, 32],
			
 
				-        'num_queries_one2one': 300,
			
 
				-        'num_queries_one2many': 300,
			
 
				-        # Matcher
			
 
				-        'matcher_hpy': {'cost_class': 2.0,
			
 
				-                        'cost_bbox': 1.0,
			
 
				-                        'cost_giou': 2.0,},
			
 
				-        # Loss
			
 
				-        'use_vfl': True,
			
 
				-        'k_one2many': 6,
			
 
				-        'lambda_one2many': 1.0,
			
 
				-        'loss_coeff': {'class': 2,
			
 
				-                       'bbox': 1,
			
 
				-                       'giou': 2,
			
 
				-                       'no_object': 0.1,},
			
 
				-        }
			
 
				-    bs = 1
			
 
				-    # Create a batch of images & targets
			
 
				-    image = torch.randn(bs, 3, 640, 640)
			
 
				-    targets = [{
			
 
				-        'labels': torch.tensor([2, 4, 5, 8]).long(),
			
 
				-        'boxes':  torch.tensor([[0, 0, 10, 10], [12, 23, 56, 70], [0, 10, 20, 30], [50, 60, 55, 150]]).float() / 640.
			
 
				-    }] * bs
			
 
				-
			
 
				-    # Create model
			
 
				-    model = RT_PDETR(cfg, num_classes=80)
			
 
				-    model.train()
			
 
				-
			
 
				-    # Model inference
			
 
				-    t0 = time.time()
			
 
				-    outputs = model(image)
			
 
				-    t1 = time.time()
			
 
				-    print('Infer time: ', t1 - t0)
			
 
				-
			
 
				-    # Create criterion
			
 
				-    criterion = build_criterion(cfg, num_classes=80, aux_loss=True)
			
 
				-
			
 
				-    # Compute loss
			
 
				-    loss = criterion(outputs, targets)
			
 
				-    for k in loss.keys():
			
 
				-        print("{} : {}".format(k, loss[k].item()))
			
 
				-
			
 
				-    print('==============================')
			
 
				-    model.eval()
			
 
				-    flops, params = profile(model, inputs=(image, ), verbose=False)
			
 
				-    print('==============================')
			
 
				-    print('GFLOPs : {:.2f}'.format(flops / 1e9 * 2))
			
 
				-    print('Params : {:.2f} M'.format(params / 1e6))
			
--- a/models/detectors/rtpdetr/rtpdetr_decoder.py
+++ b/models/detectors/rtpdetr/rtpdetr_decoder.py
@@ -1,405 +0,0 @@
 
				-import math
			
 
				-import torch
			
 
				-import torch.nn as nn
			
 
				-import torch.nn.functional as F
			
 
				-
			
 
				-try:
			
 
				-    from .basic_modules.basic import LayerNorm2D
			
 
				-    from .basic_modules.transformer import GlobalDecoder
			
 
				-except:
			
 
				-    from  basic_modules.basic import LayerNorm2D
			
 
				-    from  basic_modules.transformer import GlobalDecoder
			
 
				-
			
 
				-
			
 
				-def build_transformer(cfg, return_intermediate=False):
			
 
				-    if cfg['transformer'] == 'plain_detr_transformer':
			
 
				-        return PlainDETRTransformer(d_model             = cfg['hidden_dim'],
			
 
				-                                    num_heads           = cfg['de_num_heads'],
			
 
				-                                    ffn_dim             = cfg['de_ffn_dim'],
			
 
				-                                    dropout             = cfg['de_dropout'],
			
 
				-                                    act_type            = cfg['de_act'],
			
 
				-                                    pre_norm            = cfg['de_pre_norm'],
			
 
				-                                    rpe_hidden_dim      = cfg['rpe_hidden_dim'],
			
 
				-                                    feature_stride      = cfg['out_stride'],
			
 
				-                                    num_layers          = cfg['de_num_layers'],
			
 
				-                                    return_intermediate = return_intermediate,
			
 
				-                                    use_checkpoint      = cfg['use_checkpoint'],
			
 
				-                                    num_queries_one2one = cfg['num_queries_one2one'],
			
 
				-                                    num_queries_one2many    = cfg['num_queries_one2many'],
			
 
				-                                    proposal_feature_levels = cfg['proposal_feature_levels'],
			
 
				-                                    proposal_in_stride      = cfg['out_stride'],
			
 
				-                                    proposal_tgt_strides    = cfg['proposal_tgt_strides'],
			
 
				-                                    )
			
 
				-
			
 
				-
			
 
				-# ----------------- Dencoder for Detection task -----------------
			
 
				-## PlainDETR's Transformer for Detection task
			
 
				-class PlainDETRTransformer(nn.Module):
			
 
				-    def __init__(self,
			
 
				-                 # Decoder layer params
			
 
				-                 d_model        :int   = 256,
			
 
				-                 num_heads      :int   = 8,
			
 
				-                 ffn_dim        :int   = 1024,
			
 
				-                 dropout        :float = 0.1,
			
 
				-                 act_type       :str   = "relu",
			
 
				-                 pre_norm       :bool  = False,
			
 
				-                 rpe_hidden_dim :int   = 512,
			
 
				-                 feature_stride :int   = 16,
			
 
				-                 num_layers     :int   = 6,
			
 
				-                 # Decoder params
			
 
				-                 return_intermediate     :bool = False,
			
 
				-                 use_checkpoint          :bool = False,
			
 
				-                 num_queries_one2one     :int  = 300,
			
 
				-                 num_queries_one2many    :int  = 1500,
			
 
				-                 proposal_feature_levels :int  = 3,
			
 
				-                 proposal_in_stride      :int  = 16,
			
 
				-                 proposal_tgt_strides    :int  = [8, 16, 32],
			
 
				-                 ):
			
 
				-        super().__init__()
			
 
				-        # ------------ Basic setting ------------
			
 
				-        ## Model
			
 
				-        self.d_model = d_model
			
 
				-        self.num_heads = num_heads
			
 
				-        self.rpe_hidden_dim = rpe_hidden_dim
			
 
				-        self.ffn_dim = ffn_dim
			
 
				-        self.act_type = act_type
			
 
				-        self.num_layers = num_layers
			
 
				-        self.return_intermediate = return_intermediate
			
 
				-        ## Trick
			
 
				-        self.use_checkpoint = use_checkpoint
			
 
				-        self.num_queries_one2one = num_queries_one2one
			
 
				-        self.num_queries_one2many = num_queries_one2many
			
 
				-        self.proposal_feature_levels = proposal_feature_levels
			
 
				-        self.proposal_tgt_strides = proposal_tgt_strides
			
 
				-        self.proposal_in_stride = proposal_in_stride
			
 
				-        self.proposal_min_size = 50
			
 
				-
			
 
				-        # --------------- Network setting ---------------
			
 
				-        ## Global Decoder
			
 
				-        self.decoder = GlobalDecoder(d_model, num_heads, ffn_dim, dropout, act_type, pre_norm,
			
 
				-                                     rpe_hidden_dim, feature_stride, num_layers, return_intermediate,
			
 
				-                                     use_checkpoint,)
			
 
				-        
			
 
				-        ## Two stage
			
 
				-        self.enc_output = nn.Linear(d_model, d_model)
			
 
				-        self.enc_output_norm = nn.LayerNorm(d_model)
			
 
				-        self.pos_trans = nn.Linear(d_model * 2, d_model * 2)
			
 
				-        self.pos_trans_norm = nn.LayerNorm(d_model * 2)
			
 
				-
			
 
				-        ## Expand layers
			
 
				-        if proposal_feature_levels > 1:
			
 
				-            assert len(proposal_tgt_strides) == proposal_feature_levels
			
 
				-
			
 
				-            self.enc_output_proj = nn.ModuleList([])
			
 
				-            for stride in proposal_tgt_strides:
			
 
				-                if stride == proposal_in_stride:
			
 
				-                    self.enc_output_proj.append(nn.Identity())
			
 
				-                elif stride > proposal_in_stride:
			
 
				-                    scale = int(math.log2(stride / proposal_in_stride))
			
 
				-                    layers = []
			
 
				-                    for _ in range(scale - 1):
			
 
				-                        layers += [
			
 
				-                            nn.Conv2d(d_model, d_model, kernel_size=2, stride=2),
			
 
				-                            LayerNorm2D(d_model),
			
 
				-                            nn.GELU()
			
 
				-                        ]
			
 
				-                    layers.append(nn.Conv2d(d_model, d_model, kernel_size=2, stride=2))
			
 
				-                    self.enc_output_proj.append(nn.Sequential(*layers))
			
 
				-                else:
			
 
				-                    scale = int(math.log2(proposal_in_stride / stride))
			
 
				-                    layers = []
			
 
				-                    for _ in range(scale - 1):
			
 
				-                        layers += [
			
 
				-                            nn.ConvTranspose2d(d_model, d_model, kernel_size=2, stride=2),
			
 
				-                            LayerNorm2D(d_model),
			
 
				-                            nn.GELU()
			
 
				-                        ]
			
 
				-                    layers.append(nn.ConvTranspose2d(d_model, d_model, kernel_size=2, stride=2))
			
 
				-                    self.enc_output_proj.append(nn.Sequential(*layers))
			
 
				-
			
 
				-        self._reset_parameters()
			
 
				-
			
 
				-    def _reset_parameters(self):
			
 
				-        for p in self.parameters():
			
 
				-            if p.dim() > 1:
			
 
				-                nn.init.xavier_uniform_(p)
			
 
				-
			
 
				-        if hasattr(self.decoder, '_reset_parameters'):
			
 
				-            print('decoder re-init')
			
 
				-            self.decoder._reset_parameters()
			
 
				-
			
 
				-    def get_proposal_pos_embed(self, proposals):
			
 
				-        num_pos_feats = self.d_model // 2
			
 
				-        temperature = 10000
			
 
				-        scale = 2 * torch.pi
			
 
				-
			
 
				-        dim_t = torch.arange(
			
 
				-            num_pos_feats, dtype=torch.float32, device=proposals.device
			
 
				-        )
			
 
				-        dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats)
			
 
				-        # N, L, 4
			
 
				-        proposals = proposals * scale
			
 
				-        # N, L, 4, 128
			
 
				-        pos = proposals[:, :, :, None] / dim_t
			
 
				-        # N, L, 4, 64, 2
			
 
				-        pos = torch.stack(
			
 
				-            (pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4
			
 
				-        ).flatten(2)
			
 
				-
			
 
				-        return pos
			
 
				-
			
 
				-    def get_valid_ratio(self, mask):
			
 
				-        _, H, W = mask.shape
			
 
				-        valid_H = torch.sum(~mask[:, :, 0], 1)
			
 
				-        valid_W = torch.sum(~mask[:, 0, :], 1)
			
 
				-        valid_ratio_h = valid_H.float() / H
			
 
				-        valid_ratio_w = valid_W.float() / W
			
 
				-        valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1)
			
 
				-
			
 
				-        return valid_ratio
			
 
				-
			
 
				-    def expand_encoder_output(self, memory, memory_padding_mask, spatial_shapes):
			
 
				-        assert spatial_shapes.size(0) == 1, f'Get encoder output of shape {spatial_shapes}, not sure how to expand'
			
 
				-
			
 
				-        bs, _, c = memory.shape
			
 
				-        h, w = spatial_shapes[0]
			
 
				-
			
 
				-        _out_memory = memory.view(bs, h, w, c).permute(0, 3, 1, 2)
			
 
				-        _out_memory_padding_mask = memory_padding_mask.view(bs, h, w)
			
 
				-
			
 
				-        out_memory, out_memory_padding_mask, out_spatial_shapes = [], [], []
			
 
				-        for i in range(self.proposal_feature_levels):
			
 
				-            mem = self.enc_output_proj[i](_out_memory)
			
 
				-            mask = F.interpolate(
			
 
				-                _out_memory_padding_mask[None].float(), size=mem.shape[-2:]
			
 
				-            ).to(torch.bool)
			
 
				-
			
 
				-            out_memory.append(mem)
			
 
				-            out_memory_padding_mask.append(mask.squeeze(0))
			
 
				-            out_spatial_shapes.append(mem.shape[-2:])
			
 
				-
			
 
				-        out_memory = torch.cat([mem.flatten(2).transpose(1, 2) for mem in out_memory], dim=1)
			
 
				-        out_memory_padding_mask = torch.cat([mask.flatten(1) for mask in out_memory_padding_mask], dim=1)
			
 
				-        out_spatial_shapes = torch.as_tensor(out_spatial_shapes, dtype=torch.long, device=out_memory.device)
			
 
				-        
			
 
				-        return out_memory, out_memory_padding_mask, out_spatial_shapes
			
 
				-
			
 
				-    def gen_encoder_output_proposals(self, memory, memory_padding_mask, spatial_shapes):
			
 
				-        if self.proposal_feature_levels > 1:
			
 
				-            memory, memory_padding_mask, spatial_shapes = self.expand_encoder_output(
			
 
				-                memory, memory_padding_mask, spatial_shapes
			
 
				-            )
			
 
				-        N_, S_, C_ = memory.shape
			
 
				-        # base_scale = 4.0
			
 
				-        proposals = []
			
 
				-        _cur = 0
			
 
				-        for lvl, (H_, W_) in enumerate(spatial_shapes):
			
 
				-            stride = self.proposal_tgt_strides[lvl]
			
 
				-
			
 
				-            grid_y, grid_x = torch.meshgrid(
			
 
				-                torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device),
			
 
				-                torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device),
			
 
				-            )
			
 
				-            grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)
			
 
				-            grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) * stride
			
 
				-            wh = torch.ones_like(grid) * self.proposal_min_size * (2.0 ** lvl)
			
 
				-            proposal = torch.cat((grid, wh), -1).view(N_, -1, 4)
			
 
				-            proposals.append(proposal)
			
 
				-            _cur += H_ * W_
			
 
				-        output_proposals = torch.cat(proposals, 1)
			
 
				-
			
 
				-        H_, W_ = spatial_shapes[0]
			
 
				-        stride = self.proposal_tgt_strides[0]
			
 
				-        mask_flatten_ = memory_padding_mask[:, :H_*W_].view(N_, H_, W_, 1)
			
 
				-        valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1, keepdim=True) * stride
			
 
				-        valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1, keepdim=True) * stride
			
 
				-        img_size = torch.cat([valid_W, valid_H, valid_W, valid_H], dim=-1)
			
 
				-        img_size = img_size.unsqueeze(1) # [BS, 1, 4]
			
 
				-
			
 
				-        output_proposals_valid = (
			
 
				-            (output_proposals > 0.01 * img_size) & (output_proposals < 0.99 * img_size)
			
 
				-        ).all(-1, keepdim=True)
			
 
				-        output_proposals = output_proposals.masked_fill(
			
 
				-            memory_padding_mask.unsqueeze(-1).repeat(1, 1, 1),
			
 
				-            max(H_, W_) * stride,
			
 
				-        )
			
 
				-        output_proposals = output_proposals.masked_fill(
			
 
				-            ~output_proposals_valid,
			
 
				-            max(H_, W_) * stride,
			
 
				-        )
			
 
				-
			
 
				-        output_memory = memory
			
 
				-        output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0))
			
 
				-        output_memory = output_memory.masked_fill(~output_proposals_valid, float(0))
			
 
				-        output_memory = self.enc_output_norm(self.enc_output(output_memory))
			
 
				-
			
 
				-        max_shape = (valid_H[:, None, :], valid_W[:, None, :])
			
 
				-        return output_memory, output_proposals, max_shape
			
 
				-    
			
 
				-    def get_reference_points(self, memory, mask_flatten, spatial_shapes):
			
 
				-        output_memory, output_proposals, max_shape = self.gen_encoder_output_proposals(
			
 
				-            memory, mask_flatten, spatial_shapes
			
 
				-        )
			
 
				-
			
 
				-        # hack implementation for two-stage Deformable DETR
			
 
				-        enc_outputs_class = self.decoder.class_embed[self.decoder.num_layers](output_memory)
			
 
				-        enc_outputs_delta = self.decoder.bbox_embed[self.decoder.num_layers](output_memory)
			
 
				-        enc_outputs_coord_unact = self.decoder.box_xyxy_to_cxcywh(self.decoder.delta2bbox(
			
 
				-            output_proposals,
			
 
				-            enc_outputs_delta,
			
 
				-            max_shape
			
 
				-        ))
			
 
				-
			
 
				-        topk = self.two_stage_num_proposals
			
 
				-        topk_proposals = torch.topk(enc_outputs_class.max(-1)[0], topk, dim=1)[1]
			
 
				-        topk_coords_unact = torch.gather(
			
 
				-            enc_outputs_coord_unact, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4)
			
 
				-        )
			
 
				-        topk_coords_unact = topk_coords_unact.detach()
			
 
				-        reference_points = topk_coords_unact
			
 
				-        
			
 
				-        return (reference_points, max_shape, enc_outputs_class,
			
 
				-                enc_outputs_coord_unact, enc_outputs_delta, output_proposals)
			
 
				-
			
 
				-    def forward(self, src, mask, pos_embed, query_embed=None, self_attn_mask=None):
			
 
				-        # Prepare input for encoder
			
 
				-        bs, c, h, w = src.shape
			
 
				-        src_flatten = src.flatten(2).transpose(1, 2)
			
 
				-        mask_flatten = mask.flatten(1)
			
 
				-        pos_embed_flatten = pos_embed.flatten(2).transpose(1, 2)
			
 
				-        spatial_shapes = torch.as_tensor([(h, w)], dtype=torch.long, device=src_flatten.device)
			
 
				-
			
 
				-        # Prepare input for decoder
			
 
				-        memory = src_flatten
			
 
				-        bs, _, c = memory.shape
			
 
				-
			
 
				-        # Two stage trick
			
 
				-        if self.training:
			
 
				-            self.two_stage_num_proposals = self.num_queries_one2one + self.num_queries_one2many
			
 
				-        else:
			
 
				-            self.two_stage_num_proposals = self.num_queries_one2one
			
 
				-        (reference_points, max_shape, enc_outputs_class,
			
 
				-        enc_outputs_coord_unact, enc_outputs_delta, output_proposals) \
			
 
				-            = self.get_reference_points(memory, mask_flatten, spatial_shapes)
			
 
				-        init_reference_out = reference_points
			
 
				-        pos_trans_out = torch.zeros((bs, self.two_stage_num_proposals, 2*c), device=init_reference_out.device)
			
 
				-        pos_trans_out = self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(reference_points)))
			
 
				-
			
 
				-        # Mixed selection trick
			
 
				-        tgt = query_embed.unsqueeze(0).expand(bs, -1, -1)
			
 
				-        query_embed, _ = torch.split(pos_trans_out, c, dim=2)
			
 
				-
			
 
				-        # Decoder
			
 
				-        hs, inter_references = self.decoder(tgt,
			
 
				-                                            reference_points,
			
 
				-                                            memory,
			
 
				-                                            pos_embed_flatten,
			
 
				-                                            spatial_shapes,
			
 
				-                                            query_embed,
			
 
				-                                            mask_flatten,
			
 
				-                                            self_attn_mask,
			
 
				-                                            max_shape
			
 
				-                                            )
			
 
				-        inter_references_out = inter_references
			
 
				-
			
 
				-        return (hs,
			
 
				-                init_reference_out,
			
 
				-                inter_references_out,
			
 
				-                enc_outputs_class,
			
 
				-                enc_outputs_coord_unact,
			
 
				-                enc_outputs_delta,
			
 
				-                output_proposals,
			
 
				-                max_shape
			
 
				-                )
			
 
				-
			
 
				-
			
 
				-# ----------------- Dencoder for Segmentation task -----------------
			
 
				-## PlainDETR's Transformer for Segmentation task
			
 
				-class SegTransformerDecoder(nn.Module):
			
 
				-    def __init__(self, ):
			
 
				-        super().__init__()
			
 
				-        # TODO: design seg-decoder
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        return
			
 
				-
			
 
				-
			
 
				-# ----------------- Dencoder for Pose estimation task -----------------
			
 
				-## PlainDETR's Transformer for Pose estimation task
			
 
				-class PosTransformerDecoder(nn.Module):
			
 
				-    def __init__(self, ):
			
 
				-        super().__init__()
			
 
				-        # TODO: design seg-decoder
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        return
			
 
				-
			
 
				-
			
 
				-if __name__ == '__main__':
			
 
				-    import time
			
 
				-    from thop import profile
			
 
				-    from basic_modules.basic import MLP
			
 
				-    from basic_modules.transformer import get_clones
			
 
				-
			
 
				-    cfg = {
			
 
				-        'out_stride': 16,
			
 
				-        'hidden_dim': 256,
			
 
				-        # Transformer Decoder
			
 
				-        'transformer': 'plain_detr_transformer',
			
 
				-        'de_num_heads': 8,
			
 
				-        'de_num_layers': 6,
			
 
				-        'de_ffn_dim': 1024,
			
 
				-        'de_dropout': 0.0,
			
 
				-        'de_act': 'gelu',
			
 
				-        'de_pre_norm': True,
			
 
				-        'rpe_hidden_dim': 512,
			
 
				-        'use_checkpoint': False,
			
 
				-        'proposal_feature_levels': 3,
			
 
				-        'proposal_tgt_strides': [8, 16, 32],
			
 
				-        'num_queries_one2one': 300,
			
 
				-        'num_queries_one2many': 100,
			
 
				-    }
			
 
				-    feat = torch.randn(1, cfg['hidden_dim'], 40, 40)
			
 
				-    mask = torch.zeros(1, 40, 40)
			
 
				-    pos_embed = torch.randn(1, cfg['hidden_dim'], 40, 40)
			
 
				-    query_embed = torch.randn(cfg['num_queries_one2one'] + cfg['num_queries_one2many'], cfg['hidden_dim'])
			
 
				-
			
 
				-    model = build_transformer(cfg, True)
			
 
				-
			
 
				-    class_embed = nn.Linear(cfg['hidden_dim'], 80)
			
 
				-    bbox_embed = MLP(cfg['hidden_dim'], cfg['hidden_dim'], 4, 3)
			
 
				-    class_embed = get_clones(class_embed, cfg['de_num_layers'] + 1)
			
 
				-    bbox_embed = get_clones(bbox_embed, cfg['de_num_layers'] + 1)
			
 
				-
			
 
				-    model.decoder.bbox_embed = bbox_embed
			
 
				-    model.decoder.class_embed = class_embed
			
 
				-
			
 
				-    model.train()
			
 
				-    t0 = time.time()
			
 
				-    outputs = model(feat, mask, pos_embed, query_embed)
			
 
				-    (hs,
			
 
				-     init_reference_out,
			
 
				-     inter_references_out,
			
 
				-     enc_outputs_class,
			
 
				-     enc_outputs_coord_unact,
			
 
				-     enc_outputs_delta,
			
 
				-     output_proposals,
			
 
				-     max_shape
			
 
				-     ) = outputs
			
 
				-    t1 = time.time()
			
 
				-    print('Time: ', t1 - t0)
			
 
				-    print(hs.shape)
			
 
				-    print(init_reference_out.shape)
			
 
				-    print(inter_references_out.shape)
			
 
				-    print(enc_outputs_class.shape)
			
 
				-    print(enc_outputs_coord_unact.shape)
			
 
				-    print(enc_outputs_delta.shape)
			
 
				-    print(output_proposals.shape)
			
 
				-
			
 
				-    print('==============================')
			
 
				-    model.eval()
			
 
				-    query_embed = torch.randn(cfg['num_queries_one2one'], cfg['hidden_dim'])
			
 
				-    flops, params = profile(model, inputs=(feat, mask, pos_embed, query_embed, ), verbose=False)
			
 
				-    print('==============================')
			
 
				-    print('GFLOPs : {:.2f}'.format(flops / 1e9 * 2))
			
 
				-    print('Params : {:.2f} M'.format(params / 1e6))
			
--- a/models/detectors/rtpdetr/rtpdetr_encoder.py
+++ b/models/detectors/rtpdetr/rtpdetr_encoder.py
@@ -1,99 +0,0 @@
 
				-import torch
			
 
				-import torch.nn as nn
			
 
				-import torch.nn.functional as F
			
 
				-
			
 
				-try:
			
 
				-    from .basic_modules.basic    import BasicConv, UpSampleWrapper
			
 
				-    from .basic_modules.backbone import build_backbone
			
 
				-    from .basic_modules.transformer import TransformerEncoder
			
 
				-except:
			
 
				-    from  basic_modules.basic    import BasicConv, UpSampleWrapper
			
 
				-    from  basic_modules.backbone import build_backbone
			
 
				-    from  basic_modules.transformer import TransformerEncoder
			
 
				-
			
 
				-
			
 
				-# ----------------- Image Encoder -----------------
			
 
				-def build_image_encoder(cfg):
			
 
				-    return ImageEncoder(cfg)
			
 
				-
			
 
				-class ImageEncoder(nn.Module):
			
 
				-    def __init__(self, cfg):
			
 
				-        super().__init__()
			
 
				-        # ---------------- Basic settings ----------------
			
 
				-        ## Basic parameters
			
 
				-        self.cfg = cfg
			
 
				-        ## Network parameters
			
 
				-        self.stride = cfg['out_stride']
			
 
				-        self.upsample_factor = 32 // self.stride
			
 
				-        self.hidden_dim = cfg['hidden_dim']
			
 
				-        
			
 
				-        # ---------------- Network settings ----------------
			
 
				-        ## Backbone Network
			
 
				-        self.backbone, fpn_feat_dims = build_backbone(cfg, pretrained=cfg['pretrained']&self.training)
			
 
				-
			
 
				-        ## Input projection
			
 
				-        self.input_proj = BasicConv(fpn_feat_dims[-1], cfg['hidden_dim'], kernel_size=1, act_type=None, norm_type='BN')
			
 
				-
			
 
				-        # ---------------- Transformer Encoder ----------------
			
 
				-        self.transformer_encoder = TransformerEncoder(d_model     = cfg['hidden_dim'],
			
 
				-                                                      num_heads   = cfg['en_num_heads'],
			
 
				-                                                      num_layers  = cfg['en_num_layers'],
			
 
				-                                                      ffn_dim     = cfg['en_ffn_dim'],
			
 
				-                                                      dropout     = cfg['en_dropout'],
			
 
				-                                                      act_type    = cfg['en_act']
			
 
				-                                                      )
			
 
				-
			
 
				-        ## Upsample layer
			
 
				-        self.upsample = UpSampleWrapper(cfg['hidden_dim'], self.upsample_factor)
			
 
				-        
			
 
				-        ## Output projection
			
 
				-        self.output_proj = BasicConv(cfg['hidden_dim'], cfg['hidden_dim'], kernel_size=3, padding=1, act_type='silu', norm_type='BN')
			
 
				-
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        pyramid_feats = self.backbone(x)
			
 
				-        feat = self.input_proj(pyramid_feats[-1])
			
 
				-        feat = self.transformer_encoder(feat)
			
 
				-        feat = self.upsample(feat)
			
 
				-        feat = self.output_proj(feat)
			
 
				-
			
 
				-        return feat
			
 
				-
			
 
				-
			
 
				-if __name__ == '__main__':
			
 
				-    import time
			
 
				-    from thop import profile
			
 
				-    cfg = {
			
 
				-        'width': 1.0,
			
 
				-        'depth': 1.0,
			
 
				-        'out_stride': 16,
			
 
				-        # Image Encoder - Backbone
			
 
				-        'backbone': 'resnet50',
			
 
				-        'backbone_norm': 'FrozeBN',
			
 
				-        'pretrained': True,
			
 
				-        'freeze_at': 0,
			
 
				-        'freeze_stem_only': False,
			
 
				-        'hidden_dim': 256,
			
 
				-        'en_num_heads': 8,
			
 
				-        'en_num_layers': 1,
			
 
				-        'en_ffn_dim': 1024,
			
 
				-        'en_dropout': 0.0,
			
 
				-        'en_act': 'gelu',
			
 
				-    }
			
 
				-    x = torch.rand(2, 3, 640, 640)
			
 
				-    model = build_image_encoder(cfg)
			
 
				-    model.train()
			
 
				-
			
 
				-    t0 = time.time()
			
 
				-    outputs = model(x)
			
 
				-    t1 = time.time()
			
 
				-    print('Time: ', t1 - t0)
			
 
				-    print(outputs.shape)
			
 
				-
			
 
				-    print('==============================')
			
 
				-    model.eval()
			
 
				-    x = torch.rand(1, 3, 640, 640)
			
 
				-    flops, params = profile(model, inputs=(x, ), verbose=False)
			
 
				-    print('==============================')
			
 
				-    print('GFLOPs : {:.2f}'.format(flops / 1e9 * 2))
			
 
				-    print('Params : {:.2f} M'.format(params / 1e6))