
Remove unused RT-PlainDETR code

yjh0410 · 1 year ago
commit 67b8932c1f

+ 0 - 4
config/__init__.py

@@ -96,7 +96,6 @@ from .model_config.yolov8_config import yolov8_cfg
 from .model_config.yolox_config  import yolox_cfg
 ## Real-time DETR series
 from .model_config.rtdetr_config import rtdetr_cfg
-from .model_config.rtpdetr_config import rtpdetr_cfg
 
 def build_model_config(args):
     print('==============================')
@@ -131,9 +130,6 @@ def build_model_config(args):
     # RT-DETR
     elif args.model in ['rtdetr_r18', 'rtdetr_r34', 'rtdetr_r50', 'rtdetr_r101']:
         cfg = rtdetr_cfg[args.model]
-    # RT-PlainDETR
-    elif args.model in ['rtpdetr_r18', 'rtpdetr_r34', 'rtpdetr_r50', 'rtpdetr_r101']:
-        cfg = rtpdetr_cfg[args.model]
 
     return cfg
 

+ 0 - 57
config/model_config/rtpdetr_config.py

@@ -1,57 +0,0 @@
-# Real-time Transformer-based Object Detector
-
-
-# ------------------- Det task --------------------
-rtpdetr_cfg = {
-    'rtpdetr_r50':{
-        # ---------------- Model config ----------------
-        ## Model scale
-        'width': 1.0,
-        'depth': 1.0,
-        'max_stride': 32,
-        'out_stride': 16,
-        # Image Encoder - Backbone
-        'backbone': 'resnet50',
-        'backbone_norm': 'FrozeBN',
-        'pretrained': True,
-        'freeze_at': 0,
-        'freeze_stem_only': False,
-        'hidden_dim': 256,
-        'en_num_heads': 8,
-        'en_num_layers': 6,
-        'en_ffn_dim': 2048,
-        'en_dropout': 0.0,
-        'en_act': 'gelu',
-        # Transformer Decoder
-        'transformer': 'plain_detr_transformer',
-        'de_num_heads': 8,
-        'de_num_layers': 6,
-        'de_ffn_dim': 2048,
-        'de_dropout': 0.0,
-        'de_act': 'gelu',
-        'de_pre_norm': True,
-        'rpe_hidden_dim': 512,
-        'use_checkpoint': False,
-        'proposal_feature_levels': 3,
-        'proposal_tgt_strides': [8, 16, 32],
-        'num_queries_one2one': 300,
-        'num_queries_one2many': 1500,
-        # ---------------- Assignment config ----------------
-        'matcher_hpy': {'cost_class': 2.0,
-                        'cost_bbox': 1.0,
-                        'cost_giou': 2.0,},
-        # ---------------- Loss config ----------------
-        'k_one2many': 6,
-        'lambda_one2many': 1.0,
-        'loss_coeff': {'class': 2,
-                       'bbox': 1,
-                       'giou': 2,},
-        # ---------------- Train config ----------------
-        ## input
-        'multi_scale': [0.5, 1.25],   # 320 -> 800
-        'trans_type': 'rtdetr_l',
-        # ---------------- Train config ----------------
-        'trainer_type': 'rtpdetr',
-    },
-
-}
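For context, a minimal sketch of how an entry of this (now removed) dict was selected at build time, mirroring the branch deleted from config/__init__.py above; the stand-in dict and model name below are illustrative only.

    # Editor's sketch; 'rtpdetr_cfg' stands in for the full dict shown in this file.
    rtpdetr_cfg = {'rtpdetr_r50': {'trainer_type': 'rtpdetr', 'hidden_dim': 256}}
    model_name = 'rtpdetr_r50'                    # hypothetical args.model value
    if model_name in ['rtpdetr_r18', 'rtpdetr_r34', 'rtpdetr_r50', 'rtpdetr_r101']:
        cfg = rtpdetr_cfg[model_name]
    print(cfg['trainer_type'])                    # -> 'rtpdetr'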

+ 0 - 116
engine.py

@@ -1488,119 +1488,6 @@ class RTDetrTrainer(object):
         
         self.train_loader.dataset.transform = self.train_transform
 
-## Real-time PlainDETR Trainer
-class RTPDetrTrainer(RTDetrTrainer):
-    def __init__(self, args, data_cfg, model_cfg, trans_cfg, device, model, criterion, world_size):
-        super().__init__(args, data_cfg, model_cfg, trans_cfg, device, model, criterion, world_size)
-        # ------------------- Basic parameters -------------------
-        ## Reset optimizer hyper-parameters
-        self.optimizer_dict = {'optimizer': 'adamw', 'momentum': None, 'weight_decay': 0.0001, 'lr0': 0.0001, 'backbone_lr_ratio': 0.1}
-        self.warmup_dict = {'warmup': 'linear', 'warmup_iters': 2000, 'warmup_factor': 0.00066667}
-        self.lr_schedule_dict = {'lr_scheduler': 'step', 'lr_epoch': [self.args.max_epoch // 12 * 11]}
-        self.normalize_bbox = False
-
-        # ---------------------------- Build Optimizer ----------------------------
-        print("- Re-build optimizer -")
-        self.optimizer_dict['lr0'] *= self.args.batch_size / 16.  # auto lr scaling
-        self.optimizer, self.start_epoch = build_rtdetr_optimizer(self.optimizer_dict, model, self.args.resume)
-
-        # ---------------------------- Build LR Scheduler ----------------------------
-        print("- Re-build lr scheduler -")
-        self.wp_lr_scheduler = build_wp_lr_scheduler(self.warmup_dict, self.optimizer_dict['lr0'])
-        self.lr_scheduler    = build_lr_scheduler(self.lr_schedule_dict, self.optimizer, args.resume)
-
-    def train_one_epoch(self, model):
-        metric_logger = MetricLogger(delimiter="  ")
-        metric_logger.add_meter('lr', SmoothedValue(window_size=1, fmt='{value:.6f}'))
-        metric_logger.add_meter('size', SmoothedValue(window_size=1, fmt='{value:d}'))
-        metric_logger.add_meter('grad_norm', SmoothedValue(window_size=1, fmt='{value:.1f}'))
-        header = 'Epoch: [{} / {}]'.format(self.epoch, self.args.max_epoch)
-        epoch_size = len(self.train_loader)
-        print_freq = 10
-
-        # basic parameters
-        epoch_size = len(self.train_loader)
-        img_size = self.args.img_size
-        nw = self.warmup_dict['warmup_iters']
-        lr_warmup_stage = True
-
-        # Train one epoch
-        for iter_i, (images, targets) in enumerate(metric_logger.log_every(self.train_loader, print_freq, header)):
-            ni = iter_i + self.epoch * epoch_size
-            # WarmUp
-            if ni < nw and lr_warmup_stage:
-                self.wp_lr_scheduler(ni, self.optimizer)
-            elif ni == nw and lr_warmup_stage:
-                print('Warmup stage is over.')
-                lr_warmup_stage = False
-                self.wp_lr_scheduler.set_lr(self.optimizer, self.optimizer_dict['lr0'], self.optimizer_dict['lr0'])
-                                            
-            # To device
-            images = images.to(self.device, non_blocking=True).float()
-            for tgt in targets:
-                tgt['boxes'] = tgt['boxes'].to(self.device)
-                tgt['labels'] = tgt['labels'].to(self.device)
-
-            # Multi scale
-            if self.args.multi_scale:
-                images, targets, img_size = self.rescale_image_targets(
-                    images, targets, self.model_cfg['max_stride'], self.args.min_box_size, self.model_cfg['multi_scale'])
-            else:
-                targets = self.refine_targets(img_size, targets, self.args.min_box_size)
-
-            # xyxy -> cxcywh
-            targets = self.box_xyxy_to_cxcywh(targets)
-                
-            # Visualize train targets
-            if self.args.vis_tgt:
-                targets = self.box_cxcywh_to_xyxy(targets)
-                vis_data(images, targets, pixel_mean=self.trans_cfg['pixel_mean'], pixel_std=self.trans_cfg['pixel_std'])
-                targets = self.box_xyxy_to_cxcywh(targets)
-
-            # Inference
-            with torch.cuda.amp.autocast(enabled=self.args.fp16):
-                outputs = model(images)
-                # Compute loss
-                loss_dict = self.criterion(outputs, targets)
-                losses = sum(loss_dict.values())
-                # Grad Accumulate
-                if self.grad_accumulate > 1:
-                    losses /= self.grad_accumulate
-
-                loss_dict_reduced = distributed_utils.reduce_dict(loss_dict)
-
-            # Backward
-            self.scaler.scale(losses).backward()
-
-            # Optimize
-            if ni % self.grad_accumulate == 0:
-                grad_norm = None
-                if self.clip_grad > 0:
-                    # unscale gradients
-                    self.scaler.unscale_(self.optimizer)
-                    # clip gradients
-                    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=self.clip_grad)
-                # optimizer.step
-                self.scaler.step(self.optimizer)
-                self.scaler.update()
-                self.optimizer.zero_grad()
-                # ema
-                if self.model_ema is not None:
-                    self.model_ema.update(model)
-
-            # Update log
-            metric_logger.update(loss=losses.item(), **loss_dict_reduced)
-            metric_logger.update(lr=self.optimizer.param_groups[0]["lr"])
-            metric_logger.update(grad_norm=grad_norm)
-            metric_logger.update(size=img_size)
-
-            if self.args.debug:
-                print("For debug mode, we only train 1 iteration")
-                break
-
-        # LR Schedule
-        self.lr_scheduler.step()
-        
 
 # Build Trainer
 def build_trainer(args, data_cfg, model_cfg, trans_cfg, device, model, criterion, world_size):
@@ -1613,9 +1500,6 @@ def build_trainer(args, data_cfg, model_cfg, trans_cfg, device, model, criterion
         return RTCTrainer(args, data_cfg, model_cfg, trans_cfg, device, model, criterion, world_size)
     elif model_cfg['trainer_type'] == 'rtdetr':
         return RTDetrTrainer(args, data_cfg, model_cfg, trans_cfg, device, model, criterion, world_size)
-    elif model_cfg['trainer_type'] == 'rtpdetr':
-        return RTPDetrTrainer(args, data_cfg, model_cfg, trans_cfg, device, model, criterion, world_size)
-    
     else:
         raise NotImplementedError(model_cfg['trainer_type'])
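For reference, the auto LR scaling that the removed RTPDetrTrainer applied (the batch_size / 16 factor above) works out as in the minimal sketch below; the batch size is assumed, and the base learning rate of 1e-4 comes from the optimizer_dict above.

    # Editor's sketch of the removed "auto lr scaling" line.
    base_lr    = 1e-4                    # optimizer_dict['lr0']
    batch_size = 32                      # assumed effective batch size
    lr0 = base_lr * batch_size / 16.     # -> 2e-4; a batch size of 16 keeps the base LR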
     

+ 0 - 5
models/detectors/__init__.py

@@ -13,7 +13,6 @@ from .yolov8.build import build_yolov8
 from .yolox.build import build_yolox
 # Real-time DETR series
 from .rtdetr.build import build_rtdetr
-from .rtpdetr.build import build_rtpdetr
 
 
 # build object detector
@@ -67,10 +66,6 @@ def build_model(args,
     elif args.model in ['rtdetr_r18', 'rtdetr_r34', 'rtdetr_r50', 'rtdetr_r101']:
         model, criterion = build_rtdetr(
             args, model_cfg, num_classes, trainable, deploy)
-    # RT-PlainDETR
-    elif args.model in ['rtpdetr_r18', 'rtpdetr_r34', 'rtpdetr_r50', 'rtpdetr_r101']:
-        model, criterion = build_rtpdetr(
-            args, model_cfg, num_classes, trainable, deploy)
 
     if trainable:
         # Load pretrained weight

+ 0 - 145
models/detectors/rtpdetr/basic_modules/backbone.py

@@ -1,145 +0,0 @@
-import torch
-import torchvision
-from torch import nn
-from torchvision.models._utils import IntermediateLayerGetter
-
-try:
-    from .basic import FrozenBatchNorm2d
-except:
-    from basic  import FrozenBatchNorm2d
-   
-
-# IN1K MIM pretrained weights (from SparK: https://github.com/keyu-tian/SparK)
-pretrained_urls = {
-    # ResNet series
-    'resnet18':  None,
-    'resnet34':  None,
-    'resnet50':  "https://github.com/yjh0410/RT-ODLab/releases/download/backbone_weight/resnet50_in1k_spark_pretrained_timm_style.pth",
-    'resnet101': None,
-    # ShuffleNet series
-}
-
-
-# ----------------- Model functions -----------------
-## Build backbone network
-def build_backbone(cfg, pretrained=False):
-    print('==============================')
-    print('Backbone: {}'.format(cfg['backbone']))
-    # ResNet
-    if 'resnet' in cfg['backbone']:
-        model, feats = build_resnet(cfg, pretrained)
-    elif 'svnetv2' in cfg['backbone']:
-        pretrained_weight = cfg['pretrained_weight'] if pretrained else None
-        model, feats = build_scnetv2(cfg, pretrained_weight)
-    else:
-        raise NotImplementedError("Unknown backbone: {}.".format(cfg['backbone']))
-    
-    return model, feats
-
-
-# ----------------- ResNet Backbone -----------------
-class ResNet(nn.Module):
-    """ResNet backbone with frozen BatchNorm."""
-    def __init__(self,
-                 name: str,
-                 norm_type: str,
-                 pretrained: bool = False,
-                 freeze_at: int = -1,
-                 freeze_stem_only: bool = False):
-        super().__init__()
-        # Pretrained
-        # Norm layer
-        if norm_type == 'BN':
-            norm_layer = nn.BatchNorm2d
-        elif norm_type == 'FrozeBN':
-            norm_layer = FrozenBatchNorm2d
-        # Backbone
-        backbone = getattr(torchvision.models, name)(norm_layer=norm_layer,)
-        return_layers = {"layer2": "0", "layer3": "1", "layer4": "2"}
-        self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
-        self.feat_dims = [128, 256, 512] if name in ('resnet18', 'resnet34') else [512, 1024, 2048]
-        
-        # Load pretrained
-        if pretrained:
-            self.load_pretrained(name)
-
-        # Freeze
-        if freeze_at >= 0:
-            for name, parameter in backbone.named_parameters():
-                if freeze_stem_only:
-                    if 'layer1' not in name and 'layer2' not in name and 'layer3' not in name and 'layer4' not in name:
-                        parameter.requires_grad_(False)
-                else:
-                    if 'layer2' not in name and 'layer3' not in name and 'layer4' not in name:
-                        parameter.requires_grad_(False)
-
-    def load_pretrained(self, name):
-        url = pretrained_urls[name]
-        if url is not None:
-            print('Loading pretrained weight from : {}'.format(url))
-            # checkpoint state dict
-            checkpoint_state_dict = torch.hub.load_state_dict_from_url(
-                url=url, map_location="cpu", check_hash=True)
-            # model state dict
-            model_state_dict = self.body.state_dict()
-            # check
-            for k in list(checkpoint_state_dict.keys()):
-                if k in model_state_dict:
-                    shape_model = tuple(model_state_dict[k].shape)
-                    shape_checkpoint = tuple(checkpoint_state_dict[k].shape)
-                    if shape_model != shape_checkpoint:
-                        checkpoint_state_dict.pop(k)
-                else:
-                    checkpoint_state_dict.pop(k)
-                    print('Unused key: ', k)
-            # load the weight
-            self.body.load_state_dict(checkpoint_state_dict)
-        else:
-            print('No backbone pretrained for {}.'.format(name))
-
-    def forward(self, x):
-        xs = self.body(x)
-        fmp_list = []
-        for name, fmp in xs.items():
-            fmp_list.append(fmp)
-
-        return fmp_list
-
-def build_resnet(cfg, pretrained=False):
-    # ResNet series
-    backbone = ResNet(cfg['backbone'],
-                      cfg['backbone_norm'],
-                      pretrained,
-                      cfg['freeze_at'],
-                      cfg['freeze_stem_only'])
-
-    return backbone, backbone.feat_dims
-
-
-# ----------------- ShuffleNet Backbone -----------------
-## TODO: Add shufflenet-v2
-class ShuffleNetv2:
-    pass
-
-def build_scnetv2(cfg, pretrained_weight=None):
-    return
-
-
-if __name__ == '__main__':
-    cfg = {
-        'backbone': 'resnet50',
-        'backbone_norm': 'FrozeBN',
-        'pretrained': True,
-        'freeze_at': 0,
-        'freeze_stem_only': False,
-    }
-    model, feat_dim = build_backbone(cfg, cfg['pretrained'])
-    model.eval()
-    print(feat_dim)
-
-    x = torch.ones(2, 3, 320, 320)
-    output = model(x)
-    for y in output:
-        print(y.size())
-    print(output[-1])
-

+ 0 - 402
models/detectors/rtpdetr/basic_modules/basic.py

@@ -1,402 +0,0 @@
-import math
-import warnings
-import numpy as np
-import torch
-import torch.nn as nn
-
-
-def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
-    """Copy from timm"""
-    with torch.no_grad():
-        """Copy from timm"""
-        def norm_cdf(x):
-            return (1. + math.erf(x / math.sqrt(2.))) / 2.
-
-        if (mean < a - 2 * std) or (mean > b + 2 * std):
-            warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
-                        "The distribution of values may be incorrect.",
-                        stacklevel=2)
-
-        l = norm_cdf((a - mean) / std)
-        u = norm_cdf((b - mean) / std)
-
-        tensor.uniform_(2 * l - 1, 2 * u - 1)
-        tensor.erfinv_()
-
-        tensor.mul_(std * math.sqrt(2.))
-        tensor.add_(mean)
-
-        tensor.clamp_(min=a, max=b)
-
-        return tensor
-
-def box_xyxy_to_cxcywh(x):
-    x0, y0, x1, y1 = x.unbind(-1)
-    b = [(x0 + x1) / 2, (y0 + y1) / 2, (x1 - x0), (y1 - y0)]
-    
-    return torch.stack(b, dim=-1)
-
-def delta2bbox(proposals,
-               deltas,
-               max_shape=None,
-               wh_ratio_clip=16 / 1000,
-               clip_border=True,
-               add_ctr_clamp=False,
-               ctr_clamp=32):
-
-    dxy = deltas[..., :2]
-    dwh = deltas[..., 2:]
-
-    # Compute width/height of each roi
-    pxy = proposals[..., :2]
-    pwh = proposals[..., 2:]
-
-    dxy_wh = pwh * dxy
-    wh_ratio_clip = torch.as_tensor(wh_ratio_clip)
-    max_ratio = torch.abs(torch.log(wh_ratio_clip)).item()
-    
-    if add_ctr_clamp:
-        dxy_wh = torch.clamp(dxy_wh, max=ctr_clamp, min=-ctr_clamp)
-        dwh = torch.clamp(dwh, max=max_ratio)
-    else:
-        dwh = dwh.clamp(min=-max_ratio, max=max_ratio)
-
-    gxy = pxy + dxy_wh
-    gwh = pwh * dwh.exp()
-    x1y1 = gxy - (gwh * 0.5)
-    x2y2 = gxy + (gwh * 0.5)
-    bboxes = torch.cat([x1y1, x2y2], dim=-1)
-    if clip_border and max_shape is not None:
-        bboxes[..., 0::2].clamp_(min=0).clamp_(max=max_shape[1])
-        bboxes[..., 1::2].clamp_(min=0).clamp_(max=max_shape[0])
-
-    return bboxes
-
-
-# ---------------------------- NMS ----------------------------
-## basic NMS
-def nms(bboxes, scores, nms_thresh):
-    """Pure Python NMS."""
-    x1 = bboxes[:, 0]  #xmin
-    y1 = bboxes[:, 1]  #ymin
-    x2 = bboxes[:, 2]  #xmax
-    y2 = bboxes[:, 3]  #ymax
-
-    areas = (x2 - x1) * (y2 - y1)
-    order = scores.argsort()[::-1]
-
-    keep = []
-    while order.size > 0:
-        i = order[0]
-        keep.append(i)
-        # compute iou
-        xx1 = np.maximum(x1[i], x1[order[1:]])
-        yy1 = np.maximum(y1[i], y1[order[1:]])
-        xx2 = np.minimum(x2[i], x2[order[1:]])
-        yy2 = np.minimum(y2[i], y2[order[1:]])
-
-        w = np.maximum(1e-10, xx2 - xx1)
-        h = np.maximum(1e-10, yy2 - yy1)
-        inter = w * h
-
-        iou = inter / (areas[i] + areas[order[1:]] - inter + 1e-14)
-        # keep only the boxes whose IoU with box i is below the threshold
-        inds = np.where(iou <= nms_thresh)[0]
-        order = order[inds + 1]
-
-    return keep
-
-## class-agnostic NMS 
-def multiclass_nms_class_agnostic(scores, labels, bboxes, nms_thresh):
-    # nms
-    keep = nms(bboxes, scores, nms_thresh)
-    scores = scores[keep]
-    labels = labels[keep]
-    bboxes = bboxes[keep]
-
-    return scores, labels, bboxes
-
-## class-aware NMS 
-def multiclass_nms_class_aware(scores, labels, bboxes, nms_thresh, num_classes):
-    # nms
-    keep = np.zeros(len(bboxes), dtype=np.int32)
-    for i in range(num_classes):
-        inds = np.where(labels == i)[0]
-        if len(inds) == 0:
-            continue
-        c_bboxes = bboxes[inds]
-        c_scores = scores[inds]
-        c_keep = nms(c_bboxes, c_scores, nms_thresh)
-        keep[inds[c_keep]] = 1
-    keep = np.where(keep > 0)
-    scores = scores[keep]
-    labels = labels[keep]
-    bboxes = bboxes[keep]
-
-    return scores, labels, bboxes
-
-## multi-class NMS 
-def multiclass_nms(scores, labels, bboxes, nms_thresh, num_classes, class_agnostic=False):
-    if class_agnostic:
-        return multiclass_nms_class_agnostic(scores, labels, bboxes, nms_thresh)
-    else:
-        return multiclass_nms_class_aware(scores, labels, bboxes, nms_thresh, num_classes)
-
-
-# ----------------- Custom NormLayer Ops -----------------
-class FrozenBatchNorm2d(torch.nn.Module):
-    def __init__(self, n):
-        super(FrozenBatchNorm2d, self).__init__()
-        self.register_buffer("weight", torch.ones(n))
-        self.register_buffer("bias", torch.zeros(n))
-        self.register_buffer("running_mean", torch.zeros(n))
-        self.register_buffer("running_var", torch.ones(n))
-
-    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
-                              missing_keys, unexpected_keys, error_msgs):
-        num_batches_tracked_key = prefix + 'num_batches_tracked'
-        if num_batches_tracked_key in state_dict:
-            del state_dict[num_batches_tracked_key]
-
-        super(FrozenBatchNorm2d, self)._load_from_state_dict(
-            state_dict, prefix, local_metadata, strict,
-            missing_keys, unexpected_keys, error_msgs)
-
-    def forward(self, x):
-        # move reshapes to the beginning
-        # to make it fuser-friendly
-        w = self.weight.reshape(1, -1, 1, 1)
-        b = self.bias.reshape(1, -1, 1, 1)
-        rv = self.running_var.reshape(1, -1, 1, 1)
-        rm = self.running_mean.reshape(1, -1, 1, 1)
-        eps = 1e-5
-        scale = w * (rv + eps).rsqrt()
-        bias = b - rm * scale
-        return x * scale + bias
-
-class LayerNorm2D(nn.Module):
-    def __init__(self, normalized_shape, norm_layer=nn.LayerNorm):
-        super().__init__()
-        self.ln = norm_layer(normalized_shape) if norm_layer is not None else nn.Identity()
-
-    def forward(self, x):
-        """
-        x: N C H W
-        """
-        x = x.permute(0, 2, 3, 1)
-        x = self.ln(x)
-        x = x.permute(0, 3, 1, 2)
-        return x
-
-
-# ----------------- Basic CNN Ops -----------------
-def get_conv2d(c1, c2, k, p, s, g, bias=False):
-    conv = nn.Conv2d(c1, c2, k, stride=s, padding=p, groups=g, bias=bias)
-
-    return conv
-
-def get_activation(act_type=None):
-    if act_type == 'relu':
-        return nn.ReLU(inplace=True)
-    elif act_type == 'lrelu':
-        return nn.LeakyReLU(0.1, inplace=True)
-    elif act_type == 'mish':
-        return nn.Mish(inplace=True)
-    elif act_type == 'silu':
-        return nn.SiLU(inplace=True)
-    elif act_type == 'gelu':
-        return nn.GELU()
-    elif act_type is None:
-        return nn.Identity()
-    else:
-        raise NotImplementedError
-        
-def get_norm(norm_type, dim):
-    if norm_type == 'BN':
-        return nn.BatchNorm2d(dim)
-    elif norm_type == 'GN':
-        return nn.GroupNorm(num_groups=32, num_channels=dim)
-    elif norm_type is None:
-        return nn.Identity()
-    else:
-        raise NotImplementedError
-
-class BasicConv(nn.Module):
-    def __init__(self, 
-                 in_dim,                   # in channels
-                 out_dim,                  # out channels 
-                 kernel_size=1,            # kernel size 
-                 padding=0,                # padding
-                 stride=1,                 # stride
-                 act_type  :str = 'lrelu', # activation
-                 norm_type :str = 'BN',    # normalization
-                ):
-        super(BasicConv, self).__init__()
-        add_bias = False if norm_type else True
-        self.conv = get_conv2d(in_dim, out_dim, k=kernel_size, p=padding, s=stride, g=1, bias=add_bias)
-        self.norm = get_norm(norm_type, out_dim)
-        self.act  = get_activation(act_type)
-
-    def forward(self, x):
-        return self.act(self.norm(self.conv(x)))
-
-class UpSampleWrapper(nn.Module):
-    """Upsample last feat map to specific stride."""
-    def __init__(self, in_dim, upsample_factor):
-        super(UpSampleWrapper, self).__init__()
-        # ---------- Basic parameters ----------
-        self.upsample_factor = upsample_factor
-
-        # ---------- Network parameters ----------
-        if upsample_factor == 1:
-            self.upsample = nn.Identity()
-        else:
-            scale = int(math.log2(upsample_factor))
-            dim = in_dim
-            layers = []
-            for _ in range(scale-1):
-                layers += [
-                    nn.ConvTranspose2d(dim, dim, kernel_size=2, stride=2),
-                    LayerNorm2D(dim),
-                    nn.GELU()
-                ]
-            layers += [nn.ConvTranspose2d(dim, dim, kernel_size=2, stride=2)]
-            self.upsample = nn.Sequential(*layers)
-            self.out_dim = dim
-
-    def forward(self, x):
-        x = self.upsample(x)
-
-        return x
-
-
-# ----------------- MLP modules -----------------
-class MLP(nn.Module):
-    def __init__(self, in_dim, hidden_dim, out_dim, num_layers):
-        super().__init__()
-        self.num_layers = num_layers
-        h = [hidden_dim] * (num_layers - 1)
-        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([in_dim] + h, h + [out_dim]))
-
-    def forward(self, x):
-        for i, layer in enumerate(self.layers):
-            x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
-        return x
-
-class FFN(nn.Module):
-    def __init__(self, d_model=256, ffn_dim=1024, dropout=0., act_type='relu', pre_norm=False):
-        super().__init__()
-        # ----------- Basic parameters -----------
-        self.pre_norm = pre_norm
-        self.ffn_dim = ffn_dim
-        # ----------- Network parameters -----------
-        self.linear1 = nn.Linear(d_model, self.ffn_dim)
-        self.activation = get_activation(act_type)
-        self.dropout2 = nn.Dropout(dropout)
-        self.linear2 = nn.Linear(self.ffn_dim, d_model)
-        self.dropout3 = nn.Dropout(dropout)
-        self.norm = nn.LayerNorm(d_model)
-
-    def forward(self, src):
-        if self.pre_norm:
-            src = self.norm(src)
-            src2 = self.linear2(self.dropout2(self.activation(self.linear1(src))))
-            src = src + self.dropout3(src2)
-        else:
-            src2 = self.linear2(self.dropout2(self.activation(self.linear1(src))))
-            src = src + self.dropout3(src2)
-            src = self.norm(src)
-        
-        return src
-    
-
-# ----------------- Attention Ops -----------------
-class GlobalCrossAttention(nn.Module):
-    def __init__(
-        self,
-        dim            :int   = 256,
-        num_heads      :int   = 8,
-        qkv_bias       :bool  = True,
-        qk_scale       :float = None,
-        attn_drop      :float = 0.0,
-        proj_drop      :float = 0.0,
-        rpe_hidden_dim :int   = 512,
-        feature_stride :int   = 16,
-    ):
-        super().__init__()
-        # --------- Basic parameters ---------
-        self.dim = dim
-        self.num_heads = num_heads
-        head_dim = dim // num_heads
-        self.scale = qk_scale or head_dim ** -0.5
-        self.feature_stride = feature_stride
-
-        # --------- Network parameters ---------
-        self.cpb_mlp1 = self.build_cpb_mlp(2, rpe_hidden_dim, num_heads)
-        self.cpb_mlp2 = self.build_cpb_mlp(2, rpe_hidden_dim, num_heads)
-        self.q = nn.Linear(dim, dim, bias=qkv_bias)
-        self.k = nn.Linear(dim, dim, bias=qkv_bias)
-        self.v = nn.Linear(dim, dim, bias=qkv_bias)
-        self.attn_drop = nn.Dropout(attn_drop)
-        self.proj = nn.Linear(dim, dim)
-        self.proj_drop = nn.Dropout(proj_drop)
-        self.softmax = nn.Softmax(dim=-1)
-
-    def build_cpb_mlp(self, in_dim, hidden_dim, out_dim):
-        cpb_mlp = nn.Sequential(nn.Linear(in_dim, hidden_dim, bias=True),
-                                nn.ReLU(inplace=True),
-                                nn.Linear(hidden_dim, out_dim, bias=False))
-        return cpb_mlp
-
-    def forward(
-        self,
-        query,
-        reference_points,
-        k_input_flatten,
-        v_input_flatten,
-        input_spatial_shapes,
-        input_padding_mask=None,
-    ):
-        assert input_spatial_shapes.size(0) == 1, 'This is designed for single-scale decoder.'
-        h, w = input_spatial_shapes[0]
-        stride = self.feature_stride
-
-        ref_pts = torch.cat([
-            reference_points[:, :, :, :2] - reference_points[:, :, :, 2:] / 2,
-            reference_points[:, :, :, :2] + reference_points[:, :, :, 2:] / 2,
-        ], dim=-1)  # B, nQ, 1, 4
-
-        pos_x = torch.linspace(0.5, w - 0.5, w, dtype=torch.float32, device=w.device)[None, None, :, None] * stride  # 1, 1, w, 1
-        pos_y = torch.linspace(0.5, h - 0.5, h, dtype=torch.float32, device=h.device)[None, None, :, None] * stride  # 1, 1, h, 1
-
-        delta_x = ref_pts[..., 0::2] - pos_x  # B, nQ, w, 2
-        delta_y = ref_pts[..., 1::2] - pos_y  # B, nQ, h, 2
-
-        rpe_x, rpe_y = self.cpb_mlp1(delta_x), self.cpb_mlp2(delta_y)  # B, nQ, w/h, nheads
-        rpe = (rpe_x[:, :, None] + rpe_y[:, :, :, None]).flatten(2, 3) # B, nQ, h, w, nheads ->  B, nQ, h*w, nheads
-        rpe = rpe.permute(0, 3, 1, 2)
-
-        B_, N, C = k_input_flatten.shape
-        k = self.k(k_input_flatten).reshape(B_, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
-        v = self.v(v_input_flatten).reshape(B_, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
-        B_, N, C = query.shape
-        q = self.q(query).reshape(B_, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
-        q = q * self.scale
-
-        attn = q @ k.transpose(-2, -1)
-        attn += rpe
-        if input_padding_mask is not None:
-            attn += input_padding_mask[:, None, None] * -100
-
-        fmin, fmax = torch.finfo(attn.dtype).min, torch.finfo(attn.dtype).max
-        torch.clip_(attn, min=fmin, max=fmax)
-
-        attn = self.softmax(attn)
-        attn = self.attn_drop(attn)
-        x = attn @ v
-
-        x = x.transpose(1, 2).reshape(B_, N, C)
-        x = self.proj(x)
-        x = self.proj_drop(x)
-
-        return x
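For reference, a minimal, self-contained sketch of the box decoding that the removed delta2bbox performed for a single proposal (clamping omitted); the proposal and delta values are arbitrary.

    import torch

    # Proposal in (cx, cy, w, h) and a predicted delta (dx, dy, dw, dh); dw/dh are log-space scales.
    proposal = torch.tensor([[100.0, 100.0, 40.0, 20.0]])
    delta    = torch.tensor([[0.5, -0.5, 0.0, 0.6931]])

    cxcy = proposal[..., :2] + proposal[..., 2:] * delta[..., :2]   # center: (120, 90)
    wh   = proposal[..., 2:] * delta[..., 2:].exp()                 # size:   (40, ~40)
    box  = torch.cat([cxcy - wh * 0.5, cxcy + wh * 0.5], dim=-1)    # xyxy:  ~(100, 70, 140, 110)
    print(box)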

+ 0 - 447
models/detectors/rtpdetr/basic_modules/transformer.py

@@ -1,447 +0,0 @@
-import math
-import copy
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import torch.utils.checkpoint as checkpoint
-
-try:
-    from .basic import FFN, GlobalCrossAttention
-    from .basic import trunc_normal_
-except:
-    from  basic import FFN, GlobalCrossAttention
-    from  basic import trunc_normal_
-
-
-def get_clones(module, N):
-    if N <= 0:
-        return None
-    else:
-        return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
-
-def inverse_sigmoid(x, eps=1e-5):
-    x = x.clamp(min=0., max=1.)
-    return torch.log(x.clamp(min=eps) / (1 - x).clamp(min=eps))
-
-
-# ----------------- Transformer modules -----------------
-## Transformer Encoder layer
-class TransformerEncoderLayer(nn.Module):
-    def __init__(self,
-                 d_model         :int   = 256,
-                 num_heads       :int   = 8,
-                 ffn_dim         :int = 1024,
-                 dropout         :float = 0.1,
-                 act_type        :str   = "relu",
-                 ):
-        super().__init__()
-        # ----------- Basic parameters -----------
-        self.d_model = d_model
-        self.num_heads = num_heads
-        self.ffn_dim = ffn_dim
-        self.dropout = dropout
-        self.act_type = act_type
-        # ----------- Network parameters -----------
-        # Multi-head Self-Attn
-        self.self_attn = nn.MultiheadAttention(d_model, num_heads, dropout=dropout, batch_first=True)
-        self.dropout = nn.Dropout(dropout)
-        self.norm = nn.LayerNorm(d_model)
-
-        # Feedforward Network
-        self.ffn = FFN(d_model, ffn_dim, dropout, act_type)
-
-    def with_pos_embed(self, tensor, pos):
-        return tensor if pos is None else tensor + pos
-
-
-    def forward(self, src, pos_embed):
-        """
-        Input:
-            src:       [torch.Tensor] -> [B, N, C]
-            pos_embed: [torch.Tensor] -> [B, N, C]
-        Output:
-            src:       [torch.Tensor] -> [B, N, C]
-        """
-        q = k = self.with_pos_embed(src, pos_embed)
-
-        # -------------- MHSA --------------
-        src2 = self.self_attn(q, k, value=src)[0]
-        src = src + self.dropout(src2)
-        src = self.norm(src)
-
-        # -------------- FFN --------------
-        src = self.ffn(src)
-        
-        return src
-
-## Transformer Encoder
-class TransformerEncoder(nn.Module):
-    def __init__(self,
-                 d_model        :int   = 256,
-                 num_heads      :int   = 8,
-                 num_layers     :int   = 1,
-                 ffn_dim        :int = 1024,
-                 pe_temperature : float = 10000.,
-                 dropout        :float = 0.1,
-                 act_type       :str   = "relu",
-                 ):
-        super().__init__()
-        # ----------- Basic parameters -----------
-        self.d_model = d_model
-        self.num_heads = num_heads
-        self.num_layers = num_layers
-        self.ffn_dim = ffn_dim
-        self.dropout = dropout
-        self.act_type = act_type
-        self.pe_temperature = pe_temperature
-        self.pos_embed = None
-        # ----------- Network parameters -----------
-        self.encoder_layers = get_clones(
-            TransformerEncoderLayer(d_model, num_heads, ffn_dim, dropout, act_type), num_layers)
-
-    def build_2d_sincos_position_embedding(self, device, w, h, embed_dim=256, temperature=10000.):
-        assert embed_dim % 4 == 0, \
-            'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
-        
-        # ----------- Check cached pos_embed -----------
-        if self.pos_embed is not None and \
-            self.pos_embed.shape[2:] == [h, w]:
-            return self.pos_embed
-        
-        # ----------- Generate grid coords -----------
-        grid_w = torch.arange(int(w), dtype=torch.float32)
-        grid_h = torch.arange(int(h), dtype=torch.float32)
-        grid_w, grid_h = torch.meshgrid([grid_w, grid_h])  # shape: [H, W]
-
-        pos_dim = embed_dim // 4
-        omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
-        omega = 1. / (temperature**omega)
-
-        out_w = grid_w.flatten()[..., None] @ omega[None] # shape: [N, C]
-        out_h = grid_h.flatten()[..., None] @ omega[None] # shape: [N, C]
-
-        # shape: [1, N, C]
-        pos_embed = torch.cat([torch.sin(out_w), torch.cos(out_w), torch.sin(out_h),torch.cos(out_h)], dim=1)[None, :, :]
-        pos_embed = pos_embed.to(device)
-        self.pos_embed = pos_embed
-
-        return pos_embed
-
-    def forward(self, src):
-        """
-        Input:
-            src:  [torch.Tensor] -> [B, C, H, W]
-        Output:
-            src:  [torch.Tensor] -> [B, C, H, W]
-        """
-        # -------- Transformer encoder --------
-        channels, fmp_h, fmp_w = src.shape[1:]
-        # [B, C, H, W] -> [B, N, C], N=HxW
-        src_flatten = src.flatten(2).permute(0, 2, 1)
-        memory = src_flatten
-
-        # PosEmbed: [1, N, C]
-        pos_embed = self.build_2d_sincos_position_embedding(
-            src.device, fmp_w, fmp_h, channels, self.pe_temperature)
-        
-        # Transformer Encoder layer
-        for encoder in self.encoder_layers:
-            memory = encoder(memory, pos_embed=pos_embed)
-
-        # Output: [B, N, C] -> [B, C, N] -> [B, C, H, W]
-        src = memory.permute(0, 2, 1).reshape([-1, channels, fmp_h, fmp_w])
-
-        return src
-
-## PlainDETR's Decoder layer
-class GlobalDecoderLayer(nn.Module):
-    def __init__(self,
-                 d_model    :int   = 256,
-                 num_heads  :int   = 8,
-                 ffn_dim    :int = 1024,
-                 dropout    :float = 0.1,
-                 act_type   :str   = "relu",
-                 pre_norm   :bool  = False,
-                 rpe_hidden_dim :int = 512,
-                 feature_stride :int = 16,
-                 ) -> None:
-        super().__init__()
-        # ------------ Basic parameters ------------
-        self.d_model = d_model
-        self.num_heads = num_heads
-        self.rpe_hidden_dim = rpe_hidden_dim
-        self.ffn_dim = ffn_dim
-        self.act_type = act_type
-        self.pre_norm = pre_norm
-
-        # ------------ Network parameters ------------
-        ## Multi-head Self-Attn
-        self.self_attn = nn.MultiheadAttention(d_model, num_heads, dropout=dropout)
-        self.dropout1 = nn.Dropout(dropout)
-        self.norm1 = nn.LayerNorm(d_model)
-
-        ## Box-reparam Global Cross-Attn
-        self.cross_attn = GlobalCrossAttention(d_model, num_heads, rpe_hidden_dim=rpe_hidden_dim, feature_stride=feature_stride)
-        self.dropout2 = nn.Dropout(dropout)
-        self.norm2 = nn.LayerNorm(d_model)
-
-        ## FFN
-        self.ffn = FFN(d_model, ffn_dim, dropout, act_type, pre_norm)
-
-    @staticmethod
-    def with_pos_embed(tensor, pos):
-        return tensor if pos is None else tensor + pos
-
-    def forward_pre_norm(self,
-                         tgt,
-                         query_pos,
-                         reference_points,
-                         src,
-                         src_pos_embed,
-                         src_spatial_shapes,
-                         src_padding_mask=None,
-                         self_attn_mask=None,
-                         ):
-        # ----------- Multi-head self attention -----------
-        tgt1 = self.norm1(tgt)
-        q = k = self.with_pos_embed(tgt1, query_pos)
-        tgt1 = self.self_attn(q.transpose(0, 1),        # [B, N, C] -> [N, B, C], batch_first = False
-                              k.transpose(0, 1),        # [B, N, C] -> [N, B, C], batch_first = False
-                              tgt1.transpose(0, 1),     # [B, N, C] -> [N, B, C], batch_first = False
-                              attn_mask=self_attn_mask,
-                              )[0].transpose(0, 1)      # [N, B, C] -> [B, N, C]
-        tgt = tgt + self.dropout1(tgt1)
-
-        # ----------- Global cross attention -----------
-        tgt1 = self.norm2(tgt)
-        tgt1 = self.cross_attn(self.with_pos_embed(tgt1, query_pos),
-                               reference_points,
-                               self.with_pos_embed(src, src_pos_embed),
-                               src,
-                               src_spatial_shapes,
-                               src_padding_mask,
-                               )
-        tgt = tgt + self.dropout2(tgt1)
-
-        # ----------- FeedForward Network -----------
-        tgt = self.ffn(tgt)
-
-        return tgt
-
-    def forward_post_norm(self,
-                          tgt,
-                          query_pos,
-                          reference_points,
-                          src,
-                          src_pos_embed,
-                          src_spatial_shapes,
-                          src_padding_mask=None,
-                          self_attn_mask=None,
-                          ):
-        # ----------- Multi-head self attention -----------
-        q = k = self.with_pos_embed(tgt, query_pos)
-        tgt1 = self.self_attn(q.transpose(0, 1),        # [B, N, C] -> [N, B, C], batch_first = False
-                              k.transpose(0, 1),        # [B, N, C] -> [N, B, C], batch_first = False
-                              tgt.transpose(0, 1),     # [B, N, C] -> [N, B, C], batch_first = False
-                              attn_mask=self_attn_mask,
-                              )[0].transpose(0, 1)      # [N, B, C] -> [B, N, C]
-        tgt = tgt + self.dropout1(tgt1)
-        tgt = self.norm1(tgt)
-
-        # ----------- Global cross attention -----------
-        tgt1 = self.cross_attn(self.with_pos_embed(tgt, query_pos),
-                               reference_points,
-                               self.with_pos_embed(src, src_pos_embed),
-                               src,
-                               src_spatial_shapes,
-                               src_padding_mask,
-                               )
-        tgt = tgt + self.dropout2(tgt1)
-        tgt = self.norm2(tgt)
-
-        # ----------- FeedForward Network -----------
-        tgt = self.ffn(tgt)
-
-        return tgt
-
-    def forward(self,
-                tgt,
-                query_pos,
-                reference_points,
-                src,
-                src_pos_embed,
-                src_spatial_shapes,
-                src_padding_mask=None,
-                self_attn_mask=None,
-                ):
-        if self.pre_norm:
-            return self.forward_pre_norm(tgt, query_pos, reference_points, src, src_pos_embed, src_spatial_shapes,
-                                         src_padding_mask, self_attn_mask)
-        else:
-            return self.forward_post_norm(tgt, query_pos, reference_points, src, src_pos_embed, src_spatial_shapes,
-                                          src_padding_mask, self_attn_mask)
-
-## PlainDETR's Decoder
-class GlobalDecoder(nn.Module):
-    def __init__(self,
-                 # Decoder layer params
-                 d_model    :int   = 256,
-                 num_heads  :int   = 8,
-                 ffn_dim    :int = 1024,
-                 dropout    :float = 0.1,
-                 act_type   :str   = "relu",
-                 pre_norm   :bool  = False,
-                 rpe_hidden_dim :int = 512,
-                 feature_stride :int = 16,
-                 num_layers     :int = 6,
-                 # Decoder params
-                 return_intermediate :bool = False,
-                 use_checkpoint      :bool = False,
-                 ):
-        super().__init__()
-        # ------------ Basic parameters ------------
-        self.d_model = d_model
-        self.num_heads = num_heads
-        self.rpe_hidden_dim = rpe_hidden_dim
-        self.ffn_dim = ffn_dim
-        self.act_type = act_type
-        self.num_layers = num_layers
-        self.return_intermediate = return_intermediate
-        self.use_checkpoint = use_checkpoint
-
-        # ------------ Network parameters ------------
-        decoder_layer = GlobalDecoderLayer(
-            d_model, num_heads, ffn_dim, dropout, act_type, pre_norm, rpe_hidden_dim, feature_stride,)
-        self.layers = get_clones(decoder_layer, num_layers)
-        self.bbox_embed = None
-        self.class_embed = None
-
-        if pre_norm:
-            self.final_layer_norm = nn.LayerNorm(d_model)
-        else:
-            self.final_layer_norm = None
-
-    def _reset_parameters(self):            
-        # stolen from Swin Transformer
-        def _init_weights(m):
-            if isinstance(m, nn.Linear):
-                trunc_normal_(m.weight, std=0.02)
-                if isinstance(m, nn.Linear) and m.bias is not None:
-                    nn.init.constant_(m.bias, 0)
-            elif isinstance(m, nn.LayerNorm):
-                nn.init.constant_(m.bias, 0)
-                nn.init.constant_(m.weight, 1.0)
-
-        self.apply(_init_weights)
-
-    def inverse_sigmoid(self, x, eps=1e-5):
-        x = x.clamp(min=0, max=1)
-        x1 = x.clamp(min=eps)
-        x2 = (1 - x).clamp(min=eps)
-
-        return torch.log(x1 / x2)
-
-    def box_xyxy_to_cxcywh(self, x):
-        x0, y0, x1, y1 = x.unbind(-1)
-        b = [(x0 + x1) / 2, (y0 + y1) / 2, (x1 - x0), (y1 - y0)]
-        
-        return torch.stack(b, dim=-1)
-
-    def delta2bbox(self, proposals,
-                   deltas,
-                   max_shape=None,
-                   wh_ratio_clip=16 / 1000,
-                   clip_border=True,
-                   add_ctr_clamp=False,
-                   ctr_clamp=32):
-
-        dxy = deltas[..., :2]
-        dwh = deltas[..., 2:]
-
-        # Compute width/height of each roi
-        pxy = proposals[..., :2]
-        pwh = proposals[..., 2:]
-
-        dxy_wh = pwh * dxy
-        wh_ratio_clip = torch.as_tensor(wh_ratio_clip)
-        max_ratio = torch.abs(torch.log(wh_ratio_clip)).item()
-        
-        if add_ctr_clamp:
-            dxy_wh = torch.clamp(dxy_wh, max=ctr_clamp, min=-ctr_clamp)
-            dwh = torch.clamp(dwh, max=max_ratio)
-        else:
-            dwh = dwh.clamp(min=-max_ratio, max=max_ratio)
-
-        gxy = pxy + dxy_wh
-        gwh = pwh * dwh.exp()
-        x1y1 = gxy - (gwh * 0.5)
-        x2y2 = gxy + (gwh * 0.5)
-        bboxes = torch.cat([x1y1, x2y2], dim=-1)
-        if clip_border and max_shape is not None:
-            bboxes[..., 0::2].clamp_(min=0).clamp_(max=max_shape[1])
-            bboxes[..., 1::2].clamp_(min=0).clamp_(max=max_shape[0])
-
-        return bboxes
-
-    def forward(self,
-                tgt,
-                reference_points,
-                src,
-                src_pos_embed,
-                src_spatial_shapes,
-                query_pos=None,
-                src_padding_mask=None,
-                self_attn_mask=None,
-                max_shape=None,
-                ):
-        output = tgt
-
-        intermediate = []
-        intermediate_reference_points = []
-        for lid, layer in enumerate(self.layers):
-            reference_points_input = reference_points[:, :, None]
-            if self.use_checkpoint:
-                output = checkpoint.checkpoint(
-                    layer,
-                    output,
-                    query_pos,
-                    reference_points_input,
-                    src,
-                    src_pos_embed,
-                    src_spatial_shapes,
-                    src_padding_mask,
-                    self_attn_mask,
-                )
-            else:
-                output = layer(
-                    output,
-                    query_pos,
-                    reference_points_input,
-                    src,
-                    src_pos_embed,
-                    src_spatial_shapes,
-                    src_padding_mask,
-                    self_attn_mask,
-                )
-
-            if self.final_layer_norm is not None:
-                output_after_norm = self.final_layer_norm(output)
-            else:
-                output_after_norm = output
-
-            # hack implementation for iterative bounding box refinement
-            if self.bbox_embed is not None:
-                tmp = self.bbox_embed[lid](output_after_norm)
-                new_reference_points = self.box_xyxy_to_cxcywh(
-                    self.delta2bbox(reference_points, tmp, max_shape)) 
-                reference_points = new_reference_points.detach()
-
-            if self.return_intermediate:
-                intermediate.append(output_after_norm)
-                intermediate_reference_points.append(new_reference_points)
-
-        if self.return_intermediate:
-            return torch.stack(intermediate), torch.stack(intermediate_reference_points)
-
-        return output_after_norm, reference_points
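For reference, a minimal sketch of the 2D sin-cos position embedding scheme used by the removed TransformerEncoder, on a small assumed feature map; it only reproduces the shape logic, not the cached-embedding handling.

    import torch

    # 4x4 feature map; embed_dim must be divisible by 4.
    w, h, embed_dim, temperature = 4, 4, 256, 10000.
    grid_w, grid_h = torch.meshgrid(torch.arange(w, dtype=torch.float32),
                                    torch.arange(h, dtype=torch.float32))
    pos_dim = embed_dim // 4
    omega = 1. / temperature ** (torch.arange(pos_dim, dtype=torch.float32) / pos_dim)
    out_w = grid_w.flatten()[:, None] * omega[None]    # [N, pos_dim], N = h * w
    out_h = grid_h.flatten()[:, None] * omega[None]
    pos_embed = torch.cat([out_w.sin(), out_w.cos(), out_h.sin(), out_h.cos()], dim=1)[None]
    print(pos_embed.shape)                             # torch.Size([1, 16, 256])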

+ 0 - 36
models/detectors/rtpdetr/build.py

@@ -1,36 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding:utf-8 -*-
-
-import torch
-import torch.nn as nn
-
-from .loss import build_criterion
-from .rtpdetr import RT_PDETR
-
-
-# build object detector
-def build_rtpdetr(args, cfg, num_classes=80, trainable=False, deploy=False):
-    print('==============================')
-    print('Build {} ...'.format(args.model.upper()))
-    
-    print('==============================')
-    print('Model Configuration: \n', cfg)
-    
-    # -------------- Build RT-DETR --------------
-    model = RT_PDETR(cfg             = cfg,
-                     num_classes     = num_classes,
-                     conf_thresh     = args.conf_thresh,
-                     topk            = 300,
-                     deploy          = deploy,
-                     no_multi_labels = args.no_multi_labels,
-                     use_nms         = True,   # NMS is beneficial 
-                     nms_class_agnostic = args.nms_class_agnostic
-                     )
-            
-    # -------------- Build criterion --------------
-    criterion = None
-    if trainable:
-        # build criterion for training
-        criterion = build_criterion(cfg, num_classes, aux_loss=True)
-        
-    return model, criterion

+ 0 - 214
models/detectors/rtpdetr/loss.py

@@ -1,214 +0,0 @@
-import copy
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-try:
-    from .loss_utils import sigmoid_focal_loss
-    from .loss_utils import box_cxcywh_to_xyxy, box_xyxy_to_cxcywh, generalized_box_iou, bbox2delta
-    from .loss_utils import is_dist_avail_and_initialized, get_world_size
-    from .matcher import HungarianMatcher
-except:
-    from loss_utils import sigmoid_focal_loss
-    from loss_utils import box_cxcywh_to_xyxy, box_xyxy_to_cxcywh, generalized_box_iou, bbox2delta
-    from loss_utils import is_dist_avail_and_initialized, get_world_size
-    from matcher import HungarianMatcher
-
-
-class Criterion(nn.Module):
-    """ This class computes the loss for DETR.
-    The process happens in two steps:
-        1) we compute hungarian assignment between ground truth boxes and the outputs of the model
-        2) we supervise each pair of matched ground-truth / prediction (supervise class and box)
-    """
-    def __init__(self, cfg, num_classes=80, aux_loss=False):
-        super().__init__()
-        # ------------ Basic parameters ------------
-        self.cfg = cfg
-        self.num_classes = num_classes
-        self.k_one2many = cfg['k_one2many']
-        self.lambda_one2many = cfg['lambda_one2many']
-        self.aux_loss = aux_loss
-        self.losses = ['labels', 'boxes']
-        # ------------- Focal loss -------------
-        self.alpha = 0.25
-        self.gamma = 2.0
-        # ------------ Matcher ------------
-        self.matcher = HungarianMatcher(cost_class = cfg['matcher_hpy']['cost_class'],
-                                        cost_bbox  = cfg['matcher_hpy']['cost_bbox'],
-                                        cost_giou  = cfg['matcher_hpy']['cost_giou']
-                                        )
-        # ------------- Loss weight -------------
-        self.weight_dict = {'loss_cls':  cfg['loss_coeff']['class'],
-                            'loss_box':  cfg['loss_coeff']['bbox'],
-                            'loss_giou': cfg['loss_coeff']['giou']}
-
-    def _get_src_permutation_idx(self, indices):
-        # permute predictions following indices
-        batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
-        src_idx = torch.cat([src for (src, _) in indices])
-        return batch_idx, src_idx
-
-    def _get_tgt_permutation_idx(self, indices):
-        # permute targets following indices
-        batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
-        tgt_idx = torch.cat([tgt for (_, tgt) in indices])
-        return batch_idx, tgt_idx
-
-    def loss_labels(self, outputs, targets, indices, num_boxes):
-        """Classification loss (sigmoid focal loss)
-        targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes]
-        """
-        assert 'pred_logits' in outputs
-        src_logits = outputs['pred_logits']
-        # prepare class targets
-        idx = self._get_src_permutation_idx(indices)
-        target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]).to(src_logits.device)
-        target_classes = torch.full(src_logits.shape[:2],
-                                    self.num_classes,
-                                    dtype=torch.int64,
-                                    device=src_logits.device)
-        target_classes[idx] = target_classes_o
-
-        # to one-hot labels
-        target_classes_onehot = torch.zeros([*src_logits.shape[:2], self.num_classes + 1],
-                                            dtype=src_logits.dtype,
-                                            layout=src_logits.layout,
-                                            device=src_logits.device)
-        target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1)
-        target_classes_onehot = target_classes_onehot[..., :-1]
-
-        # focal loss
-        loss_cls = sigmoid_focal_loss(src_logits, target_classes_onehot, num_boxes, self.alpha, self.gamma)
-
-        losses = {}
-        losses['loss_cls'] = loss_cls * src_logits.shape[1]
-
-        return losses
-
-    def loss_boxes(self, outputs, targets, indices, num_boxes):
-        """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss
-           targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
-           The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
-        """
-        assert 'pred_boxes' in outputs
-        # prepare bbox targets
-        idx = self._get_src_permutation_idx(indices)
-        src_boxes = outputs['pred_boxes'][idx]
-        target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0).to(src_boxes.device)
-        
-        # compute L1 loss
-        loss_bbox = F.l1_loss(src_boxes, box_xyxy_to_cxcywh(target_boxes), reduction='none')
-        src_deltas = outputs["pred_deltas"][idx]
-        src_boxes_old = outputs["pred_boxes_old"][idx]
-        target_deltas = bbox2delta(src_boxes_old, target_boxes)
-        loss_bbox = F.l1_loss(src_deltas, target_deltas, reduction="none")
-
-        # compute GIoU loss
-        bbox_giou = generalized_box_iou(box_cxcywh_to_xyxy(src_boxes),
-                                        box_cxcywh_to_xyxy(target_boxes))
-        loss_giou = 1 - torch.diag(bbox_giou)
-        
-        losses = {}
-        losses['loss_box'] = loss_bbox.sum() / num_boxes
-        losses['loss_giou'] = loss_giou.sum() / num_boxes
-
-        return losses
-
-    def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs):
-        loss_map = {
-            'labels': self.loss_labels,
-            'boxes': self.loss_boxes,
-        }
-        assert loss in loss_map, f'do you really want to compute {loss} loss?'
-        return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs)
-
-    def compute_loss(self, outputs, targets):
-        """ This performs the loss computation.
-        Parameters:
-             outputs: dict of tensors, see the output specification of the model for the format
-             targets: list of dicts, such that len(targets) == batch_size.
-                      The expected keys in each dict depends on the losses applied, see each loss' doc
-        """
-        outputs_without_aux = {
-            k: v
-            for k, v in outputs.items()
-            if k != "aux_outputs" and k != "enc_outputs"
-        }
-
-        # Retrieve the matching between the outputs of the last layer and the targets
-        indices = self.matcher(outputs_without_aux, targets)
-
-        # Compute the average number of target boxes across all nodes, for normalization purposes
-        num_boxes = sum(len(t["labels"]) for t in targets)
-        num_boxes = torch.as_tensor(
-            [num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device
-        )
-        if is_dist_avail_and_initialized():
-            torch.distributed.all_reduce(num_boxes)
-        num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item()
-
-        # Compute all the requested losses
-        losses = {}
-        for loss in self.losses:
-            kwargs = {}
-            l_dict = self.get_loss(loss, outputs, targets, indices, num_boxes, **kwargs)
-            l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
-            losses.update(l_dict)
-
-        # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
-        if "aux_outputs" in outputs:
-            for i, aux_outputs in enumerate(outputs["aux_outputs"]):
-                indices = self.matcher(aux_outputs, targets)
-                for loss in self.losses:
-                    kwargs = {}
-                    l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_boxes, **kwargs)
-                    l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
-                    l_dict = {k + f"_{i}": v for k, v in l_dict.items()}
-                    losses.update(l_dict)
-
-        if "enc_outputs" in outputs:
-            enc_outputs = outputs["enc_outputs"]
-            bin_targets = copy.deepcopy(targets)
-            for bt in bin_targets:
-                bt["labels"] = torch.zeros_like(bt["labels"])
-            indices = self.matcher(enc_outputs, bin_targets)
-            for loss in self.losses:
-                kwargs = {}
-                l_dict = self.get_loss(loss, enc_outputs, bin_targets, indices, num_boxes, **kwargs)
-                l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
-                l_dict = {k + "_enc": v for k, v in l_dict.items()}
-                losses.update(l_dict)
-
-        return losses
-
-    def forward(self, outputs, targets):
-        # --------------------- One-to-one losses ---------------------
-        outputs_one2one = {k: v for k, v in outputs.items() if "one2many" not in k}
-        loss_dict = self.compute_loss(outputs_one2one, targets)
-
-        # --------------------- One-to-many losses ---------------------
-        outputs_one2many = {k[:-9]: v for k, v in outputs.items() if "one2many" in k}
-        if len(outputs_one2many) > 0:
-            # Copy targets
-            multi_targets = copy.deepcopy(targets)
-            for target in multi_targets:
-                target["boxes"] = target["boxes"].repeat(self.k_one2many, 1)
-                target["labels"] = target["labels"].repeat(self.k_one2many)
-            # Compute one-to-many losses
-            one2many_loss_dict = self.compute_loss(outputs_one2many, multi_targets)
-            # add one2many losses into the final loss_dict
-            for k, v in one2many_loss_dict.items():
-                if k + "_one2many" in loss_dict.keys():
-                    loss_dict[k + "_one2many"] += v * self.lambda_one2many
-                else:
-                    loss_dict[k + "_one2many"] = v * self.lambda_one2many
-
-        return loss_dict
-    
-# build criterion
-def build_criterion(cfg, num_classes, aux_loss=True):
-    criterion = Criterion(cfg, num_classes, aux_loss)
-
-    return criterion
-    
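
The removed criterion above follows the hybrid-matching recipe: the one-to-many branch reuses the same losses, but every ground-truth box is duplicated k_one2many times before matching, and the resulting terms are scaled by lambda_one2many. A minimal sketch of that target duplication (toy tensors, not tied to the removed classes):

import copy
import torch

k_one2many = 6  # each ground-truth box can be matched by up to k queries

targets = [{
    "labels": torch.tensor([3, 7]),
    "boxes":  torch.tensor([[0.50, 0.50, 0.20, 0.20],
                            [0.30, 0.40, 0.10, 0.30]]),  # normalized cxcywh
}]

multi_targets = copy.deepcopy(targets)
for t in multi_targets:
    t["boxes"]  = t["boxes"].repeat(k_one2many, 1)  # (2, 4) -> (12, 4)
    t["labels"] = t["labels"].repeat(k_one2many)    # (2,)   -> (12,)

print(multi_targets[0]["boxes"].shape, multi_targets[0]["labels"].shape)
# torch.Size([12, 4]) torch.Size([12])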

+ 0 - 124
models/detectors/rtpdetr/loss_utils.py

@@ -1,124 +0,0 @@
-import math
-import torch
-import torch.nn.functional as F
-import torch.distributed as dist
-from torchvision.ops.boxes import box_area
-
-
-# ------------------------- For loss -------------------------
-## FocalLoss
-def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2):
-    """
-    Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
-    Args:
-        inputs: A float tensor of arbitrary shape.
-                The predictions for each example.
-        targets: A float tensor with the same shape as inputs. Stores the binary
-                 classification label for each element in inputs
-                (0 for the negative class and 1 for the positive class).
-        alpha: (optional) Weighting factor in range (0, 1) to balance
-                positive vs negative examples. Default = 0.25.
-        gamma: Exponent of the modulating factor (1 - p_t) to
-               balance easy vs hard examples.
-    Returns:
-        Loss tensor
-    """
-    prob = inputs.sigmoid()
-    ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
-    p_t = prob * targets + (1 - prob) * (1 - targets)
-    loss = ce_loss * ((1 - p_t) ** gamma)
-
-    if alpha >= 0:
-        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
-        loss = alpha_t * loss
-
-    return loss.mean(1).sum() / num_boxes
-
-
-# ------------------------- For box -------------------------
-def box_cxcywh_to_xyxy(x):
-    x_c, y_c, w, h = x.unbind(-1)
-    b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
-         (x_c + 0.5 * w), (y_c + 0.5 * h)]
-    return torch.stack(b, dim=-1)
-
-def box_xyxy_to_cxcywh(x):
-    x0, y0, x1, y1 = x.unbind(-1)
-    b = [(x0 + x1) / 2, (y0 + y1) / 2,
-         (x1 - x0), (y1 - y0)]
-    return torch.stack(b, dim=-1)
-
-def bbox2delta(proposals, gt, means=(0., 0., 0., 0.), stds=(1., 1., 1., 1.)):
-    # hack for matcher
-    if proposals.size() != gt.size():
-        proposals = proposals[:, None]
-        gt = gt[None]
-
-    proposals = proposals.float()
-    gt = gt.float()
-    px, py, pw, ph = proposals.unbind(-1)
-    gx, gy, gw, gh = gt.unbind(-1)
-
-    dx = (gx - px) / (pw + 0.1)
-    dy = (gy - py) / (ph + 0.1)
-    dw = torch.log(gw / (pw + 0.1))
-    dh = torch.log(gh / (ph + 0.1))
-    deltas = torch.stack([dx, dy, dw, dh], dim=-1)
-
-    means = deltas.new_tensor(means).unsqueeze(0)
-    stds = deltas.new_tensor(stds).unsqueeze(0)
-    deltas = deltas.sub_(means).div_(stds)
-
-    return deltas
-
-def box_iou(boxes1, boxes2):
-    area1 = box_area(boxes1)
-    area2 = box_area(boxes2)
-
-    lt = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
-    rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]
-
-    wh = (rb - lt).clamp(min=0)  # [N,M,2]
-    inter = wh[:, :, 0] * wh[:, :, 1]  # [N,M]
-
-    union = area1[:, None] + area2 - inter
-
-    iou = inter / union
-    return iou, union
-
-def generalized_box_iou(boxes1, boxes2):
-    """
-    Generalized IoU from https://giou.stanford.edu/
-
-    The boxes should be in [x0, y0, x1, y1] format
-
-    Returns a [N, M] pairwise matrix, where N = len(boxes1)
-    and M = len(boxes2)
-    """
-    # degenerate boxes give inf / nan results
-    # so do an early check
-    assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
-    assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
-    iou, union = box_iou(boxes1, boxes2)
-
-    lt = torch.min(boxes1[:, None, :2], boxes2[:, :2])
-    rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
-
-    wh = (rb - lt).clamp(min=0)  # [N,M,2]
-    area = wh[:, :, 0] * wh[:, :, 1]
-
-    return iou - (area - union) / area
-
-
-# ------------------------- For distributed -------------------------
-def is_dist_avail_and_initialized():
-    if not dist.is_available():
-        return False
-    if not dist.is_initialized():
-        return False
-    return True
-
-def get_world_size():
-    if not is_dist_avail_and_initialized():
-        return 1
-    return dist.get_world_size()
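
For reference, the removed box helpers mirror utilities that recent torchvision releases ship directly; a small comparison sketch (assuming a torchvision version that provides box_convert and generalized_box_iou):

import torch
from torchvision.ops import box_convert, generalized_box_iou

boxes_cxcywh = torch.tensor([[0.50, 0.50, 0.40, 0.40],
                             [0.25, 0.25, 0.20, 0.20]])
# Same conversion as the removed box_cxcywh_to_xyxy
boxes_xyxy = box_convert(boxes_cxcywh, in_fmt="cxcywh", out_fmt="xyxy")

gt_xyxy = torch.tensor([[0.30, 0.30, 0.70, 0.70]])
# Same pairwise [N, M] matrix as the removed generalized_box_iou
giou = generalized_box_iou(boxes_xyxy, gt_xyxy)
print(boxes_xyxy)
print(giou)   # first box is identical to the GT, so its GIoU is 1.0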

+ 0 - 115
models/detectors/rtpdetr/matcher.py

@@ -1,115 +0,0 @@
-# ------------------------------------------------------------------------
-# Plain-DETR
-# Copyright (c) 2023 Xi'an Jiaotong University & Microsoft Research Asia.
-# Licensed under The MIT License [see LICENSE for details]
-# ------------------------------------------------------------------------
-# Deformable DETR
-# Copyright (c) 2020 SenseTime. All Rights Reserved.
-# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
-# ------------------------------------------------------------------------
-# Modified from DETR (https://github.com/facebookresearch/detr)
-# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
-# ------------------------------------------------------------------------
-
-"""
-Modules to compute the matching cost and solve the corresponding LSAP.
-"""
-import torch
-from scipy.optimize import linear_sum_assignment
-from torch import nn
-
-try:
-    from .loss_utils import box_cxcywh_to_xyxy, generalized_box_iou, bbox2delta
-except:
-    from loss_utils import box_cxcywh_to_xyxy, generalized_box_iou, bbox2delta
-
-
-class HungarianMatcher(nn.Module):
-    """This class computes an assignment between the targets and the predictions of the network
-
-    For efficiency reasons, the targets don't include the no_object. Because of this, in general,
-    there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
-    while the others are un-matched (and thus treated as non-objects).
-    """
-
-    def __init__(self,
-                 cost_class: float = 1,
-                 cost_bbox:  float = 1,
-                 cost_giou:  float = 1,
-                 ):
-        """Creates the matcher
-
-        Params:
-            cost_class: This is the relative weight of the classification error in the matching cost
-            cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost
-            cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost
-        """
-        super().__init__()
-        self.cost_class = cost_class
-        self.cost_bbox = cost_bbox
-        self.cost_giou = cost_giou
-        assert (
-            cost_class != 0 or cost_bbox != 0 or cost_giou != 0
-        ), "all costs can't be 0"
-
-    def forward(self, outputs, targets):
-        """ Performs the matching
-
-        Params:
-            outputs: This is a dict that contains at least these entries:
-                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
-                 "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates
-
-            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
-                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
-                           objects in the target) containing the class labels
-                 "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates
-
-        Returns:
-            A list of size batch_size, containing tuples of (index_i, index_j) where:
-                - index_i is the indices of the selected predictions (in order)
-                - index_j is the indices of the corresponding selected targets (in order)
-            For each batch element, it holds:
-                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
-        """
-        with torch.no_grad():
-            bs, num_queries = outputs["pred_logits"].shape[:2]
-
-            # We flatten to compute the cost matrices in a batch
-            out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid()
-            out_bbox = outputs["pred_boxes"].flatten(0, 1)
-
-            # Also concat the target labels and boxes
-            tgt_ids = torch.cat([v["labels"] for v in targets]).to(out_prob.device)
-            tgt_bbox = torch.cat([v["boxes"] for v in targets]).to(out_prob.device)
-
-            # Compute the classification cost.
-            alpha = 0.25
-            gamma = 2.0
-            neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log())
-            pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
-            cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids]
-
-            # Compute the L1 cost between boxes
-            out_delta = outputs["pred_deltas"].flatten(0, 1)
-            out_bbox_old = outputs["pred_boxes_old"].flatten(0, 1)
-            tgt_delta = bbox2delta(out_bbox_old, tgt_bbox)
-            cost_bbox = torch.cdist(out_delta[:, None], tgt_delta, p=1).squeeze(1)
-
-            # Compute the giou cost between boxes
-            cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox),
-                                             box_cxcywh_to_xyxy(tgt_bbox)
-            )
-
-            # Final cost matrix
-            C = self.cost_bbox  * cost_bbox + \
-                self.cost_class * cost_class + \
-                self.cost_giou  * cost_giou
-            C = C.view(bs, num_queries, -1).cpu()
-
-            sizes = [len(v["boxes"]) for v in targets]
-            indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
-            
-            return [(torch.as_tensor(i, dtype=torch.int64),  # prediction (query) indices
-                     torch.as_tensor(j, dtype=torch.int64))  # target indices
-                     for i, j in indices]
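
The matching step above reduces to a rectangular linear-sum-assignment problem per image: rows are queries, columns are ground-truth boxes, and each column gets exactly one row with minimal total cost. A toy illustration with a hand-written cost matrix (hypothetical numbers, independent of the removed class):

import numpy as np
from scipy.optimize import linear_sum_assignment

# 4 queries (rows) x 2 ground-truth boxes (columns)
cost = np.array([[0.9, 0.2],
                 [0.1, 0.8],
                 [0.5, 0.4],
                 [0.7, 0.6]])

row_ind, col_ind = linear_sum_assignment(cost)
print(row_ind, col_ind)              # [0 1] [1 0]: query 0 -> GT 1, query 1 -> GT 0
print(cost[row_ind, col_ind].sum())  # minimal total cost (~0.3)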

+ 0 - 433
models/detectors/rtpdetr/rtpdetr.py

@@ -1,433 +0,0 @@
-import math
-import torch
-import torch.nn as nn
-
-try:
-    from .basic_modules.basic import MLP, multiclass_nms
-    from .basic_modules.transformer import get_clones
-    from .rtpdetr_encoder import build_image_encoder
-    from .rtpdetr_decoder import build_transformer
-except:
-    from  basic_modules.basic import MLP, multiclass_nms
-    from  basic_modules.transformer import get_clones
-    from  rtpdetr_encoder import build_image_encoder
-    from  rtpdetr_decoder import build_transformer
-
-
-# Real-time PlainDETR
-class RT_PDETR(nn.Module):
-    def __init__(self,
-                 cfg,
-                 num_classes = 80,
-                 conf_thresh = 0.1,
-                 nms_thresh  = 0.5,
-                 topk        = 300,
-                 deploy      = False,
-                 no_multi_labels = False,
-                 use_nms     = False,
-                 nms_class_agnostic = False,
-                 aux_loss    = False,
-                 ):
-        super().__init__()
-        # ----------- Basic setting -----------
-        self.num_queries_one2one = cfg['num_queries_one2one']
-        self.num_queries_one2many = cfg['num_queries_one2many']
-        self.num_queries = self.num_queries_one2one + self.num_queries_one2many
-        self.num_classes = num_classes
-        self.num_topk = topk
-        self.aux_loss = aux_loss
-        self.deploy = deploy
-        # scale hidden channels by width_factor
-        cfg['hidden_dim'] = round(cfg['hidden_dim'] * cfg['width'])
-        ## Post-process parameters
-        self.use_nms = use_nms
-        self.nms_thresh = nms_thresh
-        self.conf_thresh = conf_thresh
-        self.no_multi_labels = no_multi_labels
-        self.nms_class_agnostic = nms_class_agnostic
-
-        # ----------- Network setting -----------
-        ## Image encoder
-        self.image_encoder = build_image_encoder(cfg)
-
-        ## Transformer Decoder
-        self.transformer = build_transformer(cfg, return_intermediate=self.training)
-        self.query_embed = nn.Embedding(self.num_queries, cfg['hidden_dim'])
-
-        ## Detect Head
-        class_embed = nn.Linear(cfg['hidden_dim'], num_classes)
-        bbox_embed = MLP(cfg['hidden_dim'], cfg['hidden_dim'], 4, 3)
-
-        prior_prob = 0.01
-        bias_value = -math.log((1 - prior_prob) / prior_prob)
-        class_embed.bias.data = torch.ones(num_classes) * bias_value
-        nn.init.constant_(bbox_embed.layers[-1].weight.data, 0)
-        nn.init.constant_(bbox_embed.layers[-1].bias.data, 0)
-
-        self.class_embed = get_clones(class_embed, cfg['de_num_layers'] + 1)
-        self.bbox_embed  = get_clones(bbox_embed, cfg['de_num_layers'] + 1)
-        nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:], -2.0)
-
-        self.transformer.decoder.bbox_embed = self.bbox_embed
-        self.transformer.decoder.class_embed = self.class_embed
-
-    def pos2posembed(self, d_model, pos, temperature=10000):
-        scale = 2 * torch.pi
-        num_pos_feats = d_model // 2
-
-        dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos.device)
-        dim_t_ = torch.div(dim_t, 2, rounding_mode='floor') / num_pos_feats
-        dim_t = temperature ** (2 * dim_t_)
-
-        # Position embedding for XY
-        x_embed = pos[..., 0] * scale
-        y_embed = pos[..., 1] * scale
-        pos_x = x_embed[..., None] / dim_t
-        pos_y = y_embed[..., None] / dim_t
-        pos_x = torch.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()), dim=-1).flatten(-2)
-        pos_y = torch.stack((pos_y[..., 0::2].sin(), pos_y[..., 1::2].cos()), dim=-1).flatten(-2)
-        posemb = torch.cat((pos_y, pos_x), dim=-1)
-        
-        # Position embedding for WH
-        if pos.size(-1) == 4:
-            w_embed = pos[..., 2] * scale
-            h_embed = pos[..., 3] * scale
-            pos_w = w_embed[..., None] / dim_t
-            pos_h = h_embed[..., None] / dim_t
-            pos_w = torch.stack((pos_w[..., 0::2].sin(), pos_w[..., 1::2].cos()), dim=-1).flatten(-2)
-            pos_h = torch.stack((pos_h[..., 0::2].sin(), pos_h[..., 1::2].cos()), dim=-1).flatten(-2)
-            posemb = torch.cat((posemb, pos_w, pos_h), dim=-1)
-        
-        return posemb
-
-    def get_posembed(self, d_model, mask, temperature=10000, normalize=False):
-        not_mask = ~mask
-        # [B, H, W]
-        y_embed = not_mask.cumsum(1, dtype=torch.float32)
-        x_embed = not_mask.cumsum(2, dtype=torch.float32)
-
-        if normalize:
-            y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + 1e-6)
-            x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + 1e-6)
-        else:
-            y_embed = y_embed - 0.5
-            x_embed = x_embed - 0.5
-    
-        # [H, W] -> [B, H, W, 2]
-        pos = torch.stack([x_embed, y_embed], dim=-1)
-
-        # [B, H, W, C]
-        pos_embed = self.pos2posembed(d_model, pos, temperature)
-        pos_embed = pos_embed.permute(0, 3, 1, 2)
-        
-        return pos_embed
-
-    def post_process(self, box_pred, cls_pred):
-        # cxcywh -> xyxy
-        box_preds_x1y1 = box_pred[..., :2] - 0.5 * box_pred[..., 2:]
-        box_preds_x2y2 = box_pred[..., :2] + 0.5 * box_pred[..., 2:]
-        box_pred = torch.cat([box_preds_x1y1, box_preds_x2y2], dim=-1)
-
-        cls_pred = cls_pred[0]
-        box_pred = box_pred[0]
-        if self.no_multi_labels:
-            # [M,]
-            scores, labels = torch.max(cls_pred.sigmoid(), dim=1)
-
-            # Keep top k top scoring indices only.
-            num_topk = min(self.num_topk, box_pred.size(0))
-
-            # Topk candidates
-            predicted_prob, topk_idxs = scores.sort(descending=True)
-            topk_scores = predicted_prob[:num_topk]
-            topk_idxs = topk_idxs[:num_topk]
-
-            # Filter out the proposals with low confidence score
-            keep_idxs = topk_scores > self.conf_thresh
-            topk_idxs = topk_idxs[keep_idxs]
-
-            # Top-k results
-            topk_scores = topk_scores[keep_idxs]
-            topk_labels = labels[topk_idxs]
-            topk_bboxes = box_pred[topk_idxs]
-
-        else:
-            # Top-k select
-            cls_pred = cls_pred.flatten().sigmoid_()
-            box_pred = box_pred
-
-            # Keep top k top scoring indices only.
-            num_topk = min(self.num_topk, box_pred.size(0))
-
-            # Topk candidates
-            predicted_prob, topk_idxs = cls_pred.sort(descending=True)
-            topk_scores = predicted_prob[:num_topk]
-            topk_idxs = topk_idxs[:num_topk]
-
-            # Filter out the proposals with low confidence score
-            keep_idxs = topk_scores > self.conf_thresh
-            topk_scores = topk_scores[keep_idxs]
-            topk_idxs = topk_idxs[keep_idxs]
-            topk_box_idxs = torch.div(topk_idxs, self.num_classes, rounding_mode='floor')
-
-            ## Top-k results
-            topk_labels = topk_idxs % self.num_classes
-            topk_bboxes = box_pred[topk_box_idxs]
-
-        topk_scores = topk_scores.cpu().numpy()
-        topk_labels = topk_labels.cpu().numpy()
-        topk_bboxes = topk_bboxes.cpu().numpy()
-
-        # nms
-        if self.use_nms:
-            topk_scores, topk_labels, topk_bboxes = multiclass_nms(
-                topk_scores, topk_labels, topk_bboxes, self.nms_thresh, self.num_classes, self.nms_class_agnostic)
-
-        return topk_bboxes, topk_scores, topk_labels
-    
-    @torch.jit.unused
-    def _set_aux_loss(self, outputs_class, outputs_coord, outputs_coord_old, outputs_deltas):
-        # this is a workaround to make torchscript happy, as torchscript
-        # doesn't support dictionary with non-homogeneous values, such
-        # as a dict having both a Tensor and a list.
-        return [
-            {"pred_logits": a, "pred_boxes": b, "pred_boxes_old": c, "pred_deltas": d, }
-            for a, b, c, d in zip(outputs_class[:-1], outputs_coord[:-1], outputs_coord_old[:-1], outputs_deltas[:-1])
-        ]
-
-    def inference_single_image(self, x):
-        # ----------- Image Encoder -----------
-        src = self.image_encoder(x)
-
-        # ----------- Prepare inputs for Transformer -----------
-        mask = torch.zeros([src.shape[0], src.shape[2], src.shape[3]]).bool().to(src.device)
-        pos_embed = self.get_posembed(src.shape[1], mask, normalize=False)
-        self_attn_mask = None
-        query_embeds = self.query_embed.weight[:self.num_queries_one2one]
-
-        # ----------- Transformer -----------
-        (
-            hs,
-            init_reference,
-            inter_references,
-            _,
-            _,
-            _,
-            _,
-            max_shape
-        ) = self.transformer(src, mask, pos_embed, query_embeds, self_attn_mask)
-
-        # ----------- Process outputs -----------
-        outputs_classes_one2one = []
-        outputs_coords_one2one = []
-        outputs_deltas_one2one = []
-
-        for lid in range(hs.shape[0]):
-            if lid == 0:
-                reference = init_reference
-            else:
-                reference = inter_references[lid - 1]
-            outputs_class = self.class_embed[lid](hs[lid])
-            tmp = self.bbox_embed[lid](hs[lid])
-            outputs_coord = self.transformer.decoder.delta2bbox(reference, tmp, max_shape)  # xyxy
-
-            outputs_classes_one2one.append(outputs_class[:, :self.num_queries_one2one])
-            outputs_coords_one2one.append(outputs_coord[:, :self.num_queries_one2one])
-            outputs_deltas_one2one.append(tmp[:, :self.num_queries_one2one])
-
-        outputs_classes_one2one = torch.stack(outputs_classes_one2one)
-        outputs_coords_one2one = torch.stack(outputs_coords_one2one)
-
-        # ------------ Post process ------------
-        cls_pred = outputs_classes_one2one[-1]
-        box_pred = outputs_coords_one2one[-1]
-        
-        # post-process
-        bboxes, scores, labels = self.post_process(box_pred, cls_pred)
-
-        outputs = {
-            "scores": scores,
-            "labels": labels,
-            "bboxes": bboxes,
-        }
-
-        return outputs
-        
-    def forward(self, x):
-        if not self.training:
-            return self.inference_single_image(x)
-
-        # ----------- Image Encoder -----------
-        src = self.image_encoder(x)
-
-        # ----------- Prepare inputs for Transformer -----------
-        mask = torch.zeros([src.shape[0], src.shape[2], src.shape[3]]).bool().to(src.device)
-        pos_embed = self.get_posembed(src.shape[1], mask, normalize=False)
-        self_attn_mask = torch.zeros(
-            [self.num_queries, self.num_queries, ]).bool().to(src.device)
-        self_attn_mask[self.num_queries_one2one:, 0: self.num_queries_one2one, ] = True
-        self_attn_mask[0: self.num_queries_one2one, self.num_queries_one2one:, ] = True
-        query_embeds = self.query_embed.weight
-
-        # ----------- Transformer -----------
-        (
-            hs,
-            init_reference,
-            inter_references,
-            enc_outputs_class,
-            enc_outputs_coord_unact,
-            enc_outputs_delta,
-            output_proposals,
-            max_shape
-        ) = self.transformer(src, mask, pos_embed, query_embeds, self_attn_mask)
-
-        # ----------- Process outputs -----------
-        outputs_classes_one2one = []
-        outputs_coords_one2one = []
-        outputs_classes_one2many = []
-        outputs_coords_one2many = []
-
-        outputs_coords_old_one2one = []
-        outputs_deltas_one2one = []
-        outputs_coords_old_one2many = []
-        outputs_deltas_one2many = []
-
-        for lid in range(hs.shape[0]):
-            if lid == 0:
-                reference = init_reference
-            else:
-                reference = inter_references[lid - 1]
-            outputs_class = self.class_embed[lid](hs[lid])
-            tmp = self.bbox_embed[lid](hs[lid])
-            outputs_coord = self.transformer.decoder.box_xyxy_to_cxcywh(
-                self.transformer.decoder.delta2bbox(reference, tmp, max_shape))
-
-            outputs_classes_one2one.append(outputs_class[:, 0: self.num_queries_one2one])
-            outputs_classes_one2many.append(outputs_class[:, self.num_queries_one2one:])
-
-            outputs_coords_one2one.append(outputs_coord[:, 0: self.num_queries_one2one])
-            outputs_coords_one2many.append(outputs_coord[:, self.num_queries_one2one:])
-
-            outputs_coords_old_one2one.append(reference[:, :self.num_queries_one2one])
-            outputs_coords_old_one2many.append(reference[:, self.num_queries_one2one:])
-            outputs_deltas_one2one.append(tmp[:, :self.num_queries_one2one])
-            outputs_deltas_one2many.append(tmp[:, self.num_queries_one2one:])
-
-        outputs_classes_one2one = torch.stack(outputs_classes_one2one)
-        outputs_coords_one2one = torch.stack(outputs_coords_one2one)
-
-        outputs_classes_one2many = torch.stack(outputs_classes_one2many)
-        outputs_coords_one2many = torch.stack(outputs_coords_one2many)
-
-        out = {
-            "pred_logits": outputs_classes_one2one[-1],
-            "pred_boxes": outputs_coords_one2one[-1],
-            "pred_logits_one2many": outputs_classes_one2many[-1],
-            "pred_boxes_one2many": outputs_coords_one2many[-1],
-
-            "pred_boxes_old": outputs_coords_old_one2one[-1],
-            "pred_deltas": outputs_deltas_one2one[-1],
-            "pred_boxes_old_one2many": outputs_coords_old_one2many[-1],
-            "pred_deltas_one2many": outputs_deltas_one2many[-1],
-        }
-
-        out["aux_outputs"] = self._set_aux_loss(
-            outputs_classes_one2one, outputs_coords_one2one, outputs_coords_old_one2one, outputs_deltas_one2one
-        )
-        out["aux_outputs_one2many"] = self._set_aux_loss(
-            outputs_classes_one2many, outputs_coords_one2many, outputs_coords_old_one2many, outputs_deltas_one2many
-        )
-
-        out["enc_outputs"] = {
-            "pred_logits": enc_outputs_class,
-            "pred_boxes": enc_outputs_coord_unact,
-            "pred_boxes_old": output_proposals,
-            "pred_deltas": enc_outputs_delta,
-        }
-
-        return out
-                
-
-if __name__ == '__main__':
-    import time
-    from thop import profile
-    from loss import build_criterion
-
-    # Model config
-    cfg = {
-        'width': 1.0,
-        'depth': 1.0,
-        'max_stride': 32,
-        'out_stride': 16,
-        # Image Encoder - Backbone
-        'backbone': 'resnet50',
-        'backbone_norm': 'FrozeBN',
-        'pretrained': True,
-        'freeze_at': 0,
-        'freeze_stem_only': False,
-        'hidden_dim': 256,
-        'en_num_heads': 8,
-        'en_num_layers': 6,
-        'en_ffn_dim': 2048,
-        'en_dropout': 0.0,
-        'en_act': 'gelu',
-        # Transformer Decoder
-        'transformer': 'plain_detr_transformer',
-        'hidden_dim': 256,
-        'de_num_heads': 8,
-        'de_num_layers': 6,
-        'de_ffn_dim': 2048,
-        'de_dropout': 0.0,
-        'de_act': 'gelu',
-        'de_pre_norm': True,
-        'rpe_hidden_dim': 512,
-        'use_checkpoint': False,
-        'proposal_feature_levels': 3,
-        'proposal_tgt_strides': [8, 16, 32],
-        'num_queries_one2one': 300,
-        'num_queries_one2many': 300,
-        # Matcher
-        'matcher_hpy': {'cost_class': 2.0,
-                        'cost_bbox': 1.0,
-                        'cost_giou': 2.0,},
-        # Loss
-        'use_vfl': True,
-        'k_one2many': 6,
-        'lambda_one2many': 1.0,
-        'loss_coeff': {'class': 2,
-                       'bbox': 1,
-                       'giou': 2,
-                       'no_object': 0.1,},
-        }
-    bs = 1
-    # Create a batch of images & targets
-    image = torch.randn(bs, 3, 640, 640)
-    targets = [{
-        'labels': torch.tensor([2, 4, 5, 8]).long(),
-        'boxes':  torch.tensor([[0, 0, 10, 10], [12, 23, 56, 70], [0, 10, 20, 30], [50, 60, 55, 150]]).float() / 640.
-    }] * bs
-
-    # Create model
-    model = RT_PDETR(cfg, num_classes=80)
-    model.train()
-
-    # Model inference
-    t0 = time.time()
-    outputs = model(image)
-    t1 = time.time()
-    print('Infer time: ', t1 - t0)
-
-    # Create criterion
-    criterion = build_criterion(cfg, num_classes=80, aux_loss=True)
-
-    # Compute loss
-    loss = criterion(outputs, targets)
-    for k in loss.keys():
-        print("{} : {}".format(k, loss[k].item()))
-
-    print('==============================')
-    model.eval()
-    flops, params = profile(model, inputs=(image, ), verbose=False)
-    print('==============================')
-    print('GFLOPs : {:.2f}'.format(flops / 1e9 * 2))
-    print('Params : {:.2f} M'.format(params / 1e6))
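
The post_process method above uses the standard DETR-style multi-label top-k trick: class scores for all (query, class) pairs are flattened and ranked jointly, then the box index and class label are recovered with a floor division and a modulo. A standalone sketch with toy sizes (names here are illustrative, not the removed module's attributes):

import torch

num_queries, num_classes, num_topk = 5, 4, 3
cls_pred = torch.randn(num_queries, num_classes)        # raw logits

scores = cls_pred.flatten().sigmoid()                   # [num_queries * num_classes]
sorted_scores, sorted_idxs = scores.sort(descending=True)
topk_scores, topk_idxs = sorted_scores[:num_topk], sorted_idxs[:num_topk]

box_idxs = torch.div(topk_idxs, num_classes, rounding_mode="floor")  # which query
labels   = topk_idxs % num_classes                                   # which class
print(box_idxs, labels, topk_scores)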

+ 0 - 405
models/detectors/rtpdetr/rtpdetr_decoder.py

@@ -1,405 +0,0 @@
-import math
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-try:
-    from .basic_modules.basic import LayerNorm2D
-    from .basic_modules.transformer import GlobalDecoder
-except:
-    from  basic_modules.basic import LayerNorm2D
-    from  basic_modules.transformer import GlobalDecoder
-
-
-def build_transformer(cfg, return_intermediate=False):
-    if cfg['transformer'] == 'plain_detr_transformer':
-        return PlainDETRTransformer(d_model             = cfg['hidden_dim'],
-                                    num_heads           = cfg['de_num_heads'],
-                                    ffn_dim             = cfg['de_ffn_dim'],
-                                    dropout             = cfg['de_dropout'],
-                                    act_type            = cfg['de_act'],
-                                    pre_norm            = cfg['de_pre_norm'],
-                                    rpe_hidden_dim      = cfg['rpe_hidden_dim'],
-                                    feature_stride      = cfg['out_stride'],
-                                    num_layers          = cfg['de_num_layers'],
-                                    return_intermediate = return_intermediate,
-                                    use_checkpoint      = cfg['use_checkpoint'],
-                                    num_queries_one2one = cfg['num_queries_one2one'],
-                                    num_queries_one2many    = cfg['num_queries_one2many'],
-                                    proposal_feature_levels = cfg['proposal_feature_levels'],
-                                    proposal_in_stride      = cfg['out_stride'],
-                                    proposal_tgt_strides    = cfg['proposal_tgt_strides'],
-                                    )
-
-
-# ----------------- Decoder for Detection task -----------------
-## PlainDETR's Transformer for Detection task
-class PlainDETRTransformer(nn.Module):
-    def __init__(self,
-                 # Decoder layer params
-                 d_model        :int   = 256,
-                 num_heads      :int   = 8,
-                 ffn_dim        :int   = 1024,
-                 dropout        :float = 0.1,
-                 act_type       :str   = "relu",
-                 pre_norm       :bool  = False,
-                 rpe_hidden_dim :int   = 512,
-                 feature_stride :int   = 16,
-                 num_layers     :int   = 6,
-                 # Decoder params
-                 return_intermediate     :bool = False,
-                 use_checkpoint          :bool = False,
-                 num_queries_one2one     :int  = 300,
-                 num_queries_one2many    :int  = 1500,
-                 proposal_feature_levels :int  = 3,
-                 proposal_in_stride      :int  = 16,
-                 proposal_tgt_strides    :list = [8, 16, 32],
-                 ):
-        super().__init__()
-        # ------------ Basic setting ------------
-        ## Model
-        self.d_model = d_model
-        self.num_heads = num_heads
-        self.rpe_hidden_dim = rpe_hidden_dim
-        self.ffn_dim = ffn_dim
-        self.act_type = act_type
-        self.num_layers = num_layers
-        self.return_intermediate = return_intermediate
-        ## Trick
-        self.use_checkpoint = use_checkpoint
-        self.num_queries_one2one = num_queries_one2one
-        self.num_queries_one2many = num_queries_one2many
-        self.proposal_feature_levels = proposal_feature_levels
-        self.proposal_tgt_strides = proposal_tgt_strides
-        self.proposal_in_stride = proposal_in_stride
-        self.proposal_min_size = 50
-
-        # --------------- Network setting ---------------
-        ## Global Decoder
-        self.decoder = GlobalDecoder(d_model, num_heads, ffn_dim, dropout, act_type, pre_norm,
-                                     rpe_hidden_dim, feature_stride, num_layers, return_intermediate,
-                                     use_checkpoint,)
-        
-        ## Two stage
-        self.enc_output = nn.Linear(d_model, d_model)
-        self.enc_output_norm = nn.LayerNorm(d_model)
-        self.pos_trans = nn.Linear(d_model * 2, d_model * 2)
-        self.pos_trans_norm = nn.LayerNorm(d_model * 2)
-
-        ## Expand layers
-        if proposal_feature_levels > 1:
-            assert len(proposal_tgt_strides) == proposal_feature_levels
-
-            self.enc_output_proj = nn.ModuleList([])
-            for stride in proposal_tgt_strides:
-                if stride == proposal_in_stride:
-                    self.enc_output_proj.append(nn.Identity())
-                elif stride > proposal_in_stride:
-                    scale = int(math.log2(stride / proposal_in_stride))
-                    layers = []
-                    for _ in range(scale - 1):
-                        layers += [
-                            nn.Conv2d(d_model, d_model, kernel_size=2, stride=2),
-                            LayerNorm2D(d_model),
-                            nn.GELU()
-                        ]
-                    layers.append(nn.Conv2d(d_model, d_model, kernel_size=2, stride=2))
-                    self.enc_output_proj.append(nn.Sequential(*layers))
-                else:
-                    scale = int(math.log2(proposal_in_stride / stride))
-                    layers = []
-                    for _ in range(scale - 1):
-                        layers += [
-                            nn.ConvTranspose2d(d_model, d_model, kernel_size=2, stride=2),
-                            LayerNorm2D(d_model),
-                            nn.GELU()
-                        ]
-                    layers.append(nn.ConvTranspose2d(d_model, d_model, kernel_size=2, stride=2))
-                    self.enc_output_proj.append(nn.Sequential(*layers))
-
-        self._reset_parameters()
-
-    def _reset_parameters(self):
-        for p in self.parameters():
-            if p.dim() > 1:
-                nn.init.xavier_uniform_(p)
-
-        if hasattr(self.decoder, '_reset_parameters'):
-            print('decoder re-init')
-            self.decoder._reset_parameters()
-
-    def get_proposal_pos_embed(self, proposals):
-        num_pos_feats = self.d_model // 2
-        temperature = 10000
-        scale = 2 * torch.pi
-
-        dim_t = torch.arange(
-            num_pos_feats, dtype=torch.float32, device=proposals.device
-        )
-        dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats)
-        # N, L, 4
-        proposals = proposals * scale
-        # N, L, 4, 128
-        pos = proposals[:, :, :, None] / dim_t
-        # N, L, 4, 64, 2
-        pos = torch.stack(
-            (pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4
-        ).flatten(2)
-
-        return pos
-
-    def get_valid_ratio(self, mask):
-        _, H, W = mask.shape
-        valid_H = torch.sum(~mask[:, :, 0], 1)
-        valid_W = torch.sum(~mask[:, 0, :], 1)
-        valid_ratio_h = valid_H.float() / H
-        valid_ratio_w = valid_W.float() / W
-        valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1)
-
-        return valid_ratio
-
-    def expand_encoder_output(self, memory, memory_padding_mask, spatial_shapes):
-        assert spatial_shapes.size(0) == 1, f'Got encoder output of shape {spatial_shapes}, not sure how to expand'
-
-        bs, _, c = memory.shape
-        h, w = spatial_shapes[0]
-
-        _out_memory = memory.view(bs, h, w, c).permute(0, 3, 1, 2)
-        _out_memory_padding_mask = memory_padding_mask.view(bs, h, w)
-
-        out_memory, out_memory_padding_mask, out_spatial_shapes = [], [], []
-        for i in range(self.proposal_feature_levels):
-            mem = self.enc_output_proj[i](_out_memory)
-            mask = F.interpolate(
-                _out_memory_padding_mask[None].float(), size=mem.shape[-2:]
-            ).to(torch.bool)
-
-            out_memory.append(mem)
-            out_memory_padding_mask.append(mask.squeeze(0))
-            out_spatial_shapes.append(mem.shape[-2:])
-
-        out_memory = torch.cat([mem.flatten(2).transpose(1, 2) for mem in out_memory], dim=1)
-        out_memory_padding_mask = torch.cat([mask.flatten(1) for mask in out_memory_padding_mask], dim=1)
-        out_spatial_shapes = torch.as_tensor(out_spatial_shapes, dtype=torch.long, device=out_memory.device)
-        
-        return out_memory, out_memory_padding_mask, out_spatial_shapes
-
-    def gen_encoder_output_proposals(self, memory, memory_padding_mask, spatial_shapes):
-        if self.proposal_feature_levels > 1:
-            memory, memory_padding_mask, spatial_shapes = self.expand_encoder_output(
-                memory, memory_padding_mask, spatial_shapes
-            )
-        N_, S_, C_ = memory.shape
-        # base_scale = 4.0
-        proposals = []
-        _cur = 0
-        for lvl, (H_, W_) in enumerate(spatial_shapes):
-            stride = self.proposal_tgt_strides[lvl]
-
-            grid_y, grid_x = torch.meshgrid(
-                torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device),
-                torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device),
-            )
-            grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)
-            grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) * stride
-            wh = torch.ones_like(grid) * self.proposal_min_size * (2.0 ** lvl)
-            proposal = torch.cat((grid, wh), -1).view(N_, -1, 4)
-            proposals.append(proposal)
-            _cur += H_ * W_
-        output_proposals = torch.cat(proposals, 1)
-
-        H_, W_ = spatial_shapes[0]
-        stride = self.proposal_tgt_strides[0]
-        mask_flatten_ = memory_padding_mask[:, :H_*W_].view(N_, H_, W_, 1)
-        valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1, keepdim=True) * stride
-        valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1, keepdim=True) * stride
-        img_size = torch.cat([valid_W, valid_H, valid_W, valid_H], dim=-1)
-        img_size = img_size.unsqueeze(1) # [BS, 1, 4]
-
-        output_proposals_valid = (
-            (output_proposals > 0.01 * img_size) & (output_proposals < 0.99 * img_size)
-        ).all(-1, keepdim=True)
-        output_proposals = output_proposals.masked_fill(
-            memory_padding_mask.unsqueeze(-1).repeat(1, 1, 1),
-            max(H_, W_) * stride,
-        )
-        output_proposals = output_proposals.masked_fill(
-            ~output_proposals_valid,
-            max(H_, W_) * stride,
-        )
-
-        output_memory = memory
-        output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0))
-        output_memory = output_memory.masked_fill(~output_proposals_valid, float(0))
-        output_memory = self.enc_output_norm(self.enc_output(output_memory))
-
-        max_shape = (valid_H[:, None, :], valid_W[:, None, :])
-        return output_memory, output_proposals, max_shape
-    
-    def get_reference_points(self, memory, mask_flatten, spatial_shapes):
-        output_memory, output_proposals, max_shape = self.gen_encoder_output_proposals(
-            memory, mask_flatten, spatial_shapes
-        )
-
-        # hack implementation for two-stage Deformable DETR
-        enc_outputs_class = self.decoder.class_embed[self.decoder.num_layers](output_memory)
-        enc_outputs_delta = self.decoder.bbox_embed[self.decoder.num_layers](output_memory)
-        enc_outputs_coord_unact = self.decoder.box_xyxy_to_cxcywh(self.decoder.delta2bbox(
-            output_proposals,
-            enc_outputs_delta,
-            max_shape
-        ))
-
-        topk = self.two_stage_num_proposals
-        topk_proposals = torch.topk(enc_outputs_class.max(-1)[0], topk, dim=1)[1]
-        topk_coords_unact = torch.gather(
-            enc_outputs_coord_unact, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4)
-        )
-        topk_coords_unact = topk_coords_unact.detach()
-        reference_points = topk_coords_unact
-        
-        return (reference_points, max_shape, enc_outputs_class,
-                enc_outputs_coord_unact, enc_outputs_delta, output_proposals)
-
-    def forward(self, src, mask, pos_embed, query_embed=None, self_attn_mask=None):
-        # Prepare input for encoder
-        bs, c, h, w = src.shape
-        src_flatten = src.flatten(2).transpose(1, 2)
-        mask_flatten = mask.flatten(1)
-        pos_embed_flatten = pos_embed.flatten(2).transpose(1, 2)
-        spatial_shapes = torch.as_tensor([(h, w)], dtype=torch.long, device=src_flatten.device)
-
-        # Prepare input for decoder
-        memory = src_flatten
-        bs, _, c = memory.shape
-
-        # Two stage trick
-        if self.training:
-            self.two_stage_num_proposals = self.num_queries_one2one + self.num_queries_one2many
-        else:
-            self.two_stage_num_proposals = self.num_queries_one2one
-        (reference_points, max_shape, enc_outputs_class,
-        enc_outputs_coord_unact, enc_outputs_delta, output_proposals) \
-            = self.get_reference_points(memory, mask_flatten, spatial_shapes)
-        init_reference_out = reference_points
-        pos_trans_out = torch.zeros((bs, self.two_stage_num_proposals, 2*c), device=init_reference_out.device)
-        pos_trans_out = self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(reference_points)))
-
-        # Mixed selection trick
-        tgt = query_embed.unsqueeze(0).expand(bs, -1, -1)
-        query_embed, _ = torch.split(pos_trans_out, c, dim=2)
-
-        # Decoder
-        hs, inter_references = self.decoder(tgt,
-                                            reference_points,
-                                            memory,
-                                            pos_embed_flatten,
-                                            spatial_shapes,
-                                            query_embed,
-                                            mask_flatten,
-                                            self_attn_mask,
-                                            max_shape
-                                            )
-        inter_references_out = inter_references
-
-        return (hs,
-                init_reference_out,
-                inter_references_out,
-                enc_outputs_class,
-                enc_outputs_coord_unact,
-                enc_outputs_delta,
-                output_proposals,
-                max_shape
-                )
-
-
-# ----------------- Decoder for Segmentation task -----------------
-## PlainDETR's Transformer for Segmentation task
-class SegTransformerDecoder(nn.Module):
-    def __init__(self, ):
-        super().__init__()
-        # TODO: design seg-decoder
-
-    def forward(self, x):
-        return
-
-
-# ----------------- Decoder for Pose estimation task -----------------
-## PlainDETR's Transformer for Pose estimation task
-class PosTransformerDecoder(nn.Module):
-    def __init__(self, ):
-        super().__init__()
-        # TODO: design pose-decoder
-
-    def forward(self, x):
-        return
-
-
-if __name__ == '__main__':
-    import time
-    from thop import profile
-    from basic_modules.basic import MLP
-    from basic_modules.transformer import get_clones
-
-    cfg = {
-        'out_stride': 16,
-        'hidden_dim': 256,
-        # Transformer Decoder
-        'transformer': 'plain_detr_transformer',
-        'de_num_heads': 8,
-        'de_num_layers': 6,
-        'de_ffn_dim': 1024,
-        'de_dropout': 0.0,
-        'de_act': 'gelu',
-        'de_pre_norm': True,
-        'rpe_hidden_dim': 512,
-        'use_checkpoint': False,
-        'proposal_feature_levels': 3,
-        'proposal_tgt_strides': [8, 16, 32],
-        'num_queries_one2one': 300,
-        'num_queries_one2many': 100,
-    }
-    feat = torch.randn(1, cfg['hidden_dim'], 40, 40)
-    mask = torch.zeros(1, 40, 40)
-    pos_embed = torch.randn(1, cfg['hidden_dim'], 40, 40)
-    query_embed = torch.randn(cfg['num_queries_one2one'] + cfg['num_queries_one2many'], cfg['hidden_dim'])
-
-    model = build_transformer(cfg, True)
-
-    class_embed = nn.Linear(cfg['hidden_dim'], 80)
-    bbox_embed = MLP(cfg['hidden_dim'], cfg['hidden_dim'], 4, 3)
-    class_embed = get_clones(class_embed, cfg['de_num_layers'] + 1)
-    bbox_embed = get_clones(bbox_embed, cfg['de_num_layers'] + 1)
-
-    model.decoder.bbox_embed = bbox_embed
-    model.decoder.class_embed = class_embed
-
-    model.train()
-    t0 = time.time()
-    outputs = model(feat, mask, pos_embed, query_embed)
-    (hs,
-     init_reference_out,
-     inter_references_out,
-     enc_outputs_class,
-     enc_outputs_coord_unact,
-     enc_outputs_delta,
-     output_proposals,
-     max_shape
-     ) = outputs
-    t1 = time.time()
-    print('Time: ', t1 - t0)
-    print(hs.shape)
-    print(init_reference_out.shape)
-    print(inter_references_out.shape)
-    print(enc_outputs_class.shape)
-    print(enc_outputs_coord_unact.shape)
-    print(enc_outputs_delta.shape)
-    print(output_proposals.shape)
-
-    print('==============================')
-    model.eval()
-    query_embed = torch.randn(cfg['num_queries_one2one'], cfg['hidden_dim'])
-    flops, params = profile(model, inputs=(feat, mask, pos_embed, query_embed, ), verbose=False)
-    print('==============================')
-    print('GFLOPs : {:.2f}'.format(flops / 1e9 * 2))
-    print('Params : {:.2f} M'.format(params / 1e6))
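
gen_encoder_output_proposals above seeds the two-stage selection with one (cx, cy, w, h) proposal per feature-map cell: centers sit at cell midpoints scaled by the level stride, and widths/heights start from proposal_min_size, doubled per level. A toy sketch of that grid construction (assumed sizes; meshgrid relies on its default 'ij' indexing, as in the removed code):

import torch

H, W, stride, min_size, lvl = 4, 4, 8, 50, 0

grid_y, grid_x = torch.meshgrid(
    torch.arange(H, dtype=torch.float32),
    torch.arange(W, dtype=torch.float32),
)
grid = torch.stack([grid_x, grid_y], dim=-1)               # [H, W, 2]
centers = (grid + 0.5) * stride                            # cell centers in pixels
wh = torch.full_like(centers, min_size * (2.0 ** lvl))     # fixed size per level
proposals = torch.cat([centers, wh], dim=-1).view(-1, 4)   # [H*W, 4] in (cx, cy, w, h)
print(proposals.shape, proposals[0])                       # torch.Size([16, 4]) tensor([ 4.,  4., 50., 50.])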

+ 0 - 99
models/detectors/rtpdetr/rtpdetr_encoder.py

@@ -1,99 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-try:
-    from .basic_modules.basic    import BasicConv, UpSampleWrapper
-    from .basic_modules.backbone import build_backbone
-    from .basic_modules.transformer import TransformerEncoder
-except:
-    from  basic_modules.basic    import BasicConv, UpSampleWrapper
-    from  basic_modules.backbone import build_backbone
-    from  basic_modules.transformer import TransformerEncoder
-
-
-# ----------------- Image Encoder -----------------
-def build_image_encoder(cfg):
-    return ImageEncoder(cfg)
-
-class ImageEncoder(nn.Module):
-    def __init__(self, cfg):
-        super().__init__()
-        # ---------------- Basic settings ----------------
-        ## Basic parameters
-        self.cfg = cfg
-        ## Network parameters
-        self.stride = cfg['out_stride']
-        self.upsample_factor = 32 // self.stride
-        self.hidden_dim = cfg['hidden_dim']
-        
-        # ---------------- Network settings ----------------
-        ## Backbone Network
-        self.backbone, fpn_feat_dims = build_backbone(cfg, pretrained=cfg['pretrained'] and self.training)
-
-        ## Input projection
-        self.input_proj = BasicConv(fpn_feat_dims[-1], cfg['hidden_dim'], kernel_size=1, act_type=None, norm_type='BN')
-
-        # ---------------- Transformer Encoder ----------------
-        self.transformer_encoder = TransformerEncoder(d_model     = cfg['hidden_dim'],
-                                                      num_heads   = cfg['en_num_heads'],
-                                                      num_layers  = cfg['en_num_layers'],
-                                                      ffn_dim     = cfg['en_ffn_dim'],
-                                                      dropout     = cfg['en_dropout'],
-                                                      act_type    = cfg['en_act']
-                                                      )
-
-        ## Upsample layer
-        self.upsample = UpSampleWrapper(cfg['hidden_dim'], self.upsample_factor)
-        
-        ## Output projection
-        self.output_proj = BasicConv(cfg['hidden_dim'], cfg['hidden_dim'], kernel_size=3, padding=1, act_type='silu', norm_type='BN')
-
-
-    def forward(self, x):
-        pyramid_feats = self.backbone(x)
-        feat = self.input_proj(pyramid_feats[-1])
-        feat = self.transformer_encoder(feat)
-        feat = self.upsample(feat)
-        feat = self.output_proj(feat)
-
-        return feat
-
-
-if __name__ == '__main__':
-    import time
-    from thop import profile
-    cfg = {
-        'width': 1.0,
-        'depth': 1.0,
-        'out_stride': 16,
-        # Image Encoder - Backbone
-        'backbone': 'resnet50',
-        'backbone_norm': 'FrozeBN',
-        'pretrained': True,
-        'freeze_at': 0,
-        'freeze_stem_only': False,
-        'hidden_dim': 256,
-        'en_num_heads': 8,
-        'en_num_layers': 1,
-        'en_ffn_dim': 1024,
-        'en_dropout': 0.0,
-        'en_act': 'gelu',
-    }
-    x = torch.rand(2, 3, 640, 640)
-    model = build_image_encoder(cfg)
-    model.train()
-
-    t0 = time.time()
-    outputs = model(x)
-    t1 = time.time()
-    print('Time: ', t1 - t0)
-    print(outputs.shape)
-
-    print('==============================')
-    model.eval()
-    x = torch.rand(1, 3, 640, 640)
-    flops, params = profile(model, inputs=(x, ), verbose=False)
-    print('==============================')
-    print('GFLOPs : {:.2f}'.format(flops / 1e9 * 2))
-    print('Params : {:.2f} M'.format(params / 1e6))
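
The removed ImageEncoder boils down to: project the backbone's stride-32 C5 map to hidden_dim, run token-level self-attention over it, then upsample back to the configured out_stride of 16. A minimal stand-in for that shape flow using plain PyTorch modules (generic substitutes, not the removed BasicConv / TransformerEncoder / UpSampleWrapper):

import torch
import torch.nn as nn

hidden_dim = 256
c5 = torch.randn(1, 2048, 20, 20)                  # e.g. ResNet-50 C5 for a 640x640 input

proj = nn.Conv2d(2048, hidden_dim, kernel_size=1)
encoder = nn.TransformerEncoder(
    nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=8, dim_feedforward=2048,
                               activation="gelu", batch_first=True),
    num_layers=1,
)
upsample = nn.ConvTranspose2d(hidden_dim, hidden_dim, kernel_size=2, stride=2)

x = proj(c5)                                       # [1, 256, 20, 20]
tokens = x.flatten(2).transpose(1, 2)              # [1, 400, 256] token sequence
tokens = encoder(tokens)                           # global self-attention over tokens
x = tokens.transpose(1, 2).reshape(1, hidden_dim, 20, 20)
x = upsample(x)                                    # [1, 256, 40, 40] -> stride 16
print(x.shape)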