1 년 전 · 0ad3b62fc7
--- a/image_classification/.gitignore
+++ b/image_classification/.gitignore
--- a/image_classification/README.md
+++ b/image_classification/README.md
--- a/image_classification/data/__init__.py
+++ b/image_classification/data/__init__.py
--- a/image_classification/data/cifar.py
+++ b/image_classification/data/cifar.py
--- a/image_classification/data/custom.py
+++ b/image_classification/data/custom.py
--- a/image_classification/data/mnist.py
+++ b/image_classification/data/mnist.py
--- a/image_classification/engine.py
+++ b/image_classification/engine.py
--- a/image_classification/main.py
+++ b/image_classification/main.py
--- a/image_classification/models/__init__.py
+++ b/image_classification/models/__init__.py
@@ -1,6 +1,5 @@
 
				 from .mlp.build     import build_mlp
			
 
				 from .convnet.build import build_convnet
			
 
				-from .resnet.build  import build_resnet
			
 
				 from .vit.build     import build_vit
			
 
				 
			
 
				 
			
@@ -10,8 +9,6 @@ def build_model(args):
 
				         model = build_mlp(args)
			
 
				     elif 'convnet' in args.model:
			
 
				         model = build_convnet(args)
			
 
				-    elif 'resnet' in args.model:
			
 
				-        model = build_resnet(args)
			
 
				     elif 'vit' in args.model:
			
 
				         model = build_vit(args)
			
 
				     else:
			
--- a/image_classification/models/convnet/build.py
+++ b/image_classification/models/convnet/build.py
--- a/image_classification/models/convnet/convnet.py
+++ b/image_classification/models/convnet/convnet.py
--- a/image_classification/models/convnet/modules.py
+++ b/image_classification/models/convnet/modules.py
--- a/image_classification/models/mlp/build.py
+++ b/image_classification/models/mlp/build.py
--- a/image_classification/models/mlp/mlp.py
+++ b/image_classification/models/mlp/mlp.py
--- a/image_classification/models/mlp/modules.py
+++ b/image_classification/models/mlp/modules.py
--- a/image_classification/models/vit/build.py
+++ b/image_classification/models/vit/build.py
--- a/image_classification/models/vit/modules.py
+++ b/image_classification/models/vit/modules.py
--- a/image_classification/models/vit/vit.py
+++ b/image_classification/models/vit/vit.py
--- a/image_classification/requirements.txt
+++ b/image_classification/requirements.txt
--- a/image_classification/utils/__init__.py
+++ b/image_classification/utils/__init__.py
--- a/image_classification/utils/lr_scheduler.py
+++ b/image_classification/utils/lr_scheduler.py
--- a/image_classification/utils/misc.py
+++ b/image_classification/utils/misc.py
--- a/image_classification/utils/optimzer.py
+++ b/image_classification/utils/optimzer.py
--- a/image_classification/models/resnet/build.py
+++ b/image_classification/models/resnet/build.py
@@ -1,21 +0,0 @@
 
				-from .resnet import ResNet
			
 
				-from .modules import PlainResBlock, BottleneckResBlock
			
 
				-
			
 
				-
			
 
				-def build_resnet(args):
			
 
				-    if args.model == 'resnet18':
			
 
				-        model = ResNet(in_dim=args.img_dim,
			
 
				-                       block=PlainResBlock,
			
 
				-                       expansion=1.0,
			
 
				-                       num_blocks=[2, 2, 2, 2],
			
 
				-                       )
			
 
				-    elif args.model == 'resnet50':
			
 
				-        model = ResNet(in_dim=args.img_dim,
			
 
				-                       block=BottleneckResBlock,
			
 
				-                       expansion=4.0,
			
 
				-                       num_blocks=[3, 4, 6, 3],
			
 
				-                       )
			
 
				-    else:
			
 
				-        raise NotImplementedError("Unknown resnet: {}".format(args.model))
			
 
				-    
			
 
				-    return model
			
--- a/image_classification/models/resnet/modules.py
+++ b/image_classification/models/resnet/modules.py
@@ -1,164 +0,0 @@
 
				-import torch
			
 
				-import torch.nn as nn
			
 
				-
			
 
				-
			
 
				-def get_activation(act_type=None):
			
 
				-    if   act_type == 'sigmoid':
			
 
				-        return nn.Sigmoid()
			
 
				-    elif act_type == 'relu':
			
 
				-        return nn.ReLU(inplace=True)
			
 
				-    elif act_type == 'lrelu':
			
 
				-        return nn.LeakyReLU(0.1, inplace=True)
			
 
				-    elif act_type == 'mish':
			
 
				-        return nn.Mish(inplace=True)
			
 
				-    elif act_type == 'silu':
			
 
				-        return nn.SiLU(inplace=True)
			
 
				-    elif act_type is None:
			
 
				-        return nn.Identity()
			
 
				-    else:
			
 
				-        raise NotImplementedError
			
 
				-        
			
 
				-def get_norm(norm_type, dim):
			
 
				-    if   norm_type == 'bn':
			
 
				-        return nn.BatchNorm2d(dim)
			
 
				-    elif norm_type == 'ln':
			
 
				-        return LayerNorm2d(dim)
			
 
				-    elif norm_type == 'gn':
			
 
				-        return nn.GroupNorm(num_groups=32, num_channels=dim)
			
 
				-    elif norm_type is None:
			
 
				-        return nn.Identity()
			
 
				-    else:
			
 
				-        raise NotImplementedError
			
 
				-
			
 
				-class LayerNorm2d(nn.Module):
			
 
				-    def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
			
 
				-        super().__init__()
			
 
				-        self.weight = nn.Parameter(torch.ones(num_channels))
			
 
				-        self.bias = nn.Parameter(torch.zeros(num_channels))
			
 
				-        self.eps = eps
			
 
				-
			
 
				-    def forward(self, x: torch.Tensor) -> torch.Tensor:
			
 
				-        u = x.mean(1, keepdim=True)
			
 
				-        s = (x - u).pow(2).mean(1, keepdim=True)
			
 
				-        x = (x - u) / torch.sqrt(s + self.eps)
			
 
				-        x = self.weight[:, None, None] * x + self.bias[:, None, None]
			
 
				-        
			
 
				-        return x
			
 
				-    
			
 
				-class ConvModule(nn.Module):
			
 
				-    def __init__(self,
			
 
				-                 in_dim      :int,
			
 
				-                 out_dim     :int,
			
 
				-                 kernel_size :int  = 1,
			
 
				-                 padding     :int  = 0,
			
 
				-                 stride      :int  = 1,
			
 
				-                 act_type    :str  = "relu",
			
 
				-                 norm_type   :str  = "bn",
			
 
				-                 depthwise   :bool = False) -> None:
			
 
				-        super().__init__()
			
 
				-        use_bias = False if norm_type is not None else True
			
 
				-        self.depthwise = depthwise
			
 
				-        if not depthwise:
			
 
				-            self.conv = nn.Conv2d(in_channels=in_dim, out_channels=out_dim,
			
 
				-                                kernel_size=kernel_size, padding=padding, stride=stride,
			
 
				-                                bias=use_bias)
			
 
				-            self.norm  = get_norm(norm_type, out_dim)
			
 
				-        else:
			
 
				-            self.conv1 = nn.Conv2d(in_channels=in_dim, out_channels=in_dim,
			
 
				-                                   kernel_size=kernel_size, padding=padding, stride=stride, groups=in_dim,
			
 
				-                                   bias=use_bias)
			
 
				-            self.norm1 = get_norm(norm_type, in_dim)
			
 
				-            self.conv2 = nn.Conv2d(in_channels=in_dim, out_channels=out_dim,
			
 
				-                                   kernel_size=1, padding=0, stride=1,
			
 
				-                                   bias=use_bias)
			
 
				-            self.norm2 = get_norm(norm_type, out_dim)
			
 
				-        self.act   = get_activation(act_type)
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        if self.depthwise:
			
 
				-            x = self.norm1(self.conv1(x))
			
 
				-            x = self.act(self.norm2(self.conv2(x)))
			
 
				-        else:
			
 
				-            x = self.act(self.norm(self.conv(x)))
			
 
				-
			
 
				-        return x
			
 
				-
			
 
				-
			
 
				-# -------------- ResNet's modules --------------
			
 
				-class PlainResBlock(nn.Module):
			
 
				-    def __init__(self, in_dim, inter_dim, out_dim, stride=1):
			
 
				-        super().__init__()
			
 
				-        # -------- Basic parameters --------
			
 
				-        self.in_dim = in_dim
			
 
				-        self.out_dim = out_dim
			
 
				-        self.inter_dim = inter_dim
			
 
				-        self.stride = stride
			
 
				-        self.downsample = stride > 1 or in_dim != out_dim
			
 
				-
			
 
				-        # -------- Model parameters --------
			
 
				-        self.conv_layer_1 = ConvModule(in_dim, inter_dim,
			
 
				-                                       kernel_size=3, padding=1, stride=stride,
			
 
				-                                       act_type='relu', norm_type='bn', depthwise=False)
			
 
				-        self.conv_layer_2 = ConvModule(inter_dim, out_dim,
			
 
				-                                       kernel_size=3, padding=1, stride=1,
			
 
				-                                       act_type=None, norm_type='bn', depthwise=False)
			
 
				-        self.out_act = nn.ReLU(inplace=True)
			
 
				-
			
 
				-        if self.downsample:
			
 
				-            self.res_layer = ConvModule(in_dim, out_dim,
			
 
				-                                       kernel_size=1, padding=0, stride=stride,
			
 
				-                                       act_type=None, norm_type='bn', depthwise=False)
			
 
				-        else:
			
 
				-            self.res_layer = nn.Identity()
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        out = self.conv_layer_1(x)
			
 
				-        out = self.conv_layer_2(out)
			
 
				-
			
 
				-        x = self.res_layer(x)
			
 
				-
			
 
				-        out = x + out
			
 
				-        out = self.out_act(out)
			
 
				-
			
 
				-        return out
			
 
				-
			
 
				-class BottleneckResBlock(nn.Module):
			
 
				-    def __init__(self, in_dim, inter_dim, out_dim, stride=1):
			
 
				-        super().__init__()
			
 
				-        # -------- Basic parameters --------
			
 
				-        self.in_dim = in_dim
			
 
				-        self.out_dim = out_dim
			
 
				-        self.stride = stride
			
 
				-        self.downsample = stride > 1 or in_dim != out_dim
			
 
				-
			
 
				-        # -------- Model parameters --------
			
 
				-        self.conv_layer_1 = ConvModule(in_dim, inter_dim,
			
 
				-                                       kernel_size=1, padding=0, stride=1,
			
 
				-                                       act_type='relu', norm_type='bn', depthwise=False)
			
 
				-        self.conv_layer_2 = ConvModule(inter_dim, inter_dim,
			
 
				-                                       kernel_size=3, padding=1, stride=stride,
			
 
				-                                       act_type='relu', norm_type='bn', depthwise=False)
			
 
				-        self.conv_layer_3 = ConvModule(inter_dim, out_dim,
			
 
				-                                       kernel_size=1, padding=0, stride=1,
			
 
				-                                       act_type=None, norm_type='bn', depthwise=False)
			
 
				-        self.out_act = nn.ReLU(inplace=True)
			
 
				-
			
 
				-        if self.downsample:
			
 
				-            self.res_layer = ConvModule(in_dim, out_dim,
			
 
				-                                       kernel_size=1, padding=0, stride=stride,
			
 
				-                                       act_type=None, norm_type='bn', depthwise=False)
			
 
				-        else:
			
 
				-            self.res_layer = nn.Identity()
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        out = self.conv_layer_1(x)
			
 
				-        out = self.conv_layer_2(out)
			
 
				-        out = self.conv_layer_3(out)
			
 
				-
			
 
				-        x = self.res_layer(x)
			
 
				-
			
 
				-        out = x + out
			
 
				-        out = self.out_act(out)
			
 
				-
			
 
				-        return out
			
 
				-
			
--- a/image_classification/models/resnet/resnet.py
+++ b/image_classification/models/resnet/resnet.py
@@ -1,110 +0,0 @@
 
				-import torch
			
 
				-import torch.nn as nn
			
 
				-
			
 
				-try:
			
 
				-    from .modules import ConvModule, PlainResBlock, BottleneckResBlock
			
 
				-except:
			
 
				-    from  modules import ConvModule, PlainResBlock, BottleneckResBlock
			
 
				-
			
 
				-
			
 
				-class ResNet(nn.Module):
			
 
				-    def __init__(self,
			
 
				-                 in_dim,
			
 
				-                 block,
			
 
				-                 expansion = 1.0,
			
 
				-                 num_blocks = [2, 2, 2, 2],
			
 
				-                 num_classes = 1000,
			
 
				-                 ) -> None:
			
 
				-        super().__init__()
			
 
				-        # ----------- Basic parameters -----------
			
 
				-        self.expansion = expansion
			
 
				-        self.num_blocks = num_blocks
			
 
				-        self.feat_dims  = [64,                      # C2 level
			
 
				-                           round(64 * expansion),   # C2 level
			
 
				-                           round(128 * expansion),  # C3 level
			
 
				-                           round(256 * expansion),  # C4 level
			
 
				-                           round(512 * expansion),  # C5 level
			
 
				-                           ]
			
 
				-        # ----------- Model parameters -----------
			
 
				-        ## Backbone
			
 
				-        self.layer_1 = nn.Sequential(
			
 
				-            ConvModule(in_dim, self.feat_dims[0],
			
 
				-                       kernel_size=7, padding=3, stride=2,
			
 
				-                       act_type='relu', norm_type='bn', depthwise=False),
			
 
				-            nn.MaxPool2d(kernel_size=(3, 3), padding=(1, 1), stride=(2, 2))
			
 
				-        )
			
 
				-        self.layer_2 = self.make_layer(block, self.feat_dims[0], self.feat_dims[1], depth=num_blocks[0], downsample=False)
			
 
				-        self.layer_3 = self.make_layer(block, self.feat_dims[1], self.feat_dims[2], depth=num_blocks[1], downsample=True)
			
 
				-        self.layer_4 = self.make_layer(block, self.feat_dims[2], self.feat_dims[3], depth=num_blocks[2], downsample=True)
			
 
				-        self.layer_5 = self.make_layer(block, self.feat_dims[3], self.feat_dims[4], depth=num_blocks[3], downsample=True)
			
 
				-
			
 
				-        ## Classifier
			
 
				-        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
			
 
				-        self.fc      = nn.Linear(self.feat_dims[4] , num_classes)
			
 
				-        
			
 
				-    def make_layer(self, block, in_dim, out_dim, depth=1, downsample=False):
			
 
				-        stage_blocks = []
			
 
				-        for i in range(depth):
			
 
				-            if i == 0:
			
 
				-                stride = 2 if downsample else 1
			
 
				-                inter_dim = round(out_dim / self.expansion)
			
 
				-                stage_blocks.append(block(in_dim, inter_dim, out_dim, stride))
			
 
				-            else:
			
 
				-                stride = 1
			
 
				-                inter_dim = round(out_dim / self.expansion)
			
 
				-                stage_blocks.append(block(out_dim, inter_dim, out_dim, stride))
			
 
				-        
			
 
				-        layers = nn.Sequential(*stage_blocks)
			
 
				-
			
 
				-        return layers
			
 
				-    
			
 
				-    def forward(self, x):
			
 
				-        x = self.layer_1(x)
			
 
				-        x = self.layer_2(x)
			
 
				-        x = self.layer_3(x)
			
 
				-        x = self.layer_4(x)
			
 
				-        x = self.layer_5(x)
			
 
				-
			
 
				-        x = self.avgpool(x)
			
 
				-        x = x.flatten(1)
			
 
				-        x = self.fc(x)
			
 
				-
			
 
				-        return x
			
 
				-
			
 
				-
			
 
				-def build_resnet(model_name='resnet18', img_dim=3):
			
 
				-    if model_name == 'resnet18':
			
 
				-        model = ResNet(in_dim=img_dim,
			
 
				-                       block=PlainResBlock,
			
 
				-                       expansion=1.0,
			
 
				-                       num_blocks=[2, 2, 2, 2],
			
 
				-                       )
			
 
				-    elif model_name == 'resnet50':
			
 
				-        model = ResNet(in_dim=img_dim,
			
 
				-                       block=BottleneckResBlock,
			
 
				-                       expansion=4.0,
			
 
				-                       num_blocks=[3, 4, 6, 3],
			
 
				-                       )
			
 
				-    else:
			
 
				-        raise NotImplementedError("Unknown resnet: {}".format(model_name))
			
 
				-    
			
 
				-    return model
			
 
				-
			
 
				-
			
 
				-if __name__=='__main__':
			
 
				-    import time
			
 
				-
			
 
				-    # 构建ResNet模型
			
 
				-    model = build_resnet(model_name='resnet18')
			
 
				-
			
 
				-    # 打印模型结构
			
 
				-    print(model)
			
 
				-
			
 
				-    # 随即成生数据
			
 
				-    x = torch.randn(1, 3, 224, 224)
			
 
				-
			
 
				-    # 模型前向推理
			
 
				-    t0 = time.time()
			
 
				-    output = model(x)
			
 
				-    t1 = time.time()
			
 
				-    print('Time: ', t1 - t0)
			
--- a/masked_image_modeling/.gitignore
+++ b/masked_image_modeling/.gitignore
@@ -1,12 +0,0 @@
 
				-*.pt
			
 
				-*.pth
			
 
				-*.pkl
			
 
				-*.onnx
			
 
				-*.pyc
			
 
				-*.zip
			
 
				-weights
			
 
				-__pycache__
			
 
				-data/cifar/
			
 
				-data/cifar_data/
			
 
				-data/mnist_data/
			
 
				-vis_results/
			
--- a/masked_image_modeling/README.md
+++ b/masked_image_modeling/README.md
@@ -1,73 +0,0 @@
 
				-# Masked AutoEncoder
			
 
				-
			
 
				-## 1. Pretrain
			
 
				-We have kindly provided the bash script `main_pretrain.sh` file for pretraining. You can modify some hyperparameters in the script file according to your own needs.
			
 
				-
			
 
				-```Shell
			
 
				-cd Vision-Pretraining-Tutorial/masked_image_modeling/
			
 
				-python main_pretrain.py --cuda \
			
 
				-                        --dataset cifar10 \
			
 
				-                        --model vit_t \
			
 
				-                        --mask_ratio 0.75 \
			
 
				-                        --batch_size 128 \
			
 
				-                        --optimizer adamw \
			
 
				-                        --weight_decay 0.05 \
			
 
				-                        --lr_scheduler cosine \
			
 
				-                        --base_lr 0.00015 \
			
 
				-                        --min_lr 0.0 \
			
 
				-                        --max_epoch 400 \
			
 
				-                        --eval_epoch 20
			
 
				-```
			
 
				-
			
 
				-## 2. Finetune
			
 
				-We have kindly provided the bash script `main_finetune.sh` file for finetuning. You can modify some hyperparameters in the script file according to your own needs.
			
 
				-
			
 
				-```Shell
			
 
				-cd Vision-Pretraining-Tutorial/masked_image_modeling/
			
 
				-python main_finetune.py --cuda \
			
 
				-                        --dataset cifar10 \
			
 
				-                        --model vit_t \
			
 
				-                        --batch_size 256 \
			
 
				-                        --optimizer adamw \
			
 
				-                        --weight_decay 0.05 \
			
 
				-                        --base_lr 0.0005 \
			
 
				-                        --min_lr 0.000001 \
			
 
				-                        --max_epoch 100 \
			
 
				-                        --wp_epoch 5 \
			
 
				-                        --eval_epoch 5 \
			
 
				-                        --pretrained path/to/vit_t.pth
			
 
				-```
			
 
				-## 3. Evaluate 
			
 
				-- Evaluate the `top1 & top5` accuracy of `ViT-Tiny` on CIFAR10 dataset:
			
 
				-```Shell
			
 
				-python main_finetune.py --cuda \
			
 
				-                        --dataset cifar10 \
			
 
				-                        -m vit_t \
			
 
				-                        --batch_size 256 \
			
 
				-                        --eval \
			
 
				-                        --resume path/to/vit_t_cifar10.pth
			
 
				-```
			
 
				-
			
 
				-
			
 
				-## 4. Visualize Image Reconstruction
			
 
				-- Evaluate `ViT-Tiny` pretrained by MAE framework on CIFAR10 dataset:
			
 
				-```Shell
			
 
				-python main_pretrain.py --cuda \
			
 
				-                        --dataset cifar10 \
			
 
				-                        -m vit_t \
			
 
				-                        --resume path/to/mae_vit_t_cifar10.pth \
			
 
				-                        --eval \
			
 
				-                        --batch_size 1
			
 
				-```
			
 
				-
			
 
				-
			
 
				-## 5. Experiments
			
 
				-- On CIFAR10
			
 
				-
			
 
				-| Method |  Model  | Epoch | Top 1    | Weight |  MAE weight  |
			
 
				-|  :---: |  :---:  | :---: | :---:    | :---:  |    :---:     |
			
 
				-|  MAE   |  ViT-T  | 100   |   91.2   | [ckpt](https://github.com/yjh0410/MAE/releases/download/checkpoints/ViT-T_Cifar10.pth) | [ckpt](https://github.com/yjh0410/MAE/releases/download/checkpoints/MAE_ViT-T_Cifar10.pth) |
			
 
				-
			
 
				-
			
 
				-## 6. Acknowledgment
			
 
				-Thank you to **Kaiming He** for his inspiring work on [MAE](http://openaccess.thecvf.com/content/CVPR2022/papers/He_Masked_Autoencoders_Are_Scalable_Vision_Learners_CVPR_2022_paper.pdf). His research effectively elucidates the semantic distinctions between vision and language, offering valuable insights for subsequent vision-related studies. I would also like to express my gratitude for the official source code of [MAE](https://github.com/facebookresearch/mae). Additionally, I appreciate the efforts of [**IcarusWizard**](https://github.com/IcarusWizard) for reproducing the [MAE](https://github.com/IcarusWizard/MAE) implementation.
			
--- a/masked_image_modeling/data/__init__.py
+++ b/masked_image_modeling/data/__init__.py
@@ -1,38 +0,0 @@
 
				-import torch
			
 
				-
			
 
				-from .cifar import CifarDataset
			
 
				-from .custom import CustomDataset
			
 
				-
			
 
				-
			
 
				-def build_dataset(args, is_train=False):
			
 
				-    # ----------------- CIFAR dataset -----------------
			
 
				-    if args.dataset == 'cifar10':
			
 
				-        args.num_classes = 10
			
 
				-        args.img_dim = 3
			
 
				-        args.img_size = 32
			
 
				-        args.patch_size = 4
			
 
				-        return CifarDataset(is_train)
			
 
				-        
			
 
				-    # ----------------- Customed dataset -----------------
			
 
				-    elif args.dataset == 'custom':
			
 
				-        assert args.num_classes is not None and isinstance(args.num_classes, int)
			
 
				-        args.img_size = 224
			
 
				-        args.patch_size = 16
			
 
				-        return CustomDataset(args, is_train)
			
 
				-    
			
 
				-    else:
			
 
				-        print("Unknown dataset: {}".format(args.dataset))
			
 
				-    
			
 
				-
			
 
				-def build_dataloader(args, dataset, is_train=False):
			
 
				-    if is_train:
			
 
				-        sampler = torch.utils.data.RandomSampler(dataset)
			
 
				-        batch_sampler_train = torch.utils.data.BatchSampler(
			
 
				-            sampler, args.batch_size, drop_last=True if is_train else False)
			
 
				-        dataloader = torch.utils.data.DataLoader(
			
 
				-            dataset, batch_sampler=batch_sampler_train, num_workers=args.num_workers, pin_memory=True)
			
 
				-    else:
			
 
				-        dataloader = torch.utils.data.DataLoader(
			
 
				-            dataset=dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers)
			
 
				-
			
 
				-    return dataloader
			
--- a/masked_image_modeling/data/cifar.py
+++ b/masked_image_modeling/data/cifar.py
@@ -1,65 +0,0 @@
 
				-import os
			
 
				-import numpy as np
			
 
				-import torch.utils.data as data
			
 
				-import torchvision.transforms as T
			
 
				-from torchvision.datasets import CIFAR10
			
 
				-
			
 
				-
			
 
				-class CifarDataset(data.Dataset):
			
 
				-    def __init__(self, is_train=False):
			
 
				-        super().__init__()
			
 
				-        # ----------------- basic parameters -----------------
			
 
				-        self.pixel_mean = [0.5, 0.5, 0.5]
			
 
				-        self.pixel_std =  [0.5, 0.5, 0.5]
			
 
				-        self.is_train  = is_train
			
 
				-        self.image_set = 'train' if is_train else 'val'
			
 
				-        # ----------------- dataset & transforms -----------------
			
 
				-        self.transform = self.build_transform()
			
 
				-        path = os.path.dirname(os.path.abspath(__file__))
			
 
				-        if is_train:
			
 
				-            self.dataset = CIFAR10(os.path.join(path, 'cifar_data/'), train=True, download=True, transform=self.transform)
			
 
				-        else:
			
 
				-            self.dataset = CIFAR10(os.path.join(path, 'cifar_data/'), train=False, download=True, transform=self.transform)
			
 
				-
			
 
				-    def __len__(self):
			
 
				-        return len(self.dataset)
			
 
				-    
			
 
				-    def __getitem__(self, index):
			
 
				-        image, target = self.dataset[index]
			
 
				-            
			
 
				-        return image, target
			
 
				-    
			
 
				-    def pull_image(self, index):
			
 
				-        # laod data
			
 
				-        image, target = self.dataset[index]
			
 
				-
			
 
				-        # denormalize image
			
 
				-        image = image.permute(1, 2, 0).numpy()
			
 
				-        image = (image * self.pixel_std + self.pixel_mean) * 255.
			
 
				-        image = image.astype(np.uint8)
			
 
				-        image = image.copy()
			
 
				-
			
 
				-        return image, target
			
 
				-
			
 
				-    def build_transform(self):
			
 
				-        if self.is_train:
			
 
				-            transforms = T.Compose([T.ToTensor(), T.Normalize(0.5, 0.5)])
			
 
				-        else:
			
 
				-            transforms = T.Compose([T.ToTensor(), T.Normalize(0.5, 0.5)])
			
 
				-
			
 
				-        return transforms
			
 
				-
			
 
				-if __name__ == "__main__":
			
 
				-    import cv2
			
 
				-    
			
 
				-    # dataset
			
 
				-    dataset = CifarDataset(is_train=True)  
			
 
				-    print('Dataset size: ', len(dataset))
			
 
				-
			
 
				-    for i in range(len(dataset)):
			
 
				-        image, target = dataset.pull_image(i)
			
 
				-        # to BGR
			
 
				-        image = image[..., (2, 1, 0)]
			
 
				-
			
 
				-        cv2.imshow('image', image)
			
 
				-        cv2.waitKey(0)
			
--- a/masked_image_modeling/data/custom.py
+++ b/masked_image_modeling/data/custom.py
@@ -1,87 +0,0 @@
 
				-import os
			
 
				-import PIL
			
 
				-import numpy as np
			
 
				-import torch.utils.data as data
			
 
				-import torchvision.transforms as T
			
 
				-from torchvision.datasets import ImageFolder
			
 
				-
			
 
				-
			
 
				-class CustomDataset(data.Dataset):
			
 
				-    def __init__(self, args, is_train=False):
			
 
				-        super().__init__()
			
 
				-        # ----------------- basic parameters -----------------
			
 
				-        self.args = args
			
 
				-        self.is_train   = is_train
			
 
				-        self.pixel_mean = [0.485, 0.456, 0.406]
			
 
				-        self.pixel_std  = [0.229, 0.224, 0.225]
			
 
				-        print("Pixel mean: {}".format(self.pixel_mean))
			
 
				-        print("Pixel std:  {}".format(self.pixel_std))
			
 
				-        self.image_set = 'train' if is_train else 'val'
			
 
				-        self.data_path = os.path.join(args.root, self.image_set)
			
 
				-        # ----------------- dataset & transforms -----------------
			
 
				-        self.transform = self.build_transform()
			
 
				-        self.dataset = ImageFolder(root=self.data_path, transform=self.transform)
			
 
				-
			
 
				-    def __len__(self):
			
 
				-        return len(self.dataset)
			
 
				-    
			
 
				-    def __getitem__(self, index):
			
 
				-        image, target = self.dataset[index]
			
 
				-
			
 
				-        return image, target
			
 
				-    
			
 
				-    def pull_image(self, index):
			
 
				-        # laod data
			
 
				-        image, target = self.dataset[index]
			
 
				-
			
 
				-        # denormalize image
			
 
				-        image = image.permute(1, 2, 0).numpy()
			
 
				-        image = (image * self.pixel_std + self.pixel_mean) * 255.
			
 
				-        image = image.astype(np.uint8)
			
 
				-        image = image.copy()
			
 
				-
			
 
				-        return image, target
			
 
				-
			
 
				-    def build_transform(self):
			
 
				-        if self.is_train:
			
 
				-            transforms = T.Compose([
			
 
				-                            T.RandomResizedCrop(224),
			
 
				-                            T.RandomHorizontalFlip(0.5),
			
 
				-                            T.ToTensor(),
			
 
				-                            T.Normalize(self.pixel_mean,
			
 
				-                                        self.pixel_std)])
			
 
				-        else:
			
 
				-            transforms = T.Compose([
			
 
				-                T.Resize(224, interpolation=PIL.Image.BICUBIC),
			
 
				-                T.CenterCrop(224),
			
 
				-                T.ToTensor(),
			
 
				-                T.Normalize(self.pixel_mean, self.pixel_std),
			
 
				-            ])
			
 
				-
			
 
				-        return transforms
			
 
				-
			
 
				-
			
 
				-if __name__ == "__main__":
			
 
				-    import cv2
			
 
				-    import argparse
			
 
				-    
			
 
				-    parser = argparse.ArgumentParser(description='Custom-Dataset')
			
 
				-
			
 
				-    # opt
			
 
				-    parser.add_argument('--root', default='/Users/liuhaoran/Desktop/python_work/classification/dataset/Animals/',
			
 
				-                        help='data root')
			
 
				-    parser.add_argument('--img_size', default=224, type=int,
			
 
				-                        help='input image size.')
			
 
				-    args = parser.parse_args()
			
 
				-  
			
 
				-    # Dataset
			
 
				-    dataset = CustomDataset(args, is_train=True)  
			
 
				-    print('Dataset size: ', len(dataset))
			
 
				-
			
 
				-    for i in range(len(dataset)):
			
 
				-        image, target = dataset.pull_image(i)
			
 
				-        # to BGR
			
 
				-        image = image[..., (2, 1, 0)]
			
 
				-
			
 
				-        cv2.imshow('image', image)
			
 
				-        cv2.waitKey(0)
			
--- a/masked_image_modeling/engine_finetune.py
+++ b/masked_image_modeling/engine_finetune.py
@@ -1,107 +0,0 @@
 
				-import sys
			
 
				-import math
			
 
				-import torch
			
 
				-
			
 
				-from utils.misc import MetricLogger, SmoothedValue, accuracy
			
 
				-
			
 
				-
			
 
				-def train_one_epoch(args,
			
 
				-                    device,
			
 
				-                    model,
			
 
				-                    data_loader,
			
 
				-                    optimizer,
			
 
				-                    epoch,
			
 
				-                    lr_scheduler_warmup,
			
 
				-                    criterion,
			
 
				-                    ):
			
 
				-    model.train(True)
			
 
				-    metric_logger = MetricLogger(delimiter="  ")
			
 
				-    metric_logger.add_meter('lr', SmoothedValue(window_size=1, fmt='{value:.6f}'))
			
 
				-    header = 'Epoch: [{}]'.format(epoch)
			
 
				-    print_freq = 20
			
 
				-    epoch_size = len(data_loader)
			
 
				-
			
 
				-    optimizer.zero_grad()
			
 
				-
			
 
				-    # train one epoch
			
 
				-    for iter_i, (images, targets) in enumerate(metric_logger.log_every(data_loader, print_freq, header)):
			
 
				-        ni = iter_i + epoch * epoch_size
			
 
				-        nw = args.wp_epoch * epoch_size
			
 
				-
			
 
				-        # Warmup
			
 
				-        if nw > 0 and ni < nw:
			
 
				-            lr_scheduler_warmup(ni, optimizer)
			
 
				-        elif ni == nw:
			
 
				-            print("Warmup stage is over.")
			
 
				-            lr_scheduler_warmup.set_lr(optimizer, args.base_lr)
			
 
				-
			
 
				-        # To device
			
 
				-        images = images.to(device, non_blocking=True)
			
 
				-        targets = targets.to(device, non_blocking=True)
			
 
				-
			
 
				-        # Inference
			
 
				-        output = model(images)
			
 
				-
			
 
				-        # Compute loss
			
 
				-        loss = criterion(output, targets)
			
 
				-
			
 
				-        # Check loss
			
 
				-        loss_value = loss.item()
			
 
				-        if not math.isfinite(loss_value):
			
 
				-            print("Loss is {}, stopping training".format(loss_value))
			
 
				-            sys.exit(1)
			
 
				-
			
 
				-        # Backward
			
 
				-        loss.backward()
			
 
				-
			
 
				-        # Optimize
			
 
				-        optimizer.step()
			
 
				-        optimizer.zero_grad()
			
 
				-
			
 
				-        # Logs
			
 
				-        lr = optimizer.param_groups[0]["lr"]
			
 
				-        metric_logger.update(loss=loss_value)
			
 
				-        metric_logger.update(lr=lr)
			
 
				-
			
 
				-    # gather the stats from all processes
			
 
				-    print("Averaged stats: {}".format(metric_logger))
			
 
				-
			
 
				-    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
			
 
				-
			
 
				-
			
 
				-@torch.no_grad()
			
 
				-def evaluate(data_loader, model, device):
			
 
				-    criterion = torch.nn.CrossEntropyLoss()
			
 
				-
			
 
				-    metric_logger = MetricLogger(delimiter="  ")
			
 
				-    header = 'Test:'
			
 
				-
			
 
				-    # switch to evaluation mode
			
 
				-    model.eval()
			
 
				-
			
 
				-    for batch in metric_logger.log_every(data_loader, 10, header):
			
 
				-        images = batch[0]
			
 
				-        target = batch[1]
			
 
				-        images = images.to(device, non_blocking=True)
			
 
				-        target = target.to(device, non_blocking=True)
			
 
				-
			
 
				-        # Inference
			
 
				-        output = model(images)
			
 
				-
			
 
				-        # Compute loss
			
 
				-        loss = criterion(output, target)
			
 
				-
			
 
				-        # Compute accuracy
			
 
				-        acc1, acc5 = accuracy(output, target, topk=(1, 5))
			
 
				-
			
 
				-        batch_size = images.shape[0]
			
 
				-        metric_logger.update(loss=loss.item())
			
 
				-        metric_logger.meters['acc1'].update(acc1.item(), n=batch_size)
			
 
				-        metric_logger.meters['acc5'].update(acc5.item(), n=batch_size)
			
 
				-
			
 
				-    # gather the stats from all processes
			
 
				-    print('* Acc@1 {top1.global_avg:.3f} Acc@5 {top5.global_avg:.3f} loss {losses.global_avg:.3f}'
			
 
				-          .format(top1=metric_logger.acc1, top5=metric_logger.acc5, losses=metric_logger.loss),
			
 
				-          )
			
 
				-
			
 
				-    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
			
--- a/masked_image_modeling/engine_pretrain.py
+++ b/masked_image_modeling/engine_pretrain.py
@@ -1,64 +0,0 @@
 
				-import sys
			
 
				-import math
			
 
				-
			
 
				-from utils.misc import MetricLogger, SmoothedValue
			
 
				-
			
 
				-
			
 
				-def train_one_epoch(args,
			
 
				-                    device,
			
 
				-                    model,
			
 
				-                    data_loader,
			
 
				-                    optimizer,
			
 
				-                    epoch,
			
 
				-                    lr_scheduler_warmup,
			
 
				-                    ):
			
 
				-    model.train(True)
			
 
				-    metric_logger = MetricLogger(delimiter="  ")
			
 
				-    metric_logger.add_meter('lr', SmoothedValue(window_size=1, fmt='{value:.6f}'))
			
 
				-    header = 'Epoch: [{}]'.format(epoch)
			
 
				-    print_freq = 20
			
 
				-    epoch_size = len(data_loader)
			
 
				-
			
 
				-    # Train one epoch
			
 
				-    for iter_i, (images, _) in enumerate(metric_logger.log_every(data_loader, print_freq, header)):
			
 
				-        ni = iter_i + epoch * epoch_size
			
 
				-        nw = args.wp_epoch * epoch_size
			
 
				-        
			
 
				-        # Warmup
			
 
				-        if nw > 0 and ni < nw:
			
 
				-            lr_scheduler_warmup(ni, optimizer)
			
 
				-        elif ni == nw:
			
 
				-            print("Warmup stage is over.")
			
 
				-            lr_scheduler_warmup.set_lr(optimizer, args.base_lr)
			
 
				-
			
 
				-        # To device
			
 
				-        images = images.to(device, non_blocking=True)
			
 
				-
			
 
				-        # Inference
			
 
				-        output = model(images)
			
 
				-
			
 
				-        # Compute loss
			
 
				-        loss = output["loss"]
			
 
				-
			
 
				-        # Check loss
			
 
				-        loss_value = loss.item()
			
 
				-        if not math.isfinite(loss_value):
			
 
				-            print("Loss is {}, stopping training".format(loss_value))
			
 
				-            sys.exit(1)
			
 
				-
			
 
				-        # Backward
			
 
				-        loss.backward()
			
 
				-
			
 
				-        # Optimize
			
 
				-        optimizer.step()
			
 
				-        optimizer.zero_grad()
			
 
				-
			
 
				-        # Logs
			
 
				-        lr = optimizer.param_groups[0]["lr"]
			
 
				-        metric_logger.update(loss=loss_value)
			
 
				-        metric_logger.update(lr=lr)
			
 
				-
			
 
				-    # gather the stats from all processes
			
 
				-    print("Averaged stats: {}".format(metric_logger))
			
 
				-
			
 
				-    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
			
--- a/masked_image_modeling/main_finetune.py
+++ b/masked_image_modeling/main_finetune.py
@@ -1,175 +0,0 @@
 
				-import os
			
 
				-import time
			
 
				-import argparse
			
 
				-import datetime
			
 
				-
			
 
				-# ---------------- Torch compoments ----------------
			
 
				-import torch
			
 
				-import torch.backends.cudnn as cudnn
			
 
				-
			
 
				-# ---------------- Dataset compoments ----------------
			
 
				-from data import build_dataset, build_dataloader
			
 
				-
			
 
				-# ---------------- Model compoments ----------------
			
 
				-from models import build_model
			
 
				-
			
 
				-# ---------------- Utils compoments ----------------
			
 
				-from utils.misc import setup_seed, load_model, save_model
			
 
				-from utils.optimizer import build_optimizer
			
 
				-from utils.lr_scheduler import build_lr_scheduler, LinearWarmUpLrScheduler
			
 
				-
			
 
				-# ---------------- Training engine ----------------
			
 
				-from engine_finetune import train_one_epoch, evaluate
			
 
				-
			
 
				-
			
 
				-def parse_args():
			
 
				-    parser = argparse.ArgumentParser()
			
 
				-    # Input
			
 
				-    parser.add_argument('--img_dim', type=int, default=3,
			
 
				-                        help='3 for RGB; 1 for Gray.')    
			
 
				-    parser.add_argument('--patch_size', type=int, default=16,
			
 
				-                        help='patch_size.')
			
 
				-    # Basic
			
 
				-    parser.add_argument('--seed', type=int, default=42,
			
 
				-                        help='random seed.')
			
 
				-    parser.add_argument('--cuda', action='store_true', default=False,
			
 
				-                        help='use cuda')
			
 
				-    parser.add_argument('--batch_size', type=int, default=256,
			
 
				-                        help='batch size on all GPUs')
			
 
				-    parser.add_argument('--num_workers', type=int, default=4,
			
 
				-                        help='number of workers')
			
 
				-    parser.add_argument('--path_to_save', type=str, default='weights/',
			
 
				-                        help='path to save trained model.')
			
 
				-    parser.add_argument('--eval', action='store_true', default=False,
			
 
				-                        help='evaluate model.')
			
 
				-    # Epoch
			
 
				-    parser.add_argument('--wp_epoch', type=int, default=5, 
			
 
				-                        help='warmup epoch')
			
 
				-    parser.add_argument('--start_epoch', type=int, default=0, 
			
 
				-                        help='start epoch')
			
 
				-    parser.add_argument('--max_epoch', type=int, default=50, 
			
 
				-                        help='max epoch')
			
 
				-    parser.add_argument('--eval_epoch', type=int, default=5, 
			
 
				-                        help='max epoch')
			
 
				-    # Dataset
			
 
				-    parser.add_argument('--dataset', type=str, default='cifar10',
			
 
				-                        help='dataset name')
			
 
				-    parser.add_argument('--root', type=str, default='/mnt/share/ssd2/dataset',
			
 
				-                        help='path to dataset folder')
			
 
				-    parser.add_argument('--num_classes', type=int, default=None, 
			
 
				-                        help='number of classes.')
			
 
				-    # Model
			
 
				-    parser.add_argument('-m', '--model', type=str, default='vit_t',
			
 
				-                        help='model name')
			
 
				-    parser.add_argument('--pretrained', default=None, type=str,
			
 
				-                        help='load pretrained weight.')
			
 
				-    parser.add_argument('--resume', default=None, type=str,
			
 
				-                        help='keep training')
			
 
				-    parser.add_argument('--drop_path', type=float, default=0.1,
			
 
				-                        help='drop_path')
			
 
				-    # Optimizer
			
 
				-    parser.add_argument('-opt', '--optimizer', type=str, default='adamw',
			
 
				-                        help='sgd, adam')
			
 
				-    parser.add_argument('-wd', '--weight_decay', type=float, default=0.05,
			
 
				-                        help='weight decay')
			
 
				-    parser.add_argument('--base_lr', type=float, default=0.001,
			
 
				-                        help='learning rate for training model')
			
 
				-    parser.add_argument('--min_lr', type=float, default=0,
			
 
				-                        help='the final lr')
			
 
				-    # Lr scheduler
			
 
				-    parser.add_argument('-lrs', '--lr_scheduler', type=str, default='cosine',
			
 
				-                        help='step, cosine')
			
 
				-
			
 
				-    return parser.parse_args()
			
 
				-
			
 
				-    
			
 
				-def main():
			
 
				-    args = parse_args()
			
 
				-    # set random seed
			
 
				-    setup_seed(args.seed)
			
 
				-
			
 
				-    # Path to save model
			
 
				-    path_to_save = os.path.join(args.path_to_save, args.dataset, "finetune", args.model)
			
 
				-    os.makedirs(path_to_save, exist_ok=True)
			
 
				-    args.output_dir = path_to_save
			
 
				-
			
 
				-    # ------------------------- Build CUDA -------------------------
			
 
				-    if args.cuda:
			
 
				-        if torch.cuda.is_available():
			
 
				-            cudnn.benchmark = True
			
 
				-            device = torch.device("cuda")
			
 
				-        else:
			
 
				-            print('There is no available GPU.')
			
 
				-            args.cuda = False
			
 
				-            device = torch.device("cpu")
			
 
				-    else:
			
 
				-        device = torch.device("cpu")
			
 
				-
			
 
				-    # ------------------------- Build Dataset -------------------------
			
 
				-    train_dataset = build_dataset(args, is_train=True)
			
 
				-    val_dataset   = build_dataset(args, is_train=False)
			
 
				-
			
 
				-    # ------------------------- Build Dataloader -------------------------
			
 
				-    train_dataloader = build_dataloader(args, train_dataset, is_train=True)
			
 
				-    val_dataloader   = build_dataloader(args, val_dataset,   is_train=False)
			
 
				-
			
 
				-    print('=================== Dataset Information ===================')
			
 
				-    print('Train dataset size : ', len(train_dataset))
			
 
				-    print('Val dataset size   : ', len(val_dataset))
			
 
				-
			
 
				-    # ------------------------- Build Model -------------------------
			
 
				-    model = build_model(args, model_type='cls')
			
 
				-    model.train().to(device)
			
 
				-    print(model)
			
 
				-
			
 
				-    # ------------------------- Build Optimzier -------------------------
			
 
				-    optimizer = build_optimizer(args, model)
			
 
				-
			
 
				-    # ------------------------- Build Lr Scheduler -------------------------
			
 
				-    lr_scheduler_warmup = LinearWarmUpLrScheduler(args.base_lr, wp_iter=args.wp_epoch * len(train_dataloader))
			
 
				-    lr_scheduler = build_lr_scheduler(args, optimizer)
			
 
				-
			
 
				-
			
 
				-    # ------------------------- Build Criterion -------------------------
			
 
				-    criterion = torch.nn.CrossEntropyLoss()
			
 
				-    load_model(args, model, optimizer, lr_scheduler)
			
 
				-
			
 
				-    # ------------------------- Eval before Train Pipeline -------------------------
			
 
				-    if args.eval:
			
 
				-        print('evaluating ...')
			
 
				-        test_stats = evaluate(val_dataloader, model, device)
			
 
				-        print('Eval Results: [loss: %.2f][acc1: %.2f][acc5 : %.2f]' %
			
 
				-                (test_stats['loss'], test_stats['acc1'], test_stats['acc5']), flush=True)
			
 
				-        return
			
 
				-
			
 
				-    # ------------------------- Training Pipeline -------------------------
			
 
				-    start_time = time.time()
			
 
				-    max_accuracy = -1.0
			
 
				-    print("=============== Start training for {} epochs ===============".format(args.max_epoch))
			
 
				-    for epoch in range(args.start_epoch, args.max_epoch):
			
 
				-        # Train one epoch
			
 
				-        train_one_epoch(args, device, model, train_dataloader, optimizer,
			
 
				-                        epoch, lr_scheduler_warmup, criterion)
			
 
				-
			
 
				-        # LR scheduler
			
 
				-        if (epoch + 1) > args.wp_epoch:
			
 
				-            lr_scheduler.step()
			
 
				-
			
 
				-        # Evaluate
			
 
				-        if (epoch % args.eval_epoch) == 0 or (epoch + 1 == args.max_epoch):
			
 
				-            test_stats = evaluate(val_dataloader, model, device)
			
 
				-            print(f"Accuracy of the network on the {len(val_dataset)} test images: {test_stats['acc1']:.1f}%")
			
 
				-            max_accuracy = max(max_accuracy, test_stats["acc1"])
			
 
				-            print(f'Max accuracy: {max_accuracy:.2f}%')
			
 
				-
			
 
				-            # Save model
			
 
				-            print('- saving the model after {} epochs ...'.format(epoch))
			
 
				-            save_model(args, epoch, model, optimizer, lr_scheduler, acc1=max_accuracy)
			
 
				-
			
 
				-    total_time = time.time() - start_time
			
 
				-    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
			
 
				-    print('Training time {}'.format(total_time_str))
			
 
				-
			
 
				-
			
 
				-if __name__ == "__main__":
			
 
				-    main()
			
--- a/masked_image_modeling/main_pretrain.py
+++ b/masked_image_modeling/main_pretrain.py
@@ -1,211 +0,0 @@
 
				-import os
			
 
				-import cv2
			
 
				-import time
			
 
				-import datetime
			
 
				-import argparse
			
 
				-import numpy as np
			
 
				-
			
 
				-# ---------------- Torch compoments ----------------
			
 
				-import torch
			
 
				-import torch.backends.cudnn as cudnn
			
 
				-
			
 
				-# ---------------- Dataset compoments ----------------
			
 
				-from data import build_dataset, build_dataloader
			
 
				-from models import build_model
			
 
				-
			
 
				-# ---------------- Utils compoments ----------------
			
 
				-from utils.misc import setup_seed
			
 
				-from utils.misc import load_model, save_model, unpatchify
			
 
				-from utils.optimizer import build_optimizer
			
 
				-from utils.lr_scheduler import build_lr_scheduler, LinearWarmUpLrScheduler
			
 
				-
			
 
				-# ---------------- Training engine ----------------
			
 
				-from engine_pretrain import train_one_epoch
			
 
				-
			
 
				-
			
 
				-def parse_args():
			
 
				-    parser = argparse.ArgumentParser()
			
 
				-    # Basic
			
 
				-    parser.add_argument('--seed', type=int, default=42,
			
 
				-                        help='random seed.')
			
 
				-    parser.add_argument('--cuda', action='store_true', default=False,
			
 
				-                        help='use cuda')
			
 
				-    parser.add_argument('--batch_size', type=int, default=256,
			
 
				-                        help='batch size on all GPUs')
			
 
				-    parser.add_argument('--num_workers', type=int, default=4,
			
 
				-                        help='number of workers')
			
 
				-    parser.add_argument('--path_to_save', type=str, default='weights/',
			
 
				-                        help='path to save trained model.')
			
 
				-    parser.add_argument('--eval', action='store_true', default=False,
			
 
				-                        help='evaluate model.')
			
 
				-    # Epoch
			
 
				-    parser.add_argument('--wp_epoch', type=int, default=20, 
			
 
				-                        help='warmup epoch for finetune with MAE pretrained')
			
 
				-    parser.add_argument('--start_epoch', type=int, default=0, 
			
 
				-                        help='start epoch for finetune with MAE pretrained')
			
 
				-    parser.add_argument('--eval_epoch', type=int, default=10, 
			
 
				-                        help='warmup epoch for finetune with MAE pretrained')
			
 
				-    parser.add_argument('--max_epoch', type=int, default=200, 
			
 
				-                        help='max epoch')
			
 
				-    # Dataset
			
 
				-    parser.add_argument('--dataset', type=str, default='cifar10',
			
 
				-                        help='dataset name')
			
 
				-    parser.add_argument('--root', type=str, default='/mnt/share/ssd2/dataset',
			
 
				-                        help='path to dataset folder')
			
 
				-    parser.add_argument('--num_classes', type=int, default=None, 
			
 
				-                        help='number of classes.')
			
 
				-    # Model
			
 
				-    parser.add_argument('-m', '--model', type=str, default='vit_t',
			
 
				-                        help='model name')
			
 
				-    parser.add_argument('--resume', default=None, type=str,
			
 
				-                        help='keep training')
			
 
				-    parser.add_argument('--drop_path', type=float, default=0.,
			
 
				-                        help='drop_path')
			
 
				-    parser.add_argument('--mask_ratio', type=float, default=0.75,
			
 
				-                        help='mask ratio.')    
			
 
				-    # Optimizer
			
 
				-    parser.add_argument('-opt', '--optimizer', type=str, default='adamw',
			
 
				-                        help='sgd, adam')
			
 
				-    parser.add_argument('-wd', '--weight_decay', type=float, default=0.05,
			
 
				-                        help='weight decay')
			
 
				-    parser.add_argument('--base_lr', type=float, default=0.00015,
			
 
				-                        help='learning rate for training model')
			
 
				-    parser.add_argument('--min_lr', type=float, default=0,
			
 
				-                        help='the final lr')
			
 
				-    # Optimizer
			
 
				-    parser.add_argument('-lrs', '--lr_scheduler', type=str, default='cosine',
			
 
				-                        help='step, cosine')
			
 
				-
			
 
				-    return parser.parse_args()
			
 
				-
			
 
				-    
			
 
				-def main():
			
 
				-    args = parse_args()
			
 
				-    # set random seed
			
 
				-    setup_seed(args.seed)
			
 
				-
			
 
				-    # Path to save model
			
 
				-    path_to_save = os.path.join(args.path_to_save, args.dataset, "pretrained", args.model)
			
 
				-    os.makedirs(path_to_save, exist_ok=True)
			
 
				-    args.output_dir = path_to_save
			
 
				-    
			
 
				-    # ------------------------- Build CUDA -------------------------
			
 
				-    if args.cuda:
			
 
				-        if torch.cuda.is_available():
			
 
				-            cudnn.benchmark = True
			
 
				-            device = torch.device("cuda")
			
 
				-        else:
			
 
				-            print('There is no available GPU.')
			
 
				-            args.cuda = False
			
 
				-            device = torch.device("cpu")
			
 
				-    else:
			
 
				-        device = torch.device("cpu")
			
 
				-
			
 
				-    # ------------------------- Build Dataset -------------------------
			
 
				-    train_dataset = build_dataset(args, is_train=True)
			
 
				-
			
 
				-    # ------------------------- Build Dataloader -------------------------
			
 
				-    train_dataloader = build_dataloader(args, train_dataset, is_train=True)
			
 
				-    print('=================== Dataset Information ===================')
			
 
				-    print('Train dataset size : {}'.format(len(train_dataset)))
			
 
				-
			
 
				-   # ------------------------- Build Model -------------------------
			
 
				-    model = build_model(args, model_type='mae')
			
 
				-    model.train().to(device)
			
 
				-    print(model)
			
 
				-
			
 
				-    # ------------------------- Build Optimzier -------------------------
			
 
				-    optimizer = build_optimizer(args, model)
			
 
				-
			
 
				-    # ------------------------- Build Lr Scheduler -------------------------
			
 
				-    lr_scheduler_warmup = LinearWarmUpLrScheduler(args.base_lr, wp_iter=args.wp_epoch * len(train_dataloader))
			
 
				-    lr_scheduler = build_lr_scheduler(args, optimizer)
			
 
				-
			
 
				-    # ------------------------- Build checkpoint -------------------------
			
 
				-    load_model(args, model, optimizer, lr_scheduler)
			
 
				-
			
 
				-    # ------------------------- Eval before Train Pipeline -------------------------
			
 
				-    if args.eval:
			
 
				-        print('visualizing ...')
			
 
				-        visualize(args, device, model)
			
 
				-        return
			
 
				-
			
 
				-    # ------------------------- Training Pipeline -------------------------
			
 
				-    start_time = time.time()
			
 
				-    print("=================== Start training for {} epochs ===================".format(args.max_epoch))
			
 
				-    for epoch in range(args.start_epoch, args.max_epoch):
			
 
				-        # Train one epoch
			
 
				-        train_one_epoch(args, device, model, train_dataloader,
			
 
				-                        optimizer, epoch, lr_scheduler_warmup)
			
 
				-
			
 
				-        # LR scheduler
			
 
				-        if (epoch + 1) > args.wp_epoch:
			
 
				-            lr_scheduler.step()
			
 
				-
			
 
				-        # Evaluate
			
 
				-        if epoch % args.eval_epoch == 0 or epoch + 1 == args.max_epoch:
			
 
				-            print('- saving the model after {} epochs ...'.format(epoch))
			
 
				-            save_model(args, epoch, model, optimizer, lr_scheduler, mae_task=True)
			
 
				-
			
 
				-    total_time = time.time() - start_time
			
 
				-    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
			
 
				-    print('Training time {}'.format(total_time_str))
			
 
				-
			
 
				-def visualize(args, device, model):
			
 
				-    # test dataset
			
 
				-    val_dataset = build_dataset(args, is_train=False)
			
 
				-    val_dataloader = build_dataloader(args, val_dataset, is_train=False)
			
 
				-
			
 
				-    # save path
			
 
				-    save_path = "vis_results/{}/{}".format(args.dataset, args.model)
			
 
				-    os.makedirs(save_path, exist_ok=True)
			
 
				-
			
 
				-    # switch to evaluate mode
			
 
				-    model.eval()
			
 
				-    patch_size = args.patch_size
			
 
				-    pixel_mean = val_dataloader.dataset.pixel_mean
			
 
				-    pixel_std  = val_dataloader.dataset.pixel_std
			
 
				-
			
 
				-    with torch.no_grad():
			
 
				-        for i, (images, target) in enumerate(val_dataloader):
			
 
				-            # To device
			
 
				-            images = images.to(device, non_blocking=True)
			
 
				-            target = target.to(device, non_blocking=True)
			
 
				-
			
 
				-            # Inference
			
 
				-            output = model(images)
			
 
				-
			
 
				-            # Denormalize input image
			
 
				-            org_img = images[0].permute(1, 2, 0).cpu().numpy()
			
 
				-            org_img = (org_img * pixel_std + pixel_mean) * 255.
			
 
				-            org_img = org_img.astype(np.uint8)
			
 
				-
			
 
				-            # 调整mask的格式：[B, H*W] -> [B, H*W, p*p*3]
			
 
				-            mask = output['mask'].unsqueeze(-1).repeat(1, 1, patch_size**2 *3)  # [B, H*W] -> [B, H*W, p*p*3]
			
 
				-            # 将序列格式的mask逆转回二维图像格式
			
 
				-            mask = unpatchify(mask, patch_size)
			
 
				-            mask = mask[0].permute(1, 2, 0).cpu().numpy()
			
 
				-            # 掩盖图像中被遮掩的图像patch区域
			
 
				-            masked_img = org_img * (1 - mask)  # 1 is removing, 0 is keeping
			
 
				-            masked_img = masked_img.astype(np.uint8)
			
 
				-
			
 
				-            # 将序列格式的重构图像逆转回二维图像格式
			
 
				-            pred_img = unpatchify(output['x_pred'], patch_size)
			
 
				-            pred_img = pred_img[0].permute(1, 2, 0).cpu().numpy()
			
 
				-            pred_img = (pred_img * pixel_std + pixel_mean) * 255.
			
 
				-            # 将原图中被保留的图像patch和网络预测的重构的图像patch拼在一起
			
 
				-            pred_img = org_img * (1 - mask) + pred_img * mask
			
 
				-            pred_img = pred_img.astype(np.uint8)
			
 
				-
			
 
				-            # visualize
			
 
				-            vis_image = np.concatenate([masked_img, org_img, pred_img], axis=1)
			
 
				-            vis_image = vis_image[..., (2, 1, 0)]
			
 
				-            cv2.imshow('masked | origin | reconstruct ', vis_image)
			
 
				-            cv2.waitKey(0)
			
 
				-
			
 
				-            # save
			
 
				-            cv2.imwrite('{}/{:06}.png'.format(save_path, i), vis_image)
			
 
				-
			
 
				-
			
 
				-if __name__ == "__main__":
			
 
				-    main()
			
--- a/masked_image_modeling/models/__init__.py
+++ b/masked_image_modeling/models/__init__.py
@@ -1,9 +0,0 @@
 
				-from .vit.build import build_vision_transformer
			
 
				-
			
 
				-
			
 
				-def build_model(args, model_type='default'):
			
 
				-    # ----------- Vision Transformer -----------
			
 
				-    if "vit" in args.model:
			
 
				-        return build_vision_transformer(args, model_type)
			
 
				-    else:
			
 
				-        raise NotImplementedError("Unknown model: {}".format(args.model))
			
--- a/masked_image_modeling/models/vit/__init__.py
+++ b/masked_image_modeling/models/vit/__init__.py
@@ -1,3 +0,0 @@
 
				-from .vit import build_vit
			
 
				-from .vit_mae import build_vit_mae
			
 
				-from .vit_cls import ViTForImageClassification
			
--- a/masked_image_modeling/models/vit/build.py
+++ b/masked_image_modeling/models/vit/build.py
@@ -1,45 +0,0 @@
 
				-import os
			
 
				-import torch
			
 
				-
			
 
				-from .vit     import build_vit
			
 
				-from .vit_mae import build_vit_mae
			
 
				-from .vit_cls import ViTForImageClassification
			
 
				-
			
 
				-
			
 
				-def build_vision_transformer(args, model_type='default'):
			
 
				-    assert args.model in ['vit_t', 'vit_s', 'vit_b', 'vit_l', 'vit_h'], "Unknown vit model: {}".format(args.model)
			
 
				-
			
 
				-    # ----------- Masked Image Modeling task -----------
			
 
				-    if model_type == 'mae':
			
 
				-        model = build_vit_mae(args.model, args.img_size, args.patch_size, args.img_dim, args.mask_ratio)
			
 
				-    
			
 
				-    # ----------- Image Classification task -----------
			
 
				-    elif model_type == 'cls':
			
 
				-        image_encoder = build_vit(args.model, args.img_size, args.patch_size, args.img_dim)
			
 
				-        model = ViTForImageClassification(image_encoder, num_classes=args.num_classes, qkv_bias=True)
			
 
				-        load_mae_pretrained(model.encoder, args.pretrained)
			
 
				-
			
 
				-    # ----------- Vison Backbone -----------
			
 
				-    elif model_type == 'default':
			
 
				-        model = build_vit(args.model, args.img_size, args.patch_size, args.img_dim)
			
 
				-        load_mae_pretrained(model, args.pretrained)
			
 
				-        
			
 
				-    else:
			
 
				-        raise NotImplementedError("Unknown model type: {}".format(model_type))
			
 
				-    
			
 
				-    return model
			
 
				-
			
 
				-
			
 
				-def load_mae_pretrained(model, ckpt=None):
			
 
				-    if ckpt is not None:
			
 
				-        # check path
			
 
				-        if not os.path.exists(ckpt):
			
 
				-            print("No pretrained model.")
			
 
				-            return model
			
 
				-        print('- Loading pretrained from: {}'.format(ckpt))
			
 
				-        checkpoint = torch.load(ckpt, map_location='cpu')
			
 
				-        # checkpoint state dict
			
 
				-        encoder_state_dict = checkpoint.pop("encoder")
			
 
				-
			
 
				-        # load encoder weight into ViT's encoder
			
 
				-        model.load_state_dict(encoder_state_dict)
			
--- a/masked_image_modeling/models/vit/modules.py
+++ b/masked_image_modeling/models/vit/modules.py
@@ -1,186 +0,0 @@
 
				-# --------------------------------------------------------------------
			
 
				-# Copyright (c) Meta Platforms, Inc. and affiliates.
			
 
				-# All rights reserved.
			
 
				-
			
 
				-# This source code is licensed under the license found in the
			
 
				-# LICENSE file in the root directory of this source tree.
			
 
				-# --------------------------------------------------------------------
			
 
				-
			
 
				-import torch
			
 
				-import torch.nn as nn
			
 
				-import torch.nn.functional as F
			
 
				-
			
 
				-from typing import Type
			
 
				-
			
 
				-
			
 
				-# ----------------------- Basic modules -----------------------
			
 
				-class FeedFroward(nn.Module):
			
 
				-    def __init__(self,
			
 
				-                 embedding_dim: int,
			
 
				-                 mlp_dim: int,
			
 
				-                 act: Type[nn.Module] = nn.GELU,
			
 
				-                 dropout: float = 0.0,
			
 
				-                 ) -> None:
			
 
				-        super().__init__()
			
 
				-        self.fc1   = nn.Linear(embedding_dim, mlp_dim)
			
 
				-        self.drop1 = nn.Dropout(dropout)
			
 
				-        self.fc2   = nn.Linear(mlp_dim, embedding_dim)
			
 
				-        self.drop2 = nn.Dropout(dropout)
			
 
				-        self.act   = act()
			
 
				-
			
 
				-    def forward(self, x: torch.Tensor) -> torch.Tensor:
			
 
				-        x = self.fc1(x)
			
 
				-        x = self.act(x)
			
 
				-        x = self.drop1(x)
			
 
				-        x = self.fc2(x)
			
 
				-        x = self.drop2(x)
			
 
				-        return x
			
 
				-
			
 
				-class PatchEmbed(nn.Module):
			
 
				-    def __init__(self,
			
 
				-                 in_chans    : int = 3,
			
 
				-                 embed_dim   : int = 768,
			
 
				-                 kernel_size : int = 16,
			
 
				-                 padding     : int = 0,
			
 
				-                 stride      : int = 16,
			
 
				-                 ) -> None:
			
 
				-        super().__init__()
			
 
				-        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding)
			
 
				-
			
 
				-    def forward(self, x: torch.Tensor) -> torch.Tensor:
			
 
				-        return self.proj(x)
			
 
				-
			
 
				-
			
 
				-# ----------------------- Model modules -----------------------
			
 
				-class ViTBlock(nn.Module):
			
 
				-    def __init__(self,
			
 
				-                 dim       :int,
			
 
				-                 num_heads :int,
			
 
				-                 mlp_ratio :float = 4.0,
			
 
				-                 qkv_bias  :bool = True,
			
 
				-                 act_layer :Type[nn.Module] = nn.GELU,
			
 
				-                 dropout   :float = 0.
			
 
				-                 ) -> None:
			
 
				-        super().__init__()
			
 
				-        # -------------- Model parameters --------------
			
 
				-        self.norm1 = nn.LayerNorm(dim)
			
 
				-        self.attn  = Attention(dim         = dim,
			
 
				-                               qkv_bias    = qkv_bias,
			
 
				-                               num_heads   = num_heads,
			
 
				-                               dropout     = dropout
			
 
				-                               )
			
 
				-        self.norm2 = nn.LayerNorm(dim)
			
 
				-        self.ffn   = FeedFroward(embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer)
			
 
				-
			
 
				-    def forward(self, x: torch.Tensor) -> torch.Tensor:
			
 
				-        shortcut = x
			
 
				-        # Attention (with prenorm)
			
 
				-        x = self.norm1(x)
			
 
				-        x = self.attn(x)
			
 
				-        x = shortcut + x
			
 
				-
			
 
				-        # Feedforward (with prenorm)
			
 
				-        x = x + self.ffn(self.norm2(x))
			
 
				-
			
 
				-        return x
			
 
				-
			
 
				-class Attention(nn.Module):
			
 
				-    def __init__(self,
			
 
				-                 dim       :int,
			
 
				-                 qkv_bias  :bool  = False,
			
 
				-                 num_heads :int   = 8,
			
 
				-                 dropout   :float = 0.
			
 
				-                 ):
			
 
				-        super().__init__()
			
 
				-        # --------------- Basic parameters ---------------
			
 
				-        self.dim = dim
			
 
				-        self.num_heads = num_heads
			
 
				-        self.head_dim = dim // num_heads
			
 
				-        self.scale = self.head_dim ** -0.5
			
 
				-
			
 
				-        # --------------- Network parameters ---------------
			
 
				-        self.qkv_proj = nn.Linear(dim, dim*3, bias = qkv_bias)
			
 
				-        self.attn_drop = nn.Dropout(dropout)
			
 
				-        self.proj = nn.Linear(dim, dim)
			
 
				-        self.proj_drop = nn.Dropout(dropout)
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        bs, N, _ = x.shape
			
 
				-        # ----------------- Input proj -----------------
			
 
				-        qkv = self.qkv_proj(x)
			
 
				-        q, k, v = torch.chunk(qkv, 3, dim=-1)
			
 
				-
			
 
				-        # ----------------- Multi-head Attn -----------------
			
 
				-        ## [B, N, C] -> [B, N, H, C_h] -> [B, H, N, C_h]
			
 
				-        q = q.view(bs, N, self.num_heads, self.head_dim).permute(0, 2, 1, 3).contiguous()
			
 
				-        k = k.view(bs, N, self.num_heads, self.head_dim).permute(0, 2, 1, 3).contiguous()
			
 
				-        v = v.view(bs, N, self.num_heads, self.head_dim).permute(0, 2, 1, 3).contiguous()
			
 
				-        ## [B, H, Nq, C_h] X [B, H, C_h, Nk] = [B, H, Nq, Nk]
			
 
				-        attn = q * self.scale @ k.transpose(-1, -2)
			
 
				-        attn = attn.softmax(dim=-1)
			
 
				-        attn = self.attn_drop(attn)
			
 
				-        x = attn @ v # [B, H, Nq, C_h]
			
 
				-
			
 
				-        # ----------------- Output -----------------
			
 
				-        x = x.permute(0, 2, 1, 3).contiguous().view(bs, N, -1)
			
 
				-        x = self.proj(x)
			
 
				-        x = self.proj_drop(x)
			
 
				-
			
 
				-        return x
			
 
				-
			
 
				-
			
 
				-# ----------------------- Classifier -----------------------
			
 
				-class AttentionPoolingClassifier(nn.Module):
			
 
				-    def __init__(
			
 
				-        self,
			
 
				-        in_dim      : int,
			
 
				-        out_dim     : int,
			
 
				-        num_heads   : int = 12,
			
 
				-        qkv_bias    : bool = False,
			
 
				-        num_queries : int = 1,
			
 
				-    ):
			
 
				-        super().__init__()
			
 
				-        self.num_heads = num_heads
			
 
				-        head_dim = in_dim // num_heads
			
 
				-        self.scale = head_dim**-0.5
			
 
				-
			
 
				-        self.k = nn.Linear(in_dim, in_dim, bias=qkv_bias)
			
 
				-        self.v = nn.Linear(in_dim, in_dim, bias=qkv_bias)
			
 
				-
			
 
				-        self.cls_token = nn.Parameter(torch.randn(1, num_queries, in_dim) * 0.02)
			
 
				-        self.linear = nn.Linear(in_dim, out_dim)
			
 
				-        self.bn = nn.BatchNorm1d(in_dim, affine=False, eps=1e-6)
			
 
				-
			
 
				-        self.num_queries = num_queries
			
 
				-
			
 
				-    def forward(self, x: torch.Tensor):
			
 
				-        B, N, C = x.shape
			
 
				-
			
 
				-        x = self.bn(x.transpose(-2, -1)).transpose(-2, -1)
			
 
				-        cls_token = self.cls_token.expand(B, -1, -1)  # newly created class token
			
 
				-
			
 
				-        q = cls_token.reshape(
			
 
				-            B, self.num_queries, self.num_heads, C // self.num_heads
			
 
				-        ).permute(0, 2, 1, 3)
			
 
				-        k = (
			
 
				-            self.k(x)
			
 
				-            .reshape(B, N, self.num_heads, C // self.num_heads)
			
 
				-            .permute(0, 2, 1, 3)
			
 
				-        )
			
 
				-
			
 
				-        q = q * self.scale
			
 
				-        v = (
			
 
				-            self.v(x)
			
 
				-            .reshape(B, N, self.num_heads, C // self.num_heads)
			
 
				-            .permute(0, 2, 1, 3)
			
 
				-        )
			
 
				-
			
 
				-        attn = q @ k.transpose(-2, -1)
			
 
				-        attn = attn.softmax(dim=-1)
			
 
				-
			
 
				-        x_cls = (attn @ v).transpose(1, 2).reshape(B, self.num_queries, C)
			
 
				-        x_cls = x_cls.mean(dim=1)
			
 
				-
			
 
				-        out = self.linear(x_cls)
			
 
				-
			
 
				-        return out, x_cls
			
--- a/masked_image_modeling/models/vit/pos_embed.py
+++ b/masked_image_modeling/models/vit/pos_embed.py
@@ -1,96 +0,0 @@
 
				-# Copyright (c) Meta Platforms, Inc. and affiliates.
			
 
				-# All rights reserved.
			
 
				-
			
 
				-# This source code is licensed under the license found in the
			
 
				-# LICENSE file in the root directory of this source tree.
			
 
				-# --------------------------------------------------------
			
 
				-# Position embedding utils
			
 
				-# --------------------------------------------------------
			
 
				-
			
 
				-import numpy as np
			
 
				-
			
 
				-import torch
			
 
				-
			
 
				-# --------------------------------------------------------
			
 
				-# 2D sine-cosine position embedding
			
 
				-# References:
			
 
				-# Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py
			
 
				-# MoCo v3: https://github.com/facebookresearch/moco-v3
			
 
				-# --------------------------------------------------------
			
 
				-def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
			
 
				-    """
			
 
				-    grid_size: int of the grid height and width
			
 
				-    return:
			
 
				-    pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
			
 
				-    """
			
 
				-    grid_h = np.arange(grid_size, dtype=np.float32)
			
 
				-    grid_w = np.arange(grid_size, dtype=np.float32)
			
 
				-    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
			
 
				-    grid = np.stack(grid, axis=0)
			
 
				-
			
 
				-    grid = grid.reshape([2, 1, grid_size, grid_size])
			
 
				-    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
			
 
				-    if cls_token:
			
 
				-        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
			
 
				-    return pos_embed
			
 
				-
			
 
				-
			
 
				-def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
			
 
				-    assert embed_dim % 2 == 0
			
 
				-
			
 
				-    # use half of dimensions to encode grid_h
			
 
				-    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
			
 
				-    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)
			
 
				-
			
 
				-    emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
			
 
				-    return emb
			
 
				-
			
 
				-
			
 
				-def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
			
 
				-    """
			
 
				-    embed_dim: output dimension for each position
			
 
				-    pos: a list of positions to be encoded: size (M,)
			
 
				-    out: (M, D)
			
 
				-    """
			
 
				-    assert embed_dim % 2 == 0
			
 
				-    omega = np.arange(embed_dim // 2, dtype=np.float)
			
 
				-    omega /= embed_dim / 2.
			
 
				-    omega = 1. / 10000**omega  # (D/2,)
			
 
				-
			
 
				-    pos = pos.reshape(-1)  # (M,)
			
 
				-    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product
			
 
				-
			
 
				-    emb_sin = np.sin(out) # (M, D/2)
			
 
				-    emb_cos = np.cos(out) # (M, D/2)
			
 
				-
			
 
				-    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
			
 
				-    return emb
			
 
				-
			
 
				-
			
 
				-# --------------------------------------------------------
			
 
				-# Interpolate position embeddings for high-resolution
			
 
				-# References:
			
 
				-# DeiT: https://github.com/facebookresearch/deit
			
 
				-# --------------------------------------------------------
			
 
				-def interpolate_pos_embed(model, checkpoint_model):
			
 
				-    if 'pos_embed' in checkpoint_model:
			
 
				-        pos_embed_checkpoint = checkpoint_model['pos_embed']
			
 
				-        embedding_size = pos_embed_checkpoint.shape[-1]
			
 
				-        num_patches = model.num_patches
			
 
				-        num_extra_tokens = model.pos_embed.shape[-2] - num_patches
			
 
				-        # height (== width) for the checkpoint position embedding
			
 
				-        orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
			
 
				-        # height (== width) for the new position embedding
			
 
				-        new_size = int(num_patches ** 0.5)
			
 
				-        # class_token and dist_token are kept unchanged
			
 
				-        if orig_size != new_size:
			
 
				-            print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size))
			
 
				-            extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
			
 
				-            # only the position tokens are interpolated
			
 
				-            pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
			
 
				-            pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
			
 
				-            pos_tokens = torch.nn.functional.interpolate(
			
 
				-                pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
			
 
				-            pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
			
 
				-            new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
			
 
				-            checkpoint_model['pos_embed'] = new_pos_embed
			
--- a/masked_image_modeling/models/vit/vit.py
+++ b/masked_image_modeling/models/vit/vit.py
@@ -1,180 +0,0 @@
 
				-# --------------------------------------------------------------------
			
 
				-# Copyright (c) Meta Platforms, Inc. and affiliates.
			
 
				-# All rights reserved.
			
 
				-
			
 
				-# This source code is licensed under the license found in the
			
 
				-# LICENSE file in the root directory of this source tree.
			
 
				-# --------------------------------------------------------------------
			
 
				-
			
 
				-import torch
			
 
				-import torch.nn as nn
			
 
				-
			
 
				-try:
			
 
				-    from .modules import PatchEmbed, ViTBlock
			
 
				-except:
			
 
				-    from  modules import PatchEmbed, ViTBlock
			
 
				-
			
 
				-
			
 
				-# ---------------------- Vision transformer ----------------------
			
 
				-class ImageEncoderViT(nn.Module):
			
 
				-    def __init__(self,
			
 
				-                 img_size: int,
			
 
				-                 patch_size: int,
			
 
				-                 in_chans: int,
			
 
				-                 patch_embed_dim: int,
			
 
				-                 depth: int,
			
 
				-                 num_heads: int,
			
 
				-                 mlp_ratio: float,
			
 
				-                 act_layer: nn.GELU,
			
 
				-                 dropout: float = 0.0,
			
 
				-                 ) -> None:
			
 
				-        super().__init__()
			
 
				-        # ----------- Basic parameters -----------
			
 
				-        self.img_size = img_size
			
 
				-        self.patch_size = patch_size
			
 
				-        self.image_embedding_size = img_size // ((patch_size if patch_size > 0 else 1))
			
 
				-        self.patch_embed_dim = patch_embed_dim
			
 
				-        self.num_heads = num_heads
			
 
				-        self.num_patches = (img_size // patch_size) ** 2
			
 
				-        # ----------- Model parameters -----------
			
 
				-        self.patch_embed = PatchEmbed(in_chans, patch_embed_dim, patch_size, stride=patch_size)
			
 
				-        self.pos_embed   = nn.Parameter(torch.zeros(1, self.num_patches, patch_embed_dim))
			
 
				-        self.norm_layer  = nn.LayerNorm(patch_embed_dim)
			
 
				-        self.blocks      = nn.ModuleList([
			
 
				-            ViTBlock(patch_embed_dim, num_heads, mlp_ratio, True, act_layer, dropout)
			
 
				-            for _ in range(depth)])
			
 
				-
			
 
				-        self._init_weights()
			
 
				-
			
 
				-    def _init_weights(self):
			
 
				-        # initialize (and freeze) pos_embed by sin-cos embedding
			
 
				-        pos_embed = self.get_posembed(self.pos_embed.shape[-1], int(self.num_patches**.5))
			
 
				-        self.pos_embed.data.copy_(pos_embed)
			
 
				-
			
 
				-        # initialize nn.Linear and nn.LayerNorm
			
 
				-        for m in self.modules():           
			
 
				-            if isinstance(m, nn.Linear):
			
 
				-                # we use xavier_uniform following official JAX ViT:
			
 
				-                torch.nn.init.xavier_uniform_(m.weight)
			
 
				-                if isinstance(m, nn.Linear) and m.bias is not None:
			
 
				-                    nn.init.constant_(m.bias, 0)
			
 
				-            elif isinstance(m, nn.LayerNorm):
			
 
				-                nn.init.constant_(m.bias, 0)
			
 
				-                nn.init.constant_(m.weight, 1.0)
			
 
				-
			
 
				-    def get_posembed(self, embed_dim, grid_size, temperature=10000):
			
 
				-        scale = 2 * torch.pi
			
 
				-        grid_h, grid_w = grid_size, grid_size
			
 
				-        num_pos_feats = embed_dim // 2
			
 
				-        # get grid
			
 
				-        y_embed, x_embed = torch.meshgrid([torch.arange(grid_h, dtype=torch.float32),
			
 
				-                                           torch.arange(grid_w, dtype=torch.float32)])
			
 
				-        # normalize grid coords
			
 
				-        y_embed = y_embed / (grid_h + 1e-6) * scale
			
 
				-        x_embed = x_embed / (grid_w + 1e-6) * scale
			
 
				-    
			
 
				-        dim_t = torch.arange(num_pos_feats, dtype=torch.float32)
			
 
				-        dim_t_ = torch.div(dim_t, 2, rounding_mode='floor') / num_pos_feats
			
 
				-        dim_t = temperature ** (2 * dim_t_)
			
 
				-
			
 
				-        pos_x = torch.div(x_embed[..., None], dim_t)
			
 
				-        pos_y = torch.div(y_embed[..., None], dim_t)
			
 
				-        pos_x = torch.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()), dim=-1).flatten(-2)
			
 
				-        pos_y = torch.stack((pos_y[..., 0::2].sin(), pos_y[..., 1::2].cos()), dim=-1).flatten(-2)
			
 
				-
			
 
				-        # [H, W, C] -> [N, C]
			
 
				-        pos_embed = torch.cat((pos_y, pos_x), dim=-1).view(-1, embed_dim)
			
 
				-
			
 
				-        return pos_embed.unsqueeze(0)
			
 
				-
			
 
				-    def forward(self, x: torch.Tensor) -> torch.Tensor:
			
 
				-        # Patch embed
			
 
				-        x = self.patch_embed(x)
			
 
				-        x = x.flatten(2).permute(0, 2, 1).contiguous()
			
 
				-
			
 
				-        # Add pos embed
			
 
				-        x = x + self.pos_embed
			
 
				-
			
 
				-        # Apply Transformer blocks
			
 
				-        for block in self.blocks:
			
 
				-            x = block(x)
			
 
				-        x = self.norm_layer(x)
			
 
				-
			
 
				-        return x
			
 
				-
			
 
				-
			
 
				-# ------------------------ Model Functions ------------------------
			
 
				-def build_vit(model_name="vit_t", img_size=224, patch_size=16, img_dim=3):
			
 
				-    if model_name == "vit_t":
			
 
				-        return ImageEncoderViT(img_size=img_size,
			
 
				-                               patch_size=patch_size,
			
 
				-                               in_chans=img_dim,
			
 
				-                               patch_embed_dim=192,
			
 
				-                               depth=12,
			
 
				-                               num_heads=3,
			
 
				-                               mlp_ratio=4.0,
			
 
				-                               act_layer=nn.GELU,
			
 
				-                               dropout = 0.1)
			
 
				-    if model_name == "vit_s":
			
 
				-        return ImageEncoderViT(img_size=img_size,
			
 
				-                               patch_size=patch_size,
			
 
				-                               in_chans=img_dim,
			
 
				-                               patch_embed_dim=384,
			
 
				-                               depth=12,
			
 
				-                               num_heads=6,
			
 
				-                               mlp_ratio=4.0,
			
 
				-                               act_layer=nn.GELU,
			
 
				-                               dropout = 0.1)
			
 
				-    if model_name == "vit_b":
			
 
				-        return ImageEncoderViT(img_size=img_size,
			
 
				-                               patch_size=patch_size,
			
 
				-                               in_chans=img_dim,
			
 
				-                               patch_embed_dim=768,
			
 
				-                               depth=12,
			
 
				-                               num_heads=12,
			
 
				-                               mlp_ratio=4.0,
			
 
				-                               act_layer=nn.GELU,
			
 
				-                               dropout = 0.1)
			
 
				-    if model_name == "vit_l":
			
 
				-        return ImageEncoderViT(img_size=img_size,
			
 
				-                               patch_size=patch_size,
			
 
				-                               in_chans=img_dim,
			
 
				-                               patch_embed_dim=1024,
			
 
				-                               depth=24,
			
 
				-                               num_heads=16,
			
 
				-                               mlp_ratio=4.0,
			
 
				-                               act_layer=nn.GELU,
			
 
				-                               dropout = 0.1)
			
 
				-    if model_name == "vit_h":
			
 
				-        return ImageEncoderViT(img_size=img_size,
			
 
				-                               patch_size=patch_size,
			
 
				-                               in_chans=img_dim,
			
 
				-                               patch_embed_dim=1280,
			
 
				-                               depth=32,
			
 
				-                               num_heads=16,
			
 
				-                               mlp_ratio=4.0,
			
 
				-                               act_layer=nn.GELU,
			
 
				-                               dropout = 0.1)
			
 
				-    
			
 
				-
			
 
				-if __name__ == '__main__':
			
 
				-    import torch
			
 
				-    from thop import profile
			
 
				-
			
 
				-    # Prepare an image as the input
			
 
				-    bs, c, h, w = 2, 3, 224, 224
			
 
				-    x = torch.randn(bs, c, h, w)
			
 
				-    patch_size = 16
			
 
				-
			
 
				-    # Build model
			
 
				-    model = build_vit(patch_size=patch_size)
			
 
				-
			
 
				-    # Inference
			
 
				-    outputs = model(x)
			
 
				-
			
 
				-    # Compute FLOPs & Params
			
 
				-    print('==============================')
			
 
				-    model.eval()
			
 
				-    flops, params = profile(model, inputs=(x, ), verbose=False)
			
 
				-    print('GFLOPs : {:.2f}'.format(flops / 1e9 * 2))
			
 
				-    print('Params : {:.2f} M'.format(params / 1e6))
			
--- a/masked_image_modeling/models/vit/vit_cls.py
+++ b/masked_image_modeling/models/vit/vit_cls.py
@@ -1,28 +0,0 @@
 
				-import torch.nn as nn
			
 
				-
			
 
				-from .modules import AttentionPoolingClassifier
			
 
				-from .vit     import ImageEncoderViT
			
 
				-
			
 
				-
			
 
				-class ViTForImageClassification(nn.Module):
			
 
				-    def __init__(self,
			
 
				-                 image_encoder :ImageEncoderViT,
			
 
				-                 num_classes   :int   = 1000,
			
 
				-                 qkv_bias      :bool  = True,
			
 
				-                 ):
			
 
				-        super().__init__()
			
 
				-        # -------- Model parameters --------
			
 
				-        self.encoder    = image_encoder
			
 
				-        self.classifier = AttentionPoolingClassifier(
			
 
				-            image_encoder.patch_embed_dim, num_classes, image_encoder.num_heads, qkv_bias, num_queries=1)
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        """
			
 
				-        Inputs:
			
 
				-            x: (torch.Tensor) -> [B, C, H, W]. Input image.
			
 
				-        """
			
 
				-        x = self.encoder(x)
			
 
				-        x, x_cls = self.classifier(x)
			
 
				-
			
 
				-        return x
			
 
				-
			
--- a/masked_image_modeling/models/vit/vit_mae.py
+++ b/masked_image_modeling/models/vit/vit_mae.py
@@ -1,399 +0,0 @@
 
				-import math
			
 
				-import torch
			
 
				-import torch.nn as nn
			
 
				-
			
 
				-try:
			
 
				-    from .modules import ViTBlock, PatchEmbed
			
 
				-except:
			
 
				-    from  modules import ViTBlock, PatchEmbed
			
 
				-
			
 
				-
			
 
				-# ------------------------ Basic Modules ------------------------
			
 
				-class MaeEncoder(nn.Module):
			
 
				-    def __init__(self,
			
 
				-                 img_size: int,
			
 
				-                 patch_size: int,
			
 
				-                 in_chans: int,
			
 
				-                 patch_embed_dim: int,
			
 
				-                 depth: int,
			
 
				-                 num_heads: int,
			
 
				-                 mlp_ratio: float,
			
 
				-                 act_layer: nn.GELU,
			
 
				-                 mask_ratio: float = 0.75,
			
 
				-                 dropout: float = 0.0,
			
 
				-                 ) -> None:
			
 
				-        super().__init__()
			
 
				-        # ----------- Basic parameters -----------
			
 
				-        self.img_size = img_size
			
 
				-        self.patch_size = patch_size
			
 
				-        self.image_embedding_size = img_size // ((patch_size if patch_size > 0 else 1))
			
 
				-        self.patch_embed_dim = patch_embed_dim
			
 
				-        self.num_heads = num_heads
			
 
				-        self.num_patches = (img_size // patch_size) ** 2
			
 
				-        self.mask_ratio = mask_ratio
			
 
				-        # ----------- Model parameters -----------
			
 
				-        self.patch_embed = PatchEmbed(in_chans, patch_embed_dim, patch_size, 0, patch_size)
			
 
				-        self.pos_embed   = nn.Parameter(torch.zeros(1, self.num_patches, patch_embed_dim), requires_grad=False)
			
 
				-        self.norm_layer  = nn.LayerNorm(patch_embed_dim)
			
 
				-        self.blocks      = nn.ModuleList([
			
 
				-            ViTBlock(patch_embed_dim, num_heads, mlp_ratio, True, act_layer=act_layer, dropout=dropout)
			
 
				-            for _ in range(depth)])
			
 
				-        self._init_weights()
			
 
				-
			
 
				-    def _init_weights(self):
			
 
				-        # initialize (and freeze) pos_embed by sin-cos embedding
			
 
				-        pos_embed = self.get_posembed(self.pos_embed.shape[-1], int(self.num_patches**.5))
			
 
				-        self.pos_embed.data.copy_(pos_embed)
			
 
				-
			
 
				-        # initialize nn.Linear and nn.LayerNorm
			
 
				-        for m in self.modules():           
			
 
				-            if isinstance(m, nn.Linear):
			
 
				-                # we use xavier_uniform following official JAX ViT:
			
 
				-                torch.nn.init.xavier_uniform_(m.weight)
			
 
				-                if isinstance(m, nn.Linear) and m.bias is not None:
			
 
				-                    nn.init.constant_(m.bias, 0)
			
 
				-            elif isinstance(m, nn.LayerNorm):
			
 
				-                nn.init.constant_(m.bias, 0)
			
 
				-                nn.init.constant_(m.weight, 1.0)
			
 
				-
			
 
				-    def get_posembed(self, embed_dim, grid_size, temperature=10000):
			
 
				-        scale = 2 * math.pi
			
 
				-        grid_h, grid_w = grid_size, grid_size
			
 
				-        num_pos_feats = embed_dim // 2
			
 
				-        # get grid
			
 
				-        y_embed, x_embed = torch.meshgrid([torch.arange(grid_h, dtype=torch.float32),
			
 
				-                                           torch.arange(grid_w, dtype=torch.float32)])
			
 
				-        # normalize grid coords
			
 
				-        y_embed = y_embed / (grid_h + 1e-6) * scale
			
 
				-        x_embed = x_embed / (grid_w + 1e-6) * scale
			
 
				-    
			
 
				-        dim_t = torch.arange(num_pos_feats, dtype=torch.float32)
			
 
				-        dim_t_ = torch.div(dim_t, 2, rounding_mode='floor') / num_pos_feats
			
 
				-        dim_t = temperature ** (2 * dim_t_)
			
 
				-
			
 
				-        pos_x = torch.div(x_embed[..., None], dim_t)
			
 
				-        pos_y = torch.div(y_embed[..., None], dim_t)
			
 
				-        pos_x = torch.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()), dim=-1).flatten(-2)
			
 
				-        pos_y = torch.stack((pos_y[..., 0::2].sin(), pos_y[..., 1::2].cos()), dim=-1).flatten(-2)
			
 
				-
			
 
				-        # [H, W, C] -> [N, C]
			
 
				-        pos_embed = torch.cat((pos_y, pos_x), dim=-1).view(-1, embed_dim)
			
 
				-
			
 
				-        return pos_embed.unsqueeze(0)
			
 
				-
			
 
				-    def random_masking(self, x):
			
 
				-        B, N, C = x.shape
			
 
				-        len_keep = int(N * (1 - self.mask_ratio))
			
 
				-
			
 
				-        noise = torch.rand(B, N, device=x.device)  # noise in [0, 1]
			
 
				-
			
 
				-        # sort noise for each sample
			
 
				-        ids_shuffle = torch.argsort(noise, dim=1)        # ascend: small is keep, large is remove
			
 
				-        ids_restore = torch.argsort(ids_shuffle, dim=1)  # restore the original position of each patch
			
 
				-
			
 
				-        # keep the first subset
			
 
				-        ids_keep = ids_shuffle[:, :len_keep]
			
 
				-        x_masked = torch.gather(x, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, C))
			
 
				-
			
 
				-        # generate the binary mask: 0 is keep, 1 is remove
			
 
				-        mask = torch.ones([B, N], device=x.device)
			
 
				-        mask[:, :len_keep] = 0
			
 
				-
			
 
				-        # unshuffle to get th binary mask
			
 
				-        mask = torch.gather(mask, dim=1, index=ids_restore)
			
 
				-
			
 
				-        return x_masked, mask, ids_restore
			
 
				-    
			
 
				-    def forward(self, x: torch.Tensor) -> torch.Tensor:
			
 
				-        # patch embed
			
 
				-        x = self.patch_embed(x)
			
 
				-        # [B, C, H, W] -> [B, C, N] -> [B, N, C], N = H x W
			
 
				-        x = x.flatten(2).permute(0, 2, 1).contiguous()
			
 
				-
			
 
				-        # add pos embed
			
 
				-        x = x + self.pos_embed
			
 
				-
			
 
				-        # masking: length -> length * mask_ratio
			
 
				-        x, mask, ids_restore = self.random_masking(x)
			
 
				-
			
 
				-        # apply Transformer blocks
			
 
				-        for block in self.blocks:
			
 
				-            x = block(x)
			
 
				-        x = self.norm_layer(x)
			
 
				-        
			
 
				-        return x, mask, ids_restore
			
 
				-
			
 
				-class MaeDecoder(nn.Module):
			
 
				-    def __init__(self,
			
 
				-                 img_dim       :int   = 3,
			
 
				-                 img_size      :int   = 16,
			
 
				-                 patch_size    :int   = 16,
			
 
				-                 en_emb_dim    :int   = 784,
			
 
				-                 de_emb_dim    :int   = 512,
			
 
				-                 de_num_layers :int   = 12,
			
 
				-                 de_num_heads  :int   = 12,
			
 
				-                 qkv_bias      :bool  = True,
			
 
				-                 mlp_ratio     :float = 4.0,
			
 
				-                 dropout       :float = 0.1,
			
 
				-                 mask_ratio    :float = 0.75,
			
 
				-                 ):
			
 
				-        super().__init__()
			
 
				-        # -------- basic parameters --------
			
 
				-        self.img_size = img_size
			
 
				-        self.patch_size = patch_size
			
 
				-        self.num_patches = (img_size // patch_size) ** 2
			
 
				-        self.en_emb_dim = en_emb_dim
			
 
				-        self.de_emb_dim = de_emb_dim
			
 
				-        self.de_num_layers = de_num_layers
			
 
				-        self.de_num_heads = de_num_heads
			
 
				-        self.mask_ratio = mask_ratio
			
 
				-        # -------- network parameters --------
			
 
				-        self.mask_token        = nn.Parameter(torch.zeros(1, 1, de_emb_dim))
			
 
				-        self.decoder_embed     = nn.Linear(en_emb_dim, de_emb_dim)
			
 
				-        self.mask_token        = nn.Parameter(torch.zeros(1, 1, de_emb_dim))
			
 
				-        self.decoder_pos_embed = nn.Parameter(torch.zeros(1, self.num_patches, de_emb_dim), requires_grad=False)  # fixed sin-cos embedding
			
 
				-        self.decoder_norm      = nn.LayerNorm(de_emb_dim)
			
 
				-        self.decoder_pred      = nn.Linear(de_emb_dim, patch_size**2 * img_dim, bias=True)
			
 
				-        self.blocks            = nn.ModuleList([
			
 
				-            ViTBlock(de_emb_dim, de_num_heads, mlp_ratio, qkv_bias, dropout=dropout)
			
 
				-            for _ in range(de_num_layers)])
			
 
				-        
			
 
				-        self._init_weights()
			
 
				-
			
 
				-    def _init_weights(self):
			
 
				-        # initialize (and freeze) pos_embed by sin-cos embedding
			
 
				-        decoder_pos_embed = self.get_posembed(self.decoder_pos_embed.shape[-1], int(self.num_patches**.5))
			
 
				-        self.decoder_pos_embed.data.copy_(decoder_pos_embed)
			
 
				-
			
 
				-        # timm's trunc_normal_(std=.02) is effectively normal_(std=0.02) as cutoff is too big (2.)
			
 
				-        torch.nn.init.normal_(self.mask_token, std=.02)
			
 
				-
			
 
				-        # initialize nn.Linear and nn.LayerNorm
			
 
				-        for m in self.modules():           
			
 
				-            if isinstance(m, nn.Linear):
			
 
				-                # we use xavier_uniform following official JAX ViT:
			
 
				-                torch.nn.init.xavier_uniform_(m.weight)
			
 
				-                if isinstance(m, nn.Linear) and m.bias is not None:
			
 
				-                    nn.init.constant_(m.bias, 0)
			
 
				-            elif isinstance(m, nn.LayerNorm):
			
 
				-                nn.init.constant_(m.bias, 0)
			
 
				-                nn.init.constant_(m.weight, 1.0)
			
 
				-
			
 
				-    def get_posembed(self, embed_dim, grid_size, temperature=10000):
			
 
				-        scale = 2 * math.pi
			
 
				-        grid_h, grid_w = grid_size, grid_size
			
 
				-        num_pos_feats = embed_dim // 2
			
 
				-        # get grid
			
 
				-        y_embed, x_embed = torch.meshgrid([torch.arange(grid_h, dtype=torch.float32),
			
 
				-                                           torch.arange(grid_w, dtype=torch.float32)])
			
 
				-        # normalize grid coords
			
 
				-        y_embed = y_embed / (grid_h + 1e-6) * scale
			
 
				-        x_embed = x_embed / (grid_w + 1e-6) * scale
			
 
				-    
			
 
				-        dim_t = torch.arange(num_pos_feats, dtype=torch.float32)
			
 
				-        dim_t_ = torch.div(dim_t, 2, rounding_mode='floor') / num_pos_feats
			
 
				-        dim_t = temperature ** (2 * dim_t_)
			
 
				-
			
 
				-        pos_x = torch.div(x_embed[..., None], dim_t)
			
 
				-        pos_y = torch.div(y_embed[..., None], dim_t)
			
 
				-        pos_x = torch.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()), dim=-1).flatten(-2)
			
 
				-        pos_y = torch.stack((pos_y[..., 0::2].sin(), pos_y[..., 1::2].cos()), dim=-1).flatten(-2)
			
 
				-
			
 
				-        # [H, W, C] -> [N, C]
			
 
				-        pos_embed = torch.cat((pos_y, pos_x), dim=-1).view(-1, embed_dim)
			
 
				-
			
 
				-        return pos_embed.unsqueeze(0)
			
 
				-
			
 
				-    def forward(self, x_enc, ids_restore):
			
 
				-        # embed tokens
			
 
				-        x_enc = self.decoder_embed(x_enc)
			
 
				-        B, N_nomask, C = x_enc.shape
			
 
				-
			
 
				-        # append mask tokens to sequence
			
 
				-        mask_tokens = self.mask_token.repeat(B, ids_restore.shape[1] - N_nomask, 1)     # [B, N_mask, C], N_mask = (N-1) - N_nomask
			
 
				-        x_all = torch.cat([x_enc, mask_tokens], dim=1)
			
 
				-        x_all = torch.gather(x_all, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, C))  # unshuffle
			
 
				-
			
 
				-        # add pos embed
			
 
				-        x_all = x_all + self.decoder_pos_embed
			
 
				-
			
 
				-        # apply Transformer blocks
			
 
				-        for block in self.blocks:
			
 
				-            x_all = block(x_all)
			
 
				-        x_all = self.decoder_norm(x_all)
			
 
				-
			
 
				-        # predict
			
 
				-        x_out = self.decoder_pred(x_all)
			
 
				-
			
 
				-        return x_out
			
 
				-
			
 
				-
			
 
				-# ------------------------ MAE Vision Transformer ------------------------
			
 
				-class ViTforMaskedAutoEncoder(nn.Module):
			
 
				-    def __init__(self,
			
 
				-                 encoder :MaeEncoder,
			
 
				-                 decoder :MaeDecoder,
			
 
				-                 ):
			
 
				-        super().__init__()
			
 
				-        self.mae_encoder = encoder
			
 
				-        self.mae_decoder = decoder
			
 
				-
			
 
				-    def patchify(self, imgs, patch_size):
			
 
				-        """
			
 
				-        imgs: (B, 3, H, W)
			
 
				-        x: (N, L, patch_size**2 *3)
			
 
				-        """
			
 
				-        p = patch_size
			
 
				-        assert imgs.shape[2] == imgs.shape[3] and imgs.shape[2] % p == 0
			
 
				-
			
 
				-        h = w = imgs.shape[2] // p
			
 
				-        x = imgs.reshape(shape=(imgs.shape[0], 3, h, p, w, p))
			
 
				-        x = torch.einsum('nchpwq->nhwpqc', x)
			
 
				-        x = x.reshape(shape=(imgs.shape[0], h * w, p**2 * 3))
			
 
				-
			
 
				-        return x
			
 
				-    
			
 
				-    def unpatchify(self, x, patch_size):
			
 
				-        """
			
 
				-        x: (B, N, patch_size**2 *3)
			
 
				-        imgs: (B, 3, H, W)
			
 
				-        """
			
 
				-        p = patch_size
			
 
				-        h = w = int(x.shape[1]**.5)
			
 
				-        assert h * w == x.shape[1]
			
 
				-        
			
 
				-        x = x.reshape(shape=(x.shape[0], h, w, p, p, 3))
			
 
				-        x = torch.einsum('nhwpqc->nchpwq', x)
			
 
				-        imgs = x.reshape(shape=(x.shape[0], 3, h * p, h * p))
			
 
				-
			
 
				-        return imgs
			
 
				-
			
 
				-    def compute_loss(self, x, output):
			
 
				-        """
			
 
				-        imgs: [B, 3, H, W]
			
 
				-        pred: [B, N, C], C = p*p*3
			
 
				-        mask: [B, N], 0 is keep, 1 is remove, 
			
 
				-        """
			
 
				-        target = self.patchify(x, self.mae_encoder.patch_size)
			
 
				-        pred, mask = output["x_pred"], output["mask"]
			
 
				-        loss = (pred - target) ** 2
			
 
				-        loss = loss.mean(dim=-1)  # [B, N], mean loss per patch
			
 
				-        loss = (loss * mask).sum() / mask.sum()  # mean loss on removed patches
			
 
				-        
			
 
				-        return loss
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        imgs = x
			
 
				-        x, mask, ids_restore = self.mae_encoder(x)
			
 
				-        x = self.mae_decoder(x, ids_restore)
			
 
				-        output = {
			
 
				-            'x_pred': x,
			
 
				-            'mask': mask
			
 
				-        }
			
 
				-
			
 
				-        if self.training:
			
 
				-            loss = self.compute_loss(imgs, output)
			
 
				-            output["loss"] = loss
			
 
				-
			
 
				-        return output
			
 
				-
			
 
				-
			
 
				-# ------------------------ Model Functions ------------------------
			
 
				-def build_vit_mae(model_name="vit_t", img_size=224, patch_size=16, img_dim=3, mask_ratio=0.75):
			
 
				-    # ---------------- MAE Encoder ----------------
			
 
				-    if model_name == "vit_t":
			
 
				-        encoder = MaeEncoder(img_size=img_size,
			
 
				-                             patch_size=patch_size,
			
 
				-                             in_chans=img_dim,
			
 
				-                             patch_embed_dim=192,
			
 
				-                             depth=12,
			
 
				-                             num_heads=3,
			
 
				-                             mlp_ratio=4.0,
			
 
				-                             act_layer=nn.GELU,
			
 
				-                             mask_ratio=mask_ratio,
			
 
				-                             dropout = 0.1)
			
 
				-    if model_name == "vit_s":
			
 
				-        encoder = MaeEncoder(img_size=img_size,
			
 
				-                             patch_size=patch_size,
			
 
				-                             in_chans=img_dim,
			
 
				-                             patch_embed_dim=384,
			
 
				-                             depth=12,
			
 
				-                             num_heads=6,
			
 
				-                             mlp_ratio=4.0,
			
 
				-                             act_layer=nn.GELU,
			
 
				-                             mask_ratio=mask_ratio,
			
 
				-                             dropout = 0.1)
			
 
				-    if model_name == "vit_b":
			
 
				-        encoder = MaeEncoder(img_size=img_size,
			
 
				-                             patch_size=patch_size,
			
 
				-                             in_chans=img_dim,
			
 
				-                             patch_embed_dim=768,
			
 
				-                             depth=12,
			
 
				-                             num_heads=12,
			
 
				-                             mlp_ratio=4.0,
			
 
				-                             act_layer=nn.GELU,
			
 
				-                             mask_ratio=mask_ratio,
			
 
				-                             dropout = 0.1)
			
 
				-    if model_name == "vit_l":
			
 
				-        encoder = MaeEncoder(img_size=img_size,
			
 
				-                             patch_size=patch_size,
			
 
				-                             in_chans=img_dim,
			
 
				-                             patch_embed_dim=1024,
			
 
				-                             depth=24,
			
 
				-                             num_heads=16,
			
 
				-                             mlp_ratio=4.0,
			
 
				-                             act_layer=nn.GELU,
			
 
				-                             mask_ratio=mask_ratio,
			
 
				-                             dropout = 0.1)
			
 
				-    if model_name == "vit_h":
			
 
				-        encoder = MaeEncoder(img_size=img_size,
			
 
				-                             patch_size=patch_size,
			
 
				-                             in_chans=img_dim,
			
 
				-                             patch_embed_dim=1280,
			
 
				-                             depth=32,
			
 
				-                             num_heads=16,
			
 
				-                             mlp_ratio=4.0,
			
 
				-                             act_layer=nn.GELU,
			
 
				-                             mask_ratio=mask_ratio,
			
 
				-                             dropout = 0.1)
			
 
				-    
			
 
				-    # ---------------- MAE Decoder ----------------
			
 
				-    decoder = MaeDecoder(img_dim = img_dim,
			
 
				-                         img_size=img_size,
			
 
				-                         patch_size=patch_size,
			
 
				-                         en_emb_dim=encoder.patch_embed_dim,
			
 
				-                         de_emb_dim=512,
			
 
				-                         de_num_layers=8,
			
 
				-                         de_num_heads=16,
			
 
				-                         qkv_bias=True,
			
 
				-                         mlp_ratio=4.0,
			
 
				-                         mask_ratio=mask_ratio,
			
 
				-                         dropout=0.1,)
			
 
				-    
			
 
				-    return ViTforMaskedAutoEncoder(encoder, decoder)
			
 
				-
			
 
				-
			
 
				-if __name__ == '__main__':
			
 
				-    import torch
			
 
				-    from thop import profile
			
 
				-
			
 
				-    # Prepare an image as the input
			
 
				-    bs, c, h, w = 2, 3, 224, 224
			
 
				-    x = torch.randn(bs, c, h, w)
			
 
				-    patch_size = 16
			
 
				-
			
 
				-    # Build model
			
 
				-    model = build_vit_mae(patch_size=patch_size)
			
 
				-
			
 
				-    # Inference
			
 
				-    outputs = model(x)
			
 
				-    if "loss" in outputs:
			
 
				-        print("Loss: ", outputs["loss"].item())
			
 
				-
			
 
				-    # Compute FLOPs & Params
			
 
				-    print('==============================')
			
 
				-    model.eval()
			
 
				-    flops, params = profile(model, inputs=(x, ), verbose=False)
			
 
				-    print('GFLOPs : {:.2f}'.format(flops / 1e9 * 2))
			
 
				-    print('Params : {:.2f} M'.format(params / 1e6))
			
 
				-
			
--- a/masked_image_modeling/requirements.txt
+++ b/masked_image_modeling/requirements.txt
@@ -1,5 +0,0 @@
 
				-torch
			
 
				-torchvision
			
 
				-opencv-python
			
 
				-thop
			
 
				-timm
			
--- a/masked_image_modeling/utils/lr_scheduler.py
+++ b/masked_image_modeling/utils/lr_scheduler.py
@@ -1,37 +0,0 @@
 
				-import torch
			
 
				-
			
 
				-
			
 
				-# Basic Warmup Scheduler
			
 
				-class LinearWarmUpLrScheduler(object):
			
 
				-    def __init__(self, base_lr=0.01, wp_iter=500, warmup_factor=0.00066667):
			
 
				-        self.base_lr = base_lr
			
 
				-        self.wp_iter = wp_iter
			
 
				-        self.warmup_factor = warmup_factor
			
 
				-
			
 
				-    def set_lr(self, optimizer, cur_lr):
			
 
				-        for param_group in optimizer.param_groups:
			
 
				-            param_group['lr'] = cur_lr
			
 
				-
			
 
				-    def __call__(self, iter, optimizer):
			
 
				-        # warmup
			
 
				-        assert iter < self.wp_iter
			
 
				-        alpha = iter / self.wp_iter
			
 
				-        warmup_factor = self.warmup_factor * (1 - alpha) + alpha
			
 
				-        tmp_lr = self.base_lr * warmup_factor
			
 
				-        self.set_lr(optimizer, tmp_lr)
			
 
				-
			
 
				-
			
 
				-def build_lr_scheduler(args, optimizer):
			
 
				-    if args.lr_scheduler == "step":
			
 
				-        lr_step = [args.max_epoch // 3, args.max_epoch // 3 * 2]
			
 
				-        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=lr_step, gamma=0.1)
			
 
				-    elif args.lr_scheduler == "cosine":
			
 
				-        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=args.max_epoch - args.wp_epoch - 1, eta_min=args.min_lr)
			
 
				-    else:
			
 
				-        raise NotImplementedError("Unknown lr scheduler: {}".format(args.lr_scheduler))
			
 
				-    
			
 
				-    print("=================== LR Scheduler information ===================")
			
 
				-    print("LR Scheduler: ", args.lr_scheduler)
			
 
				-
			
 
				-    return scheduler
			
 
				-        
			
--- a/masked_image_modeling/utils/misc.py
+++ b/masked_image_modeling/utils/misc.py
@@ -1,231 +0,0 @@
 
				-import time
			
 
				-import torch
			
 
				-import numpy as np
			
 
				-import random
			
 
				-import datetime
			
 
				-from collections import defaultdict, deque
			
 
				-from pathlib import Path
			
 
				-
			
 
				-
			
 
				-# ---------------------- Common functions ----------------------
			
 
				-def setup_seed(seed=42):
			
 
				-    torch.manual_seed(seed)
			
 
				-    torch.cuda.manual_seed_all(seed)
			
 
				-    np.random.seed(seed)
			
 
				-    random.seed(seed)
			
 
				-    torch.backends.cudnn.deterministic = True
			
 
				-
			
 
				-def accuracy(output, target, topk=(1,)):
			
 
				-    """Computes the accuracy over the k top predictions for the specified values of k"""
			
 
				-    with torch.no_grad():
			
 
				-        maxk = max(topk)
			
 
				-        batch_size = target.size(0)
			
 
				-
			
 
				-        _, pred = output.topk(maxk, 1, True, True)
			
 
				-        pred = pred.t()
			
 
				-        correct = pred.eq(target.reshape(1, -1).expand_as(pred))
			
 
				-
			
 
				-        res = []
			
 
				-        for k in topk:
			
 
				-            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
			
 
				-            res.append(correct_k.mul_(100.0 / batch_size))
			
 
				-        return res
			
 
				-
			
 
				-class SmoothedValue(object):
			
 
				-    """Track a series of values and provide access to smoothed values over a
			
 
				-    window or the global series average.
			
 
				-    """
			
 
				-    def __init__(self, window_size=20, fmt=None):
			
 
				-        if fmt is None:
			
 
				-            fmt = "{median:.4f} ({global_avg:.4f})"
			
 
				-        self.deque = deque(maxlen=window_size)
			
 
				-        self.total = 0.0
			
 
				-        self.count = 0
			
 
				-        self.fmt = fmt
			
 
				-
			
 
				-    def update(self, value, n=1):
			
 
				-        self.deque.append(value)
			
 
				-        self.count += n
			
 
				-        self.total += value * n
			
 
				-
			
 
				-    @property
			
 
				-    def median(self):
			
 
				-        d = torch.tensor(list(self.deque))
			
 
				-        return d.median().item()
			
 
				-
			
 
				-    @property
			
 
				-    def avg(self):
			
 
				-        d = torch.tensor(list(self.deque), dtype=torch.float32)
			
 
				-        return d.mean().item()
			
 
				-
			
 
				-    @property
			
 
				-    def global_avg(self):
			
 
				-        return self.total / self.count
			
 
				-
			
 
				-    @property
			
 
				-    def max(self):
			
 
				-        return max(self.deque)
			
 
				-
			
 
				-    @property
			
 
				-    def value(self):
			
 
				-        return self.deque[-1]
			
 
				-
			
 
				-    def __str__(self):
			
 
				-        return self.fmt.format(
			
 
				-            median=self.median,
			
 
				-            avg=self.avg,
			
 
				-            global_avg=self.global_avg,
			
 
				-            max=self.max,
			
 
				-            value=self.value)
			
 
				-
			
 
				-class MetricLogger(object):
			
 
				-    def __init__(self, delimiter="\t"):
			
 
				-        self.meters = defaultdict(SmoothedValue)
			
 
				-        self.delimiter = delimiter
			
 
				-
			
 
				-    def update(self, **kwargs):
			
 
				-        for k, v in kwargs.items():
			
 
				-            if v is None:
			
 
				-                continue
			
 
				-            if isinstance(v, torch.Tensor):
			
 
				-                v = v.item()
			
 
				-            assert isinstance(v, (float, int))
			
 
				-            self.meters[k].update(v)
			
 
				-
			
 
				-    def __getattr__(self, attr):
			
 
				-        if attr in self.meters:
			
 
				-            return self.meters[attr]
			
 
				-        if attr in self.__dict__:
			
 
				-            return self.__dict__[attr]
			
 
				-        raise AttributeError("'{}' object has no attribute '{}'".format(
			
 
				-            type(self).__name__, attr))
			
 
				-
			
 
				-    def __str__(self):
			
 
				-        loss_str = []
			
 
				-        for name, meter in self.meters.items():
			
 
				-            loss_str.append(
			
 
				-                "{}: {}".format(name, str(meter))
			
 
				-            )
			
 
				-        return self.delimiter.join(loss_str)
			
 
				-
			
 
				-    def add_meter(self, name, meter):
			
 
				-        self.meters[name] = meter
			
 
				-
			
 
				-    def log_every(self, iterable, print_freq, header=None):
			
 
				-        i = 0
			
 
				-        if not header:
			
 
				-            header = ''
			
 
				-        start_time = time.time()
			
 
				-        end = time.time()
			
 
				-        iter_time = SmoothedValue(fmt='{avg:.4f}')
			
 
				-        data_time = SmoothedValue(fmt='{avg:.4f}')
			
 
				-        space_fmt = ':' + str(len(str(len(iterable)))) + 'd'
			
 
				-        log_msg = [
			
 
				-            header,
			
 
				-            '[{0' + space_fmt + '}/{1}]',
			
 
				-            'eta: {eta}',
			
 
				-            '{meters}',
			
 
				-            'time: {time}',
			
 
				-            'data: {data}'
			
 
				-        ]
			
 
				-        if torch.cuda.is_available():
			
 
				-            log_msg.append('max mem: {memory:.0f}')
			
 
				-        log_msg = self.delimiter.join(log_msg)
			
 
				-        MB = 1024.0 * 1024.0
			
 
				-        for obj in iterable:
			
 
				-            data_time.update(time.time() - end)
			
 
				-            yield obj
			
 
				-            iter_time.update(time.time() - end)
			
 
				-            if i % print_freq == 0 or i == len(iterable) - 1:
			
 
				-                eta_seconds = iter_time.global_avg * (len(iterable) - i)
			
 
				-                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
			
 
				-                if torch.cuda.is_available():
			
 
				-                    print(log_msg.format(
			
 
				-                        i, len(iterable), eta=eta_string,
			
 
				-                        meters=str(self),
			
 
				-                        time=str(iter_time), data=str(data_time),
			
 
				-                        memory=torch.cuda.max_memory_allocated() / MB))
			
 
				-                else:
			
 
				-                    print(log_msg.format(
			
 
				-                        i, len(iterable), eta=eta_string,
			
 
				-                        meters=str(self),
			
 
				-                        time=str(iter_time), data=str(data_time)))
			
 
				-            i += 1
			
 
				-            end = time.time()
			
 
				-        total_time = time.time() - start_time
			
 
				-        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
			
 
				-        print('{} Total time: {} ({:.4f} s / it)'.format(
			
 
				-            header, total_time_str, total_time / len(iterable)))
			
 
				-
			
 
				-
			
 
				-# ---------------------- Model functions ----------------------
			
 
				-def load_model(args, model, optimizer, lr_scheduler):
			
 
				-    if args.resume and args.resume.lower() != 'none':
			
 
				-        print("=================== Load checkpoint ===================")
			
 
				-        if args.resume.startswith('https'):
			
 
				-            checkpoint = torch.hub.load_state_dict_from_url(
			
 
				-                args.resume, map_location='cpu', check_hash=True)
			
 
				-        else:
			
 
				-            checkpoint = torch.load(args.resume, map_location='cpu')
			
 
				-        model.load_state_dict(checkpoint['model'])
			
 
				-        print("Resume checkpoint %s" % args.resume)
			
 
				-        
			
 
				-        if 'optimizer' in checkpoint and 'epoch' in checkpoint and not (hasattr(args, 'eval') and args.eval):
			
 
				-            print('- Load optimizer from the checkpoint. ')
			
 
				-            optimizer.load_state_dict(checkpoint['optimizer'])
			
 
				-            args.start_epoch = checkpoint['epoch'] + 1
			
 
				-
			
 
				-        if 'lr_scheduler' in checkpoint:
			
 
				-            print('- Load lr scheduler from the checkpoint. ')
			
 
				-            lr_scheduler.load_state_dict(checkpoint.pop("lr_scheduler"))
			
 
				-
			
 
				-def save_model(args, epoch, model, optimizer, lr_scheduler, acc1=None, mae_task=False):
			
 
				-    output_dir = Path(args.output_dir)
			
 
				-    epoch_name = str(epoch)
			
 
				-    if acc1 is not None:
			
 
				-        checkpoint_paths = [output_dir / ('checkpoint-{}-Acc1-{:.2f}.pth'.format(epoch_name, acc1))]
			
 
				-    else:
			
 
				-        checkpoint_paths = [output_dir / ('checkpoint-{}.pth'.format(epoch_name))]
			
 
				-    for checkpoint_path in checkpoint_paths:
			
 
				-        to_save = {
			
 
				-            'model': model.state_dict(),
			
 
				-            'optimizer': optimizer.state_dict(),
			
 
				-            'lr_scheduler': lr_scheduler.state_dict(),
			
 
				-            'epoch': epoch,
			
 
				-            'args': args,
			
 
				-        }
			
 
				-        if mae_task:
			
 
				-            to_save['encoder'] = model.mae_encoder.state_dict()
			
 
				-        torch.save(to_save, checkpoint_path)
			
 
				-
			
 
				-
			
 
				-# ---------------------- Patch operations ----------------------
			
 
				-def patchify(imgs, patch_size):
			
 
				-    """
			
 
				-    imgs: (B, 3, H, W)
			
 
				-    x: (N, L, patch_size**2 *3)
			
 
				-    """
			
 
				-    p = patch_size
			
 
				-    assert imgs.shape[2] == imgs.shape[3] and imgs.shape[2] % p == 0
			
 
				-
			
 
				-    h = w = imgs.shape[2] // p
			
 
				-    x = imgs.reshape(shape=(imgs.shape[0], 3, h, p, w, p))
			
 
				-    x = torch.einsum('nchpwq->nhwpqc', x)
			
 
				-    x = x.reshape(shape=(imgs.shape[0], h * w, p**2 * 3))
			
 
				-
			
 
				-    return x
			
 
				-
			
 
				-def unpatchify(x, patch_size):
			
 
				-    """
			
 
				-    x: (B, N, patch_size**2 *3)
			
 
				-    imgs: (B, 3, H, W)
			
 
				-    """
			
 
				-    p = patch_size
			
 
				-    h = w = int(x.shape[1]**.5)
			
 
				-    assert h * w == x.shape[1]
			
 
				-    
			
 
				-    x = x.reshape(shape=(x.shape[0], h, w, p, p, 3))
			
 
				-    x = torch.einsum('nhwpqc->nchpwq', x)
			
 
				-    imgs = x.reshape(shape=(x.shape[0], 3, h * p, h * p))
			
 
				-
			
 
				-    return imgs
			
--- a/masked_image_modeling/utils/optimizer.py
+++ b/masked_image_modeling/utils/optimizer.py
@@ -1,25 +0,0 @@
 
				-import torch
			
 
				-
			
 
				-
			
 
				-def build_optimizer(args, model):
			
 
				-    ## learning rate
			
 
				-    if args.optimizer == "adamw":
			
 
				-        args.base_lr = args.base_lr / 256 * args.batch_size
			
 
				-        optimizer = torch.optim.AdamW(model.parameters(),
			
 
				-                                      lr=args.base_lr,
			
 
				-                                      weight_decay=args.weight_decay)
			
 
				-    elif args.optimizer == "sgd":
			
 
				-        args.base_lr = args.base_lr / 256 * args.batch_size
			
 
				-        optimizer = torch.optim.SGD(model.parameters(),
			
 
				-                                    lr=args.base_lr,
			
 
				-                                    momentum=0.9,
			
 
				-                                    weight_decay=args.weight_decay)
			
 
				-    else:
			
 
				-        raise NotImplementedError("Unknown optimizer: {}".format(args.optimizer))
			
 
				-
			
 
				-    print("=================== Optimizer information ===================")
			
 
				-    print("Optimizer: ", args.optimizer)
			
 
				-    print('- base lr: ', args.base_lr)
			
 
				-    print('- min  lr: ', args.min_lr)
			
 
				-
			
 
				-    return optimizer
			
--- a/yolo/.gitignore
+++ b/yolo/.gitignore
@@ -8,3 +8,4 @@ weights
 
				 __pycache__
			
 
				 det_results
			
 
				 .vscode
			
 
				+odlab/
			
--- a/yolo/benchmark.py
+++ b/yolo/benchmark.py
@@ -1,110 +0,0 @@
 
				-import argparse
			
 
				-import time
			
 
				-import torch
			
 
				-
			
 
				-# load transform
			
 
				-from dataset.build import build_dataset, build_transform
			
 
				-
			
 
				-# load some utils
			
 
				-from utils.misc import load_weight, compute_flops
			
 
				-from config import build_config
			
 
				-from models import build_model
			
 
				-
			
 
				-
			
 
				-def parse_args():
			
 
				-    parser = argparse.ArgumentParser(description='Real-time Object Detection LAB')
			
 
				-    # Basic setting
			
 
				-    parser.add_argument('-size', '--img_size', default=640, type=int,
			
 
				-                        help='the max size of input image')
			
 
				-    parser.add_argument('--cuda', action='store_true', default=False, 
			
 
				-                        help='use cuda.')
			
 
				-
			
 
				-    # Model setting
			
 
				-    parser.add_argument('-m', '--model', default='yolov1_r18', type=str,
			
 
				-                        help='build yolo')
			
 
				-    parser.add_argument('--weight', default=None,
			
 
				-                        type=str, help='Trained state_dict file path to open')
			
 
				-    parser.add_argument('--fuse_conv_bn', action='store_true', default=False,
			
 
				-                        help='fuse Conv & BN')
			
 
				-
			
 
				-    # Data setting
			
 
				-    parser.add_argument('--root', default='D:/python_work/dataset/COCO/',
			
 
				-                        help='data root')
			
 
				-
			
 
				-    return parser.parse_args()
			
 
				-
			
 
				-
			
 
				-@torch.no_grad()
			
 
				-def test_det(model, 
			
 
				-             device, 
			
 
				-             dataset,
			
 
				-             transform=None
			
 
				-             ):
			
 
				-    # Step-1: Compute FLOPs and Params
			
 
				-    compute_flops(model, cfg.test_img_size, device)
			
 
				-
			
 
				-    # Step-2: Compute FPS
			
 
				-    num_images = 2002
			
 
				-    total_time = 0
			
 
				-    count = 0
			
 
				-    with torch.no_grad():
			
 
				-        for index in range(num_images):
			
 
				-            if index % 500 == 0:
			
 
				-                print('Testing image {:d}/{:d}....'.format(index+1, num_images))
			
 
				-
			
 
				-            # Load an image
			
 
				-            image, _ = dataset.pull_image(index)
			
 
				-
			
 
				-            # Preprocess
			
 
				-            x, _, ratio = transform(image)
			
 
				-            x = x.unsqueeze(0).to(device)
			
 
				-
			
 
				-            # Start
			
 
				-            torch.cuda.synchronize()
			
 
				-            start_time = time.perf_counter()   
			
 
				-
			
 
				-            # Inference
			
 
				-            outputs = model(x)
			
 
				-
			
 
				-            # End
			
 
				-            torch.cuda.synchronize()
			
 
				-            elapsed = time.perf_counter() - start_time
			
 
				-        
			
 
				-            if index > 1:
			
 
				-                total_time += elapsed
			
 
				-                count += 1
			
 
				-
			
 
				-        print('- FPS :', 1.0 / (total_time / count))
			
 
				-
			
 
				-if __name__ == '__main__':
			
 
				-    args = parse_args()
			
 
				-    # cuda
			
 
				-    if args.cuda:
			
 
				-        print('use cuda')
			
 
				-        device = torch.device("cuda")
			
 
				-    else:
			
 
				-        device = torch.device("cpu")
			
 
				-
			
 
				-    # Model Config
			
 
				-    cfg = build_config(args)
			
 
				-
			
 
				-    # Transform
			
 
				-    transform = build_transform(cfg, is_train=False)
			
 
				-
			
 
				-    # Dataset
			
 
				-    args.dataset = 'coco'
			
 
				-    dataset = build_dataset(args, cfg, transform, is_train=False)
			
 
				-
			
 
				-    # Build model
			
 
				-    model = build_model(args, cfg, is_val=False)
			
 
				-
			
 
				-    # Load trained weight
			
 
				-    model = load_weight(model, args.weight, args.fuse_conv_bn, rep_conv=True)
			
 
				-    model.to(device).eval()
			
 
				-        
			
 
				-    # Run
			
 
				-    test_det(model     = model, 
			
 
				-             device    = device, 
			
 
				-             dataset   = dataset,
			
 
				-             transform = transform,
			
 
				-             )
			
--- a/yolo/config/__init__.py
+++ b/yolo/config/__init__.py
@@ -6,7 +6,7 @@ from .yolov5_config     import build_yolov5_config
 
				 from .yolov5_af_config  import build_yolov5af_config
			
 
				 from .yolov6_config     import build_yolov6_config
			
 
				 from .yolov8_config     import build_yolov8_config
			
 
				-from .gelan_config      import build_gelan_config
			
 
				+from .yolov9_config     import build_gelan_config
			
 
				 from .rtdetr_config     import build_rtdetr_config
			
 
				 
			
 
				 
			
--- a/masked_image_modeling/utils/__init__.py
+++ b/masked_image_modeling/utils/__init__.py
--- a/yolo/models/gelan/README.md
+++ b/yolo/models/gelan/README.md
--- a/yolo/config/yolof_config.py
+++ b/yolo/config/yolof_config.py
--- a/yolo/config/yolov10_config.py
+++ b/yolo/config/yolov10_config.py
--- a/yolo/config/yolov11_config.py
+++ b/yolo/config/yolov11_config.py
--- a/yolo/config/yolov4_config.py
+++ b/yolo/config/yolov4_config.py
--- a/yolo/config/yolov7_config.py
+++ b/yolo/config/yolov7_config.py
--- a/yolo/config/yolov9_config.py
+++ b/yolo/config/yolov9_config.py
@@ -106,8 +106,6 @@ class GElanBaseConfig(object):
 
				 
			
 
				         # ---------------- Data process config ----------------
			
 
				         self.aug_type = 'yolo'
			
 
				-        self.box_format = 'xyxy'
			
 
				-        self.normalize_coords = False
			
 
				         self.mosaic_prob = 0.0
			
 
				         self.mixup_prob  = 0.0
			
 
				         self.copy_paste  = 0.0           # approximated by the YOLOX's mixup
			
--- a/yolo/models/__init__.py
+++ b/yolo/models/__init__.py
@@ -9,7 +9,7 @@ from .yolov5.build     import build_yolov5
 
				 from .yolov5_af.build  import build_yolov5af
			
 
				 from .yolov6.build     import build_yolov6
			
 
				 from .yolov8.build     import build_yolov8
			
 
				-from .gelan.build      import build_gelan
			
 
				+from .yolov9.build     import build_gelan
			
 
				 from .rtdetr.build     import build_rtdetr
			
 
				 
			
 
				 
			
--- a/yolo/models/detr/build.py
+++ b/yolo/models/detr/build.py
--- a/yolo/models/detr/detr.py
+++ b/yolo/models/detr/detr.py
--- a/yolo/models/detr/detr_backbone.py
+++ b/yolo/models/detr/detr_backbone.py
--- a/yolo/models/detr/detr_transformer.py
+++ b/yolo/models/detr/detr_transformer.py
--- a/yolo/models/detr/loss.py
+++ b/yolo/models/detr/loss.py
--- a/yolo/models/detr/matcher.py
+++ b/yolo/models/detr/matcher.py
--- a/yolo/models/detr/modules.py
+++ b/yolo/models/detr/modules.py
@@ -0,0 +1,148 @@
 
				+import torch
			
 
				+import torch.nn as nn
			
 
				+from typing import List
			
 
				+
			
 
				+
			
 
				+# --------------------- Basic modules ---------------------
			
 
				+def get_conv2d(c1, c2, k, p, s, d, g, bias=False):
			
 
				+    conv = nn.Conv2d(c1, c2, k, stride=s, padding=p, dilation=d, groups=g, bias=bias)
			
 
				+
			
 
				+    return conv
			
 
				+
			
 
				+def get_activation(act_type=None):
			
 
				+    if act_type == 'relu':
			
 
				+        return nn.ReLU(inplace=True)
			
 
				+    elif act_type == 'lrelu':
			
 
				+        return nn.LeakyReLU(0.1, inplace=True)
			
 
				+    elif act_type == 'mish':
			
 
				+        return nn.Mish(inplace=True)
			
 
				+    elif act_type == 'silu':
			
 
				+        return nn.SiLU(inplace=True)
			
 
				+    elif act_type is None:
			
 
				+        return nn.Identity()
			
 
				+    else:
			
 
				+        raise NotImplementedError
			
 
				+        
			
 
				+def get_norm(norm_type, dim):
			
 
				+    if norm_type == 'BN':
			
 
				+        return nn.BatchNorm2d(dim)
			
 
				+    elif norm_type == 'GN':
			
 
				+        return nn.GroupNorm(num_groups=32, num_channels=dim)
			
 
				+    elif norm_type is None:
			
 
				+        return nn.Identity()
			
 
				+    else:
			
 
				+        raise NotImplementedError
			
 
				+
			
 
				+class BasicConv(nn.Module):
			
 
				+    def __init__(self, 
			
 
				+                 in_dim,                   # in channels
			
 
				+                 out_dim,                  # out channels 
			
 
				+                 kernel_size=1,            # kernel size 
			
 
				+                 padding=0,                # padding
			
 
				+                 stride=1,                 # padding
			
 
				+                 dilation=1,               # dilation
			
 
				+                 act_type  :str = 'lrelu', # activation
			
 
				+                 norm_type :str = 'BN',    # normalization
			
 
				+                 depthwise :bool = False
			
 
				+                ):
			
 
				+        super(BasicConv, self).__init__()
			
 
				+        self.depthwise = depthwise
			
 
				+        use_bias = False if norm_type is not None else True
			
 
				+        if not depthwise:
			
 
				+            self.conv = get_conv2d(in_dim, out_dim, k=kernel_size, p=padding, s=stride, d=dilation, g=1, bias=use_bias)
			
 
				+            self.norm = get_norm(norm_type, out_dim)
			
 
				+        else:
			
 
				+            self.conv1 = get_conv2d(in_dim, in_dim, k=kernel_size, p=padding, s=stride, d=dilation, g=in_dim, bias=use_bias)
			
 
				+            self.norm1 = get_norm(norm_type, in_dim)
			
 
				+            self.conv2 = get_conv2d(in_dim, out_dim, k=1, p=0, s=1, d=1, g=1)
			
 
				+            self.norm2 = get_norm(norm_type, out_dim)
			
 
				+        self.act  = get_activation(act_type)
			
 
				+
			
 
				+    def forward(self, x):
			
 
				+        if not self.depthwise:
			
 
				+            return self.act(self.norm(self.conv(x)))
			
 
				+        else:
			
 
				+            # Depthwise conv
			
 
				+            x = self.norm1(self.conv1(x))
			
 
				+            # Pointwise conv
			
 
				+            x = self.act(self.norm2(self.conv2(x)))
			
 
				+            return x
			
 
				+
			
 
				+
			
 
				+# --------------------- ResNet modules ---------------------
			
 
				+def conv3x3(in_planes, out_planes, stride=1):
			
 
				+    """3x3 convolution with padding"""
			
 
				+    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
			
 
				+                     padding=1, bias=False)
			
 
				+
			
 
				+def conv1x1(in_planes, out_planes, stride=1):
			
 
				+    """1x1 convolution"""
			
 
				+    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
			
 
				+
			
 
				+class BasicBlock(nn.Module):
			
 
				+    expansion = 1
			
 
				+
			
 
				+    def __init__(self, inplanes, planes, stride=1, downsample=None):
			
 
				+        super(BasicBlock, self).__init__()
			
 
				+        self.conv1 = conv3x3(inplanes, planes, stride)
			
 
				+        self.bn1 = nn.BatchNorm2d(planes)
			
 
				+        self.relu = nn.ReLU(inplace=True)
			
 
				+        self.conv2 = conv3x3(planes, planes)
			
 
				+        self.bn2 = nn.BatchNorm2d(planes)
			
 
				+        self.downsample = downsample
			
 
				+        self.stride = stride
			
 
				+
			
 
				+    def forward(self, x):
			
 
				+        identity = x
			
 
				+
			
 
				+        out = self.conv1(x)
			
 
				+        out = self.bn1(out)
			
 
				+        out = self.relu(out)
			
 
				+
			
 
				+        out = self.conv2(out)
			
 
				+        out = self.bn2(out)
			
 
				+
			
 
				+        if self.downsample is not None:
			
 
				+            identity = self.downsample(x)
			
 
				+
			
 
				+        out += identity
			
 
				+        out = self.relu(out)
			
 
				+
			
 
				+        return out
			
 
				+
			
 
				+class Bottleneck(nn.Module):
			
 
				+    expansion = 4
			
 
				+
			
 
				+    def __init__(self, inplanes, planes, stride=1, downsample=None):
			
 
				+        super(Bottleneck, self).__init__()
			
 
				+        self.conv1 = conv1x1(inplanes, planes)
			
 
				+        self.bn1 = nn.BatchNorm2d(planes)
			
 
				+        self.conv2 = conv3x3(planes, planes, stride)
			
 
				+        self.bn2 = nn.BatchNorm2d(planes)
			
 
				+        self.conv3 = conv1x1(planes, planes * self.expansion)
			
 
				+        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
			
 
				+        self.relu = nn.ReLU(inplace=True)
			
 
				+        self.downsample = downsample
			
 
				+        self.stride = stride
			
 
				+
			
 
				+    def forward(self, x):
			
 
				+        identity = x
			
 
				+
			
 
				+        out = self.conv1(x)
			
 
				+        out = self.bn1(out)
			
 
				+        out = self.relu(out)
			
 
				+
			
 
				+        out = self.conv2(out)
			
 
				+        out = self.bn2(out)
			
 
				+        out = self.relu(out)
			
 
				+
			
 
				+        out = self.conv3(out)
			
 
				+        out = self.bn3(out)
			
 
				+
			
 
				+        if self.downsample is not None:
			
 
				+            identity = self.downsample(x)
			
 
				+
			
 
				+        out += identity
			
 
				+        out = self.relu(out)
			
 
				+
			
 
				+        return out
			
--- a/yolo/models/fcos/build.py
+++ b/yolo/models/fcos/build.py
--- a/yolo/models/fcos/fcos.py
+++ b/yolo/models/fcos/fcos.py
--- a/yolo/models/fcos/fcos_backbone.py
+++ b/yolo/models/fcos/fcos_backbone.py
--- a/yolo/models/fcos/fcos_fpn.py
+++ b/yolo/models/fcos/fcos_fpn.py
@@ -0,0 +1,68 @@
 
				+import torch
			
 
				+import torch.nn as nn
			
 
				+import torch.nn.functional as F
			
 
				+
			
 
				+# ------------------ Basic Feature Pyramid Network ------------------
			
 
				+class BasicFPN(nn.Module):
			
 
				+    def __init__(self, cfg, 
			
 
				+                 in_dims=[512, 1024, 2048],
			
 
				+                 out_dim=256,
			
 
				+                 ):
			
 
				+        super().__init__()
			
 
				+        # ------------------ Basic parameters -------------------
			
 
				+        self.p6_feat = cfg.fpn_p6_feat
			
 
				+        self.p7_feat = cfg.fpn_p7_feat
			
 
				+        self.from_c5 = cfg.fpn_p6_from_c5
			
 
				+
			
 
				+        # ------------------ Network parameters -------------------
			
 
				+        ## latter layers
			
 
				+        self.input_projs = nn.ModuleList()
			
 
				+        self.smooth_layers = nn.ModuleList()
			
 
				+        for in_dim in in_dims[::-1]:
			
 
				+            self.input_projs.append(nn.Conv2d(in_dim, out_dim, kernel_size=1))
			
 
				+            self.smooth_layers.append(nn.Conv2d(out_dim, out_dim, kernel_size=3, padding=1))
			
 
				+
			
 
				+        ## P6/P7 layers
			
 
				+        if self.p6_feat:
			
 
				+            if self.from_c5:
			
 
				+                self.p6_conv = nn.Conv2d(in_dims[-1], out_dim, kernel_size=3, stride=2, padding=1)
			
 
				+            else: # from p5
			
 
				+                self.p6_conv = nn.Conv2d(out_dim, out_dim, kernel_size=3, stride=2, padding=1)
			
 
				+        if self.p7_feat:
			
 
				+            self.p7_conv = nn.Sequential(
			
 
				+                nn.ReLU(inplace=True),
			
 
				+                nn.Conv2d(out_dim, out_dim, kernel_size=3, stride=2, padding=1)
			
 
				+            )
			
 
				+
			
 
				+    def forward(self, feats):
			
 
				+        """
			
 
				+            feats: (List of Tensor) [C3, C4, C5], C_i ∈ R^(B x C_i x H_i x W_i)
			
 
				+        """
			
 
				+        outputs = []
			
 
				+        # [C3, C4, C5] -> [C5, C4, C3]
			
 
				+        feats = feats[::-1]
			
 
				+        top_level_feat = feats[0]
			
 
				+        prev_feat = self.input_projs[0](top_level_feat)
			
 
				+        outputs.append(self.smooth_layers[0](prev_feat))
			
 
				+
			
 
				+        for feat, input_proj, smooth_layer in zip(feats[1:], self.input_projs[1:], self.smooth_layers[1:]):
			
 
				+            feat = input_proj(feat)
			
 
				+            top_down_feat = F.interpolate(prev_feat, size=feat.shape[2:], mode='nearest')
			
 
				+            prev_feat = feat + top_down_feat
			
 
				+            outputs.insert(0, smooth_layer(prev_feat))
			
 
				+
			
 
				+        if self.p6_feat:
			
 
				+            if self.from_c5:
			
 
				+                p6_feat = self.p6_conv(feats[0])
			
 
				+            else:
			
 
				+                p6_feat = self.p6_conv(outputs[-1])
			
 
				+            # [P3, P4, P5] -> [P3, P4, P5, P6]
			
 
				+            outputs.append(p6_feat)
			
 
				+
			
 
				+            if self.p7_feat:
			
 
				+                p7_feat = self.p7_conv(p6_feat)
			
 
				+                # [P3, P4, P5, P6] -> [P3, P4, P5, P6, P7]
			
 
				+                outputs.append(p7_feat)
			
 
				+
			
 
				+        # [P3, P4, P5] or [P3, P4, P5, P6, P7]
			
 
				+        return outputs
			
--- a/yolo/models/fcos/fcos_head.py
+++ b/yolo/models/fcos/fcos_head.py
@@ -0,0 +1,186 @@
 
				+import torch
			
 
				+import torch.nn as nn
			
 
				+
			
 
				+from .modules import BasicConv
			
 
				+
			
 
				+
			
 
				+class Scale(nn.Module):
			
 
				+    """
			
 
				+    Multiply the output regression range by a learnable constant value
			
 
				+    """
			
 
				+    def __init__(self, init_value=1.0):
			
 
				+        """
			
 
				+        init_value : initial value for the scalar
			
 
				+        """
			
 
				+        super().__init__()
			
 
				+        self.scale = nn.Parameter(
			
 
				+            torch.tensor(init_value, dtype=torch.float32),
			
 
				+            requires_grad=True
			
 
				+        )
			
 
				+
			
 
				+    def forward(self, x):
			
 
				+        """
			
 
				+        input -> scale * input
			
 
				+        """
			
 
				+        return x * self.scale
			
 
				+
			
 
				+class FcosHead(nn.Module):
			
 
				+    def __init__(self, cfg, in_dim, out_dim,):
			
 
				+        super().__init__()
			
 
				+        self.fmp_size = None
			
 
				+        # ------------------ Basic parameters -------------------
			
 
				+        self.cfg = cfg
			
 
				+        self.in_dim = in_dim
			
 
				+        self.stride       = cfg.out_stride
			
 
				+        self.num_classes  = cfg.num_classes
			
 
				+        self.num_cls_head = cfg.num_cls_head
			
 
				+        self.num_reg_head = cfg.num_reg_head
			
 
				+        self.act_type     = cfg.head_act
			
 
				+        self.norm_type    = cfg.head_norm
			
 
				+
			
 
				+        # ------------------ Network parameters -------------------
			
 
				+        ## cls head
			
 
				+        cls_heads = []
			
 
				+        self.cls_head_dim = out_dim
			
 
				+        for i in range(self.num_cls_head):
			
 
				+            if i == 0:
			
 
				+                cls_heads.append(
			
 
				+                    BasicConv(in_dim, self.cls_head_dim,
			
 
				+                              kernel_size=3, padding=1, stride=1, 
			
 
				+                              act_type=self.act_type, norm_type=self.norm_type)
			
 
				+                              )
			
 
				+            else:
			
 
				+                cls_heads.append(
			
 
				+                    BasicConv(self.cls_head_dim, self.cls_head_dim,
			
 
				+                              kernel_size=3, padding=1, stride=1, 
			
 
				+                              act_type=self.act_type, norm_type=self.norm_type)
			
 
				+                              )
			
 
				+        
			
 
				+        ## reg head
			
 
				+        reg_heads = []
			
 
				+        self.reg_head_dim = out_dim
			
 
				+        for i in range(self.num_reg_head):
			
 
				+            if i == 0:
			
 
				+                reg_heads.append(
			
 
				+                    BasicConv(in_dim, self.reg_head_dim,
			
 
				+                              kernel_size=3, padding=1, stride=1, 
			
 
				+                              act_type=self.act_type, norm_type=self.norm_type)
			
 
				+                              )
			
 
				+            else:
			
 
				+                reg_heads.append(
			
 
				+                    BasicConv(self.reg_head_dim, self.reg_head_dim,
			
 
				+                              kernel_size=3, padding=1, stride=1, 
			
 
				+                              act_type=self.act_type, norm_type=self.norm_type)
			
 
				+                              )
			
 
				+        self.cls_heads = nn.Sequential(*cls_heads)
			
 
				+        self.reg_heads = nn.Sequential(*reg_heads)
			
 
				+
			
 
				+        ## pred layers
			
 
				+        self.cls_pred = nn.Conv2d(self.cls_head_dim, cfg.num_classes, kernel_size=3, padding=1)
			
 
				+        self.reg_pred = nn.Conv2d(self.reg_head_dim, 4, kernel_size=3, padding=1)
			
 
				+        self.ctn_pred = nn.Conv2d(self.reg_head_dim, 1, kernel_size=3, padding=1)
			
 
				+        
			
 
				+        ## scale layers
			
 
				+        self.scales = nn.ModuleList(
			
 
				+            Scale() for _ in range(len(self.stride))
			
 
				+        )
			
 
				+        
			
 
				+        # init bias
			
 
				+        self._init_layers()
			
 
				+
			
 
				+    def _init_layers(self):
			
 
				+        for module in [self.cls_heads, self.reg_heads, self.cls_pred, self.reg_pred, self.ctn_pred]:
			
 
				+            for layer in module.modules():
			
 
				+                if isinstance(layer, nn.Conv2d):
			
 
				+                    torch.nn.init.normal_(layer.weight, mean=0, std=0.01)
			
 
				+                    if layer.bias is not None:
			
 
				+                        torch.nn.init.constant_(layer.bias, 0)
			
 
				+                if isinstance(layer, nn.GroupNorm):
			
 
				+                    torch.nn.init.constant_(layer.weight, 1)
			
 
				+                    if layer.bias is not None:
			
 
				+                        torch.nn.init.constant_(layer.bias, 0)
			
 
				+        # init the bias of cls pred
			
 
				+        init_prob = 0.01
			
 
				+        bias_value = -torch.log(torch.tensor((1. - init_prob) / init_prob))
			
 
				+        torch.nn.init.constant_(self.cls_pred.bias, bias_value)
			
 
				+        
			
 
				+    def get_anchors(self, level, fmp_size):
			
 
				+        """
			
 
				+            fmp_size: (List) [H, W]
			
 
				+        """
			
 
				+        # generate grid cells
			
 
				+        fmp_h, fmp_w = fmp_size
			
 
				+        anchor_y, anchor_x = torch.meshgrid([torch.arange(fmp_h), torch.arange(fmp_w)])
			
 
				+        # [H, W, 2] -> [HW, 2]
			
 
				+        anchors = torch.stack([anchor_x, anchor_y], dim=-1).float().view(-1, 2) + 0.5
			
 
				+        anchors *= self.stride[level]
			
 
				+
			
 
				+        return anchors
			
 
				+        
			
 
				+    def decode_boxes(self, pred_deltas, anchors):
			
 
				+        """
			
 
				+            pred_deltas: (List[Tensor]) [B, M, 4] or [M, 4] (l, t, r, b)
			
 
				+            anchors:     (List[Tensor]) [1, M, 2] or [M, 2]
			
 
				+        """
			
 
				+        # x1 = x_anchor - l, x2 = x_anchor + r
			
 
				+        # y1 = y_anchor - t, y2 = y_anchor + b
			
 
				+        pred_x1y1 = anchors - pred_deltas[..., :2]
			
 
				+        pred_x2y2 = anchors + pred_deltas[..., 2:]
			
 
				+        pred_box = torch.cat([pred_x1y1, pred_x2y2], dim=-1)
			
 
				+
			
 
				+        return pred_box
			
 
				+    
			
 
				+    def forward(self, pyramid_feats, mask=None):
			
 
				+        all_masks = []
			
 
				+        all_anchors = []
			
 
				+        all_cls_preds = []
			
 
				+        all_reg_preds = []
			
 
				+        all_box_preds = []
			
 
				+        all_ctn_preds = []
			
 
				+        for level, feat in enumerate(pyramid_feats):
			
 
				+            # ------------------- Decoupled head -------------------
			
 
				+            cls_feat = self.cls_heads(feat)
			
 
				+            reg_feat = self.reg_heads(feat)
			
 
				+
			
 
				+            # ------------------- Generate anchor box -------------------
			
 
				+            B, _, H, W = cls_feat.size()
			
 
				+            fmp_size = [H, W]
			
 
				+            anchors = self.get_anchors(level, fmp_size)   # [M, 4]
			
 
				+            anchors = anchors.to(cls_feat.device)
			
 
				+
			
 
				+            # ------------------- Predict -------------------
			
 
				+            cls_pred = self.cls_pred(cls_feat)
			
 
				+            reg_pred = self.reg_pred(reg_feat)
			
 
				+            ctn_pred = self.ctn_pred(reg_feat)
			
 
				+
			
 
				+            # ------------------- Process preds -------------------
			
 
				+            ## [B, C, H, W] -> [B, H, W, C] -> [B, M, C]
			
 
				+            cls_pred = cls_pred.permute(0, 2, 3, 1).contiguous().view(B, -1, self.num_classes)
			
 
				+            ctn_pred = ctn_pred.permute(0, 2, 3, 1).contiguous().view(B, -1, 1)
			
 
				+            reg_pred = reg_pred.permute(0, 2, 3, 1).contiguous().view(B, -1, 4)
			
 
				+            reg_pred = nn.functional.relu(self.scales[level](reg_pred)) * self.stride[level]
			
 
				+            ## Decode bbox
			
 
				+            box_pred = self.decode_boxes(reg_pred, anchors)
			
 
				+            ## Adjust mask
			
 
				+            if mask is not None:
			
 
				+                # [B, H, W]
			
 
				+                mask_i = torch.nn.functional.interpolate(mask[None].float(), size=[H, W]).bool()[0]
			
 
				+                # [B, H, W] -> [B, M]
			
 
				+                mask_i = mask_i.flatten(1)     
			
 
				+                all_masks.append(mask_i)
			
 
				+                
			
 
				+            all_anchors.append(anchors)
			
 
				+            all_cls_preds.append(cls_pred)
			
 
				+            all_reg_preds.append(reg_pred)
			
 
				+            all_box_preds.append(box_pred)
			
 
				+            all_ctn_preds.append(ctn_pred)
			
 
				+
			
 
				+        outputs = {"pred_cls": all_cls_preds,  # List [B, M, C]
			
 
				+                   "pred_reg": all_reg_preds,  # List [B, M, 4]
			
 
				+                   "pred_box": all_box_preds,  # List [B, M, 4]
			
 
				+                   "pred_ctn": all_ctn_preds,  # List [B, M, 1]
			
 
				+                   "anchors": all_anchors,     # List [B, M, 2]
			
 
				+                   "strides": self.stride,
			
 
				+                   "mask": all_masks}          # List [B, M,]
			
 
				+
			
 
				+        return outputs 
			
--- a/yolo/models/fcos/loss.py
+++ b/yolo/models/fcos/loss.py
@@ -0,0 +1,290 @@
 
				+import torch
			
 
				+import torch.nn as nn
			
 
				+import torch.nn.functional as F
			
 
				+
			
 
				+from utils.box_ops import get_ious
			
 
				+from utils.misc import sigmoid_focal_loss
			
 
				+from utils.distributed_utils import get_world_size, is_dist_avail_and_initialized
			
 
				+
			
 
				+from .matcher import FcosMatcher, AlignedOTAMatcher
			
 
				+
			
 
				+
			
 
				+class SetCriterion(nn.Module):
			
 
				+    def __init__(self, cfg):
			
 
				+        super().__init__()
			
 
				+        # ------------- Basic parameters -------------
			
 
				+        self.cfg = cfg
			
 
				+        self.num_classes = cfg.num_classes
			
 
				+        # ------------- Focal loss -------------
			
 
				+        self.alpha = cfg.focal_loss_alpha
			
 
				+        self.gamma = cfg.focal_loss_gamma
			
 
				+        # ------------- Loss weight -------------
			
 
				+        # ------------- Matcher & Loss weight -------------
			
 
				+        self.matcher_cfg = cfg.matcher_hpy
			
 
				+        if cfg.matcher == 'fcos_matcher':
			
 
				+            self.weight_dict = {'loss_cls': cfg.loss_cls_weight,
			
 
				+                                'loss_reg': cfg.loss_reg_weight,
			
 
				+                                'loss_ctn': cfg.loss_ctn_weight}
			
 
				+            self.matcher = FcosMatcher(cfg.num_classes,
			
 
				+                                       self.matcher_cfg['center_sampling_radius'],
			
 
				+                                       self.matcher_cfg['object_sizes_of_interest'],
			
 
				+                                       [1., 1., 1., 1.]
			
 
				+                                       )
			
 
				+        elif cfg.matcher == 'simota':
			
 
				+            self.weight_dict = {'loss_cls': cfg.loss_cls_weight,
			
 
				+                                'loss_reg': cfg.loss_reg_weight}
			
 
				+            self.matcher = AlignedOTAMatcher(cfg.num_classes,
			
 
				+                                             self.matcher_cfg['soft_center_radius'],
			
 
				+                                             self.matcher_cfg['topk_candidates'])
			
 
				+        else:
			
 
				+            raise NotImplementedError("Unknown matcher: {}.".format(cfg.matcher))
			
 
				+
			
 
				+    def loss_labels(self, pred_cls, tgt_cls, num_boxes=1.0):
			
 
				+        """
			
 
				+            pred_cls: (Tensor) [N, C]
			
 
				+            tgt_cls:  (Tensor) [N, C]
			
 
				+        """
			
 
				+        # cls loss: [V, C]
			
 
				+        loss_cls = sigmoid_focal_loss(pred_cls, tgt_cls, self.alpha, self.gamma)
			
 
				+
			
 
				+        return loss_cls.sum() / num_boxes
			
 
				+
			
 
				+    def loss_labels_qfl(self, pred_cls, target, beta=2.0, num_boxes=1.0):
			
 
				+        # Quality FocalLoss
			
 
				+        """
			
 
				+            pred_cls: (torch.Tensor): [N, C]。
			
 
				+            target:   (tuple([torch.Tensor], [torch.Tensor])): label -> (N,), score -> (N)
			
 
				+        """
			
 
				+        label, score = target
			
 
				+        pred_sigmoid = pred_cls.sigmoid()
			
 
				+        scale_factor = pred_sigmoid
			
 
				+        zerolabel = scale_factor.new_zeros(pred_cls.shape)
			
 
				+
			
 
				+        ce_loss = F.binary_cross_entropy_with_logits(
			
 
				+            pred_cls, zerolabel, reduction='none') * scale_factor.pow(beta)
			
 
				+        
			
 
				+        bg_class_ind = pred_cls.shape[-1]
			
 
				+        pos = ((label >= 0) & (label < bg_class_ind)).nonzero().squeeze(1)
			
 
				+        if pos.shape[0] > 0:
			
 
				+            pos_label = label[pos].long()
			
 
				+
			
 
				+            scale_factor = score[pos] - pred_sigmoid[pos, pos_label]
			
 
				+
			
 
				+            ce_loss[pos, pos_label] = F.binary_cross_entropy_with_logits(
			
 
				+                pred_cls[pos, pos_label], score[pos],
			
 
				+                reduction='none') * scale_factor.abs().pow(beta)
			
 
				+
			
 
				+        return ce_loss.sum() / num_boxes
			
 
				+    
			
 
				+    def loss_bboxes_ltrb(self, pred_delta, tgt_delta, bbox_quality=None, num_boxes=1.0):
			
 
				+        """
			
 
				+            pred_box: (Tensor) [N, 4]
			
 
				+            tgt_box:  (Tensor) [N, 4]
			
 
				+        """
			
 
				+        pred_delta = torch.cat((-pred_delta[..., :2], pred_delta[..., 2:]), dim=-1)
			
 
				+        tgt_delta = torch.cat((-tgt_delta[..., :2], tgt_delta[..., 2:]), dim=-1)
			
 
				+
			
 
				+        eps = torch.finfo(torch.float32).eps
			
 
				+
			
 
				+        pred_area = (pred_delta[..., 2] - pred_delta[..., 0]).clamp_(min=0) \
			
 
				+            * (pred_delta[..., 3] - pred_delta[..., 1]).clamp_(min=0)
			
 
				+        tgt_area = (tgt_delta[..., 2] - tgt_delta[..., 0]).clamp_(min=0) \
			
 
				+            * (tgt_delta[..., 3] - tgt_delta[..., 1]).clamp_(min=0)
			
 
				+
			
 
				+        w_intersect = (torch.min(pred_delta[..., 2], tgt_delta[..., 2])
			
 
				+                    - torch.max(pred_delta[..., 0], tgt_delta[..., 0])).clamp_(min=0)
			
 
				+        h_intersect = (torch.min(pred_delta[..., 3], tgt_delta[..., 3])
			
 
				+                    - torch.max(pred_delta[..., 1], tgt_delta[..., 1])).clamp_(min=0)
			
 
				+
			
 
				+        area_intersect = w_intersect * h_intersect
			
 
				+        area_union = tgt_area + pred_area - area_intersect
			
 
				+        ious = area_intersect / area_union.clamp(min=eps)
			
 
				+
			
 
				+        # giou
			
 
				+        g_w_intersect = torch.max(pred_delta[..., 2], tgt_delta[..., 2]) \
			
 
				+            - torch.min(pred_delta[..., 0], tgt_delta[..., 0])
			
 
				+        g_h_intersect = torch.max(pred_delta[..., 3], tgt_delta[..., 3]) \
			
 
				+            - torch.min(pred_delta[..., 1], tgt_delta[..., 1])
			
 
				+        ac_uion = g_w_intersect * g_h_intersect
			
 
				+        gious = ious - (ac_uion - area_union) / ac_uion.clamp(min=eps)
			
 
				+        loss_box = 1 - gious
			
 
				+
			
 
				+        if bbox_quality is not None:
			
 
				+            loss_box = loss_box * bbox_quality.view(loss_box.size())
			
 
				+
			
 
				+        return loss_box.sum() / num_boxes
			
 
				+
			
 
				+    def loss_bboxes_xyxy(self, pred_box, gt_box, num_boxes=1.0, box_weight=None):
			
 
				+        ious = get_ious(pred_box, gt_box, box_mode="xyxy", iou_type='giou')
			
 
				+        loss_box = 1.0 - ious
			
 
				+
			
 
				+        if box_weight is not None:
			
 
				+            loss_box = loss_box.squeeze(-1) * box_weight
			
 
				+
			
 
				+        return loss_box.sum() / num_boxes
			
 
				+    
			
 
				+    def fcos_loss(self, outputs, targets):
			
 
				+        """
			
 
				+            outputs['pred_cls']: (Tensor) [B, M, C]
			
 
				+            outputs['pred_reg']: (Tensor) [B, M, 4]
			
 
				+            outputs['pred_ctn']: (Tensor) [B, M, 1]
			
 
				+            outputs['strides']: (List) [8, 16, 32, ...] stride of the model output
			
 
				+            targets: (List) [dict{'boxes': [...], 
			
 
				+                                 'labels': [...], 
			
 
				+                                 'orig_size': ...}, ...]
			
 
				+        """
			
 
				+        # -------------------- Pre-process --------------------
			
 
				+        device = outputs['pred_cls'][0].device
			
 
				+        fpn_strides = outputs['strides']
			
 
				+        anchors = outputs['anchors']
			
 
				+        pred_cls = torch.cat(outputs['pred_cls'], dim=1).view(-1, self.num_classes)
			
 
				+        pred_delta = torch.cat(outputs['pred_reg'], dim=1).view(-1, 4)
			
 
				+        pred_ctn = torch.cat(outputs['pred_ctn'], dim=1).view(-1, 1)
			
 
				+        masks = ~torch.cat(outputs['mask'], dim=1).view(-1)
			
 
				+
			
 
				+        # -------------------- Label Assignment --------------------
			
 
				+        gt_classes, gt_deltas, gt_centerness = self.matcher(fpn_strides, anchors, targets)
			
 
				+        gt_classes = gt_classes.flatten().to(device)
			
 
				+        gt_deltas = gt_deltas.view(-1, 4).to(device)
			
 
				+        gt_centerness = gt_centerness.view(-1, 1).to(device)
			
 
				+
			
 
				+        foreground_idxs = (gt_classes >= 0) & (gt_classes != self.num_classes)
			
 
				+        num_foreground = foreground_idxs.sum()
			
 
				+
			
 
				+        if is_dist_avail_and_initialized():
			
 
				+            torch.distributed.all_reduce(num_foreground)
			
 
				+        num_foreground = torch.clamp(num_foreground / get_world_size(), min=1).item()
			
 
				+
			
 
				+        num_foreground_centerness = gt_centerness[foreground_idxs].sum()
			
 
				+        if is_dist_avail_and_initialized():
			
 
				+            torch.distributed.all_reduce(num_foreground_centerness)
			
 
				+        num_targets = torch.clamp(num_foreground_centerness / get_world_size(), min=1).item()
			
 
				+
			
 
				+        # -------------------- classification loss --------------------
			
 
				+        gt_classes_target = torch.zeros_like(pred_cls)
			
 
				+        gt_classes_target[foreground_idxs, gt_classes[foreground_idxs]] = 1
			
 
				+        valid_idxs = (gt_classes >= 0) & masks
			
 
				+        loss_labels = self.loss_labels(
			
 
				+            pred_cls[valid_idxs], gt_classes_target[valid_idxs], num_foreground)
			
 
				+
			
 
				+        # -------------------- regression loss --------------------
			
 
				+        loss_bboxes = self.loss_bboxes_ltrb(
			
 
				+            pred_delta[foreground_idxs], gt_deltas[foreground_idxs], gt_centerness[foreground_idxs], num_targets)
			
 
				+
			
 
				+        # -------------------- centerness loss --------------------
			
 
				+        loss_centerness = F.binary_cross_entropy_with_logits(
			
 
				+            pred_ctn[foreground_idxs],  gt_centerness[foreground_idxs], reduction='none')
			
 
				+        loss_centerness = loss_centerness.sum() / num_foreground
			
 
				+
			
 
				+        total_loss = loss_labels * self.weight_dict["loss_cls"] + \
			
 
				+                     loss_bboxes * self.weight_dict["loss_reg"] + \
			
 
				+                     loss_centerness * self.weight_dict["loss_ctn"]
			
 
				+        loss_dict = dict(
			
 
				+                loss_cls = loss_labels,
			
 
				+                loss_reg = loss_bboxes,
			
 
				+                loss_ctn = loss_centerness,
			
 
				+                losses   = total_loss,
			
 
				+        )
			
 
				+
			
 
				+        return loss_dict
			
 
				+    
			
 
				+    def ota_loss(self, outputs, targets):
			
 
				+        """
			
 
				+            outputs['pred_cls']: (Tensor) [B, M, C]
			
 
				+            outputs['pred_reg']: (Tensor) [B, M, 4]
			
 
				+            outputs['pred_box']: (Tensor) [B, M, 4]
			
 
				+            outputs['strides']: (List) [8, 16, 32, ...] stride of the model output
			
 
				+            targets: (List) [dict{'boxes': [...], 
			
 
				+                                 'labels': [...], 
			
 
				+                                 'orig_size': ...}, ...]
			
 
				+        """
			
 
				+        # -------------------- Pre-process --------------------
			
 
				+        bs          = outputs['pred_cls'][0].shape[0]
			
 
				+        device      = outputs['pred_cls'][0].device
			
 
				+        fpn_strides = outputs['strides']
			
 
				+        anchors     = outputs['anchors']
			
 
				+        # preds: [B, M, C]
			
 
				+        # preds: [B, M, C]
			
 
				+        cls_preds = torch.cat(outputs['pred_cls'], dim=1)
			
 
				+        box_preds = torch.cat(outputs['pred_box'], dim=1)
			
 
				+        masks = ~torch.cat(outputs['mask'], dim=1).view(-1)
			
 
				+
			
 
				+        # -------------------- Label Assignment --------------------
			
 
				+        cls_targets = []
			
 
				+        box_targets = []
			
 
				+        assign_metrics = []
			
 
				+        for batch_idx in range(bs):
			
 
				+            tgt_labels = targets[batch_idx]["labels"].to(device)  # [N,]
			
 
				+            tgt_bboxes = targets[batch_idx]["boxes"].to(device)   # [N, 4]
			
 
				+            # refine target
			
 
				+            tgt_boxes_wh = tgt_bboxes[..., 2:] - tgt_bboxes[..., :2]
			
 
				+            min_tgt_size = torch.min(tgt_boxes_wh, dim=-1)[0]
			
 
				+            keep = (min_tgt_size >= 8)
			
 
				+            tgt_bboxes = tgt_bboxes[keep]
			
 
				+            tgt_labels = tgt_labels[keep]
			
 
				+            # label assignment
			
 
				+            assigned_result = self.matcher(fpn_strides=fpn_strides,
			
 
				+                                           anchors=anchors,
			
 
				+                                           pred_cls=cls_preds[batch_idx].detach(),
			
 
				+                                           pred_box=box_preds[batch_idx].detach(),
			
 
				+                                           gt_labels=tgt_labels,
			
 
				+                                           gt_bboxes=tgt_bboxes
			
 
				+                                           )
			
 
				+            cls_targets.append(assigned_result['assigned_labels'])
			
 
				+            box_targets.append(assigned_result['assigned_bboxes'])
			
 
				+            assign_metrics.append(assigned_result['assign_metrics'])
			
 
				+
			
 
				+        # List[B, M, C] -> Tensor[BM, C]
			
 
				+        cls_targets = torch.cat(cls_targets, dim=0)
			
 
				+        box_targets = torch.cat(box_targets, dim=0)
			
 
				+        assign_metrics = torch.cat(assign_metrics, dim=0)
			
 
				+
			
 
				+        valid_idxs = (cls_targets >= 0) & masks
			
 
				+        foreground_idxs = (cls_targets >= 0) & (cls_targets != self.num_classes)
			
 
				+        num_fgs = assign_metrics.sum()
			
 
				+
			
 
				+        if is_dist_avail_and_initialized():
			
 
				+            torch.distributed.all_reduce(num_fgs)
			
 
				+        num_fgs = torch.clamp(num_fgs / get_world_size(), min=1).item()
			
 
				+
			
 
				+        # -------------------- classification loss --------------------
			
 
				+        cls_preds = cls_preds.view(-1, self.num_classes)[valid_idxs]
			
 
				+        qfl_targets = (cls_targets[valid_idxs], assign_metrics[valid_idxs])
			
 
				+        loss_labels = self.loss_labels_qfl(cls_preds, qfl_targets, 2.0, num_fgs)
			
 
				+
			
 
				+        # -------------------- regression loss --------------------
			
 
				+        box_preds_pos = box_preds.view(-1, 4)[foreground_idxs]
			
 
				+        box_targets_pos = box_targets[foreground_idxs]
			
 
				+        box_weight = assign_metrics[foreground_idxs]
			
 
				+        loss_bboxes = self.loss_bboxes_xyxy(box_preds_pos, box_targets_pos, num_fgs, box_weight)
			
 
				+
			
 
				+        total_loss = loss_labels * self.weight_dict["loss_cls"] + \
			
 
				+                     loss_bboxes * self.weight_dict["loss_reg"]
			
 
				+        loss_dict = dict(
			
 
				+                loss_cls = loss_labels,
			
 
				+                loss_reg = loss_bboxes,
			
 
				+                losses   = total_loss,
			
 
				+        )
			
 
				+
			
 
				+        return loss_dict
			
 
				+    
			
 
				+    def forward(self, outputs, targets):
			
 
				+        """
			
 
				+            outputs['pred_cls']: (Tensor) [B, M, C]
			
 
				+            outputs['pred_reg']: (Tensor) [B, M, 4]
			
 
				+            outputs['pred_ctn']: (Tensor) [B, M, 1]
			
 
				+            outputs['strides']: (List) [8, 16, 32, ...] stride of the model output
			
 
				+            targets: (List) [dict{'boxes': [...], 
			
 
				+                                 'labels': [...], 
			
 
				+                                 'orig_size': ...}, ...]
			
 
				+        """
			
 
				+        if self.cfg.matcher == "fcos_matcher":
			
 
				+            return self.fcos_loss(outputs, targets)
			
 
				+        elif self.cfg.matcher == "simota":
			
 
				+            return self.ota_loss(outputs, targets)
			
 
				+        else:
			
 
				+            raise NotImplementedError
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    pass
			
--- a/yolo/models/fcos/matcher.py
+++ b/yolo/models/fcos/matcher.py
@@ -0,0 +1,378 @@
 
				+import math
			
 
				+import torch
			
 
				+import torch.nn.functional as F
			
 
				+
			
 
				+from utils.box_ops import *
			
 
				+
			
 
				+
			
 
				+@torch.no_grad()
			
 
				+def get_ious_and_iou_loss(inputs,
			
 
				+                          targets,
			
 
				+                          weight=None,
			
 
				+                          box_mode="xyxy",
			
 
				+                          loss_type="iou",
			
 
				+                          reduction="none"):
			
 
				+    """
			
 
				+    Compute iou loss of type ['iou', 'giou', 'linear_iou']
			
 
				+
			
 
				+    Args:
			
 
				+        inputs (tensor): pred values
			
 
				+        targets (tensor): target values
			
 
				+        weight (tensor): loss weight
			
 
				+        box_mode (str): 'xyxy' or 'ltrb', 'ltrb' is currently supported.
			
 
				+        loss_type (str): 'giou' or 'iou' or 'linear_iou'
			
 
				+        reduction (str): reduction manner
			
 
				+
			
 
				+    Returns:
			
 
				+        loss (tensor): computed iou loss.
			
 
				+    """
			
 
				+    if box_mode == "ltrb":
			
 
				+        inputs = torch.cat((-inputs[..., :2], inputs[..., 2:]), dim=-1)
			
 
				+        targets = torch.cat((-targets[..., :2], targets[..., 2:]), dim=-1)
			
 
				+    elif box_mode != "xyxy":
			
 
				+        raise NotImplementedError
			
 
				+
			
 
				+    eps = torch.finfo(torch.float32).eps
			
 
				+
			
 
				+    inputs_area = (inputs[..., 2] - inputs[..., 0]).clamp_(min=0) \
			
 
				+        * (inputs[..., 3] - inputs[..., 1]).clamp_(min=0)
			
 
				+    targets_area = (targets[..., 2] - targets[..., 0]).clamp_(min=0) \
			
 
				+        * (targets[..., 3] - targets[..., 1]).clamp_(min=0)
			
 
				+
			
 
				+    w_intersect = (torch.min(inputs[..., 2], targets[..., 2])
			
 
				+                   - torch.max(inputs[..., 0], targets[..., 0])).clamp_(min=0)
			
 
				+    h_intersect = (torch.min(inputs[..., 3], targets[..., 3])
			
 
				+                   - torch.max(inputs[..., 1], targets[..., 1])).clamp_(min=0)
			
 
				+
			
 
				+    area_intersect = w_intersect * h_intersect
			
 
				+    area_union = targets_area + inputs_area - area_intersect
			
 
				+    ious = area_intersect / area_union.clamp(min=eps)
			
 
				+
			
 
				+    if loss_type == "iou":
			
 
				+        loss = -ious.clamp(min=eps).log()
			
 
				+    elif loss_type == "linear_iou":
			
 
				+        loss = 1 - ious
			
 
				+    elif loss_type == "giou":
			
 
				+        g_w_intersect = torch.max(inputs[..., 2], targets[..., 2]) \
			
 
				+            - torch.min(inputs[..., 0], targets[..., 0])
			
 
				+        g_h_intersect = torch.max(inputs[..., 3], targets[..., 3]) \
			
 
				+            - torch.min(inputs[..., 1], targets[..., 1])
			
 
				+        ac_uion = g_w_intersect * g_h_intersect
			
 
				+        gious = ious - (ac_uion - area_union) / ac_uion.clamp(min=eps)
			
 
				+        loss = 1 - gious
			
 
				+    else:
			
 
				+        raise NotImplementedError
			
 
				+    if weight is not None:
			
 
				+        loss = loss * weight.view(loss.size())
			
 
				+        if reduction == "mean":
			
 
				+            loss = loss.sum() / max(weight.sum().item(), eps)
			
 
				+    else:
			
 
				+        if reduction == "mean":
			
 
				+            loss = loss.mean()
			
 
				+    if reduction == "sum":
			
 
				+        loss = loss.sum()
			
 
				+
			
 
				+    return ious, loss
			
 
				+
			
 
				+
			
 
				+class FcosMatcher(object):
			
 
				+    """
			
 
				+        This code referenced to https://github.com/Megvii-BaseDetection/cvpods
			
 
				+    """
			
 
				+    def __init__(self, 
			
 
				+                 num_classes,
			
 
				+                 center_sampling_radius,
			
 
				+                 object_sizes_of_interest,
			
 
				+                 box_weights=[1, 1, 1, 1]):
			
 
				+        self.num_classes = num_classes
			
 
				+        self.center_sampling_radius = center_sampling_radius
			
 
				+        self.object_sizes_of_interest = object_sizes_of_interest
			
 
				+        self.box_weightss = box_weights
			
 
				+
			
 
				+
			
 
				+    def get_deltas(self, anchors, boxes):
			
 
				+        """
			
 
				+        Get box regression transformation deltas (dl, dt, dr, db) that can be used
			
 
				+        to transform the `anchors` into the `boxes`. That is, the relation
			
 
				+        ``boxes == self.apply_deltas(deltas, anchors)`` is true.
			
 
				+
			
 
				+        Args:
			
 
				+            anchors (Tensor): anchors, e.g., feature map coordinates
			
 
				+            boxes (Tensor): target of the transformation, e.g., ground-truth
			
 
				+                boxes.
			
 
				+        """
			
 
				+        assert isinstance(anchors, torch.Tensor), type(anchors)
			
 
				+        assert isinstance(boxes, torch.Tensor), type(boxes)
			
 
				+        deltas = torch.cat((anchors - boxes[..., :2], boxes[..., 2:] - anchors),
			
 
				+                           dim=-1) * anchors.new_tensor(self.box_weightss)
			
 
				+        return deltas
			
 
				+
			
 
				+
			
 
				+    @torch.no_grad()
			
 
				+    def __call__(self, fpn_strides, anchors, targets):
			
 
				+        """
			
 
				+            fpn_strides: (List) List[8, 16, 32, ...] stride of network output.
			
 
				+            anchors: (List of Tensor) List[F, M, 2], F = num_fpn_levels
			
 
				+            targets: (Dict) dict{'boxes': [...], 
			
 
				+                                 'labels': [...], 
			
 
				+                                 'orig_size': ...}
			
 
				+        """
			
 
				+        gt_classes = []
			
 
				+        gt_anchors_deltas = []
			
 
				+        gt_centerness = []
			
 
				+        device = anchors[0].device
			
 
				+
			
 
				+        # List[F, M, 2] -> [M, 2]
			
 
				+        anchors_over_all_feature_maps = torch.cat(anchors, dim=0).to(device)
			
 
				+
			
 
				+        for targets_per_image in targets:
			
 
				+            # generate object_sizes_of_interest: List[[M, 2]]
			
 
				+            object_sizes_of_interest = [anchors_i.new_tensor(scale_range).unsqueeze(0).expand(anchors_i.size(0), -1) 
			
 
				+                                        for anchors_i, scale_range in zip(anchors, self.object_sizes_of_interest)]
			
 
				+            # List[F, M, 2] -> [M, 2], M = M1 + M2 + ... + MF
			
 
				+            object_sizes_of_interest = torch.cat(object_sizes_of_interest, dim=0)
			
 
				+            # [N, 4]
			
 
				+            tgt_box = targets_per_image['boxes'].to(device)
			
 
				+            # [N, C]
			
 
				+            tgt_cls = targets_per_image['labels'].to(device)
			
 
				+            # [N, M, 4], M = M1 + M2 + ... + MF
			
 
				+            deltas = self.get_deltas(anchors_over_all_feature_maps, tgt_box.unsqueeze(1))
			
 
				+
			
 
				+            has_gt = (len(tgt_cls) > 0)
			
 
				+            if has_gt:
			
 
				+                if self.center_sampling_radius > 0:
			
 
				+                    # bbox centers: [N, 2]
			
 
				+                    centers = (tgt_box[..., :2] + tgt_box[..., 2:]) * 0.5
			
 
				+
			
 
				+                    is_in_boxes = []
			
 
				+                    for stride, anchors_i in zip(fpn_strides, anchors):
			
 
				+                        radius = stride * self.center_sampling_radius
			
 
				+                        # [N, 4]
			
 
				+                        center_boxes = torch.cat((
			
 
				+                            torch.max(centers - radius, tgt_box[:, :2]),
			
 
				+                            torch.min(centers + radius, tgt_box[:, 2:]),
			
 
				+                        ), dim=-1)
			
 
				+                        # [N, Mi, 4]
			
 
				+                        center_deltas = self.get_deltas(anchors_i, center_boxes.unsqueeze(1))
			
 
				+                        # [N, Mi]
			
 
				+                        is_in_boxes.append(center_deltas.min(dim=-1).values > 0)
			
 
				+                    # [N, M], M = M1 + M2 + ... + MF
			
 
				+                    is_in_boxes = torch.cat(is_in_boxes, dim=1)
			
 
				+                else:
			
 
				+                    # no center sampling, it will use all the locations within a ground-truth box
			
 
				+                    # [N, M], M = M1 + M2 + ... + MF
			
 
				+                    is_in_boxes = deltas.min(dim=-1).values > 0
			
 
				+                # [N, M], M = M1 + M2 + ... + MF
			
 
				+                max_deltas = deltas.max(dim=-1).values
			
 
				+                # limit the regression range for each location
			
 
				+                is_cared_in_the_level = \
			
 
				+                    (max_deltas >= object_sizes_of_interest[None, :, 0]) & \
			
 
				+                    (max_deltas <= object_sizes_of_interest[None, :, 1])
			
 
				+
			
 
				+                # [N,]
			
 
				+                tgt_box_area = (tgt_box[:, 2] - tgt_box[:, 0]) * (tgt_box[:, 3] - tgt_box[:, 1])
			
 
				+                # [N,] -> [N, 1] -> [N, M]
			
 
				+                gt_positions_area = tgt_box_area.unsqueeze(1).repeat(
			
 
				+                    1, anchors_over_all_feature_maps.size(0))
			
 
				+                gt_positions_area[~is_in_boxes] = math.inf
			
 
				+                gt_positions_area[~is_cared_in_the_level] = math.inf
			
 
				+
			
 
				+                # if there are still more than one objects for a position,
			
 
				+                # we choose the one with minimal area
			
 
				+                # [M,], each element is the index of ground-truth
			
 
				+                positions_min_area, gt_matched_idxs = gt_positions_area.min(dim=0)
			
 
				+
			
 
				+                # ground truth box regression
			
 
				+                # [M, 4]
			
 
				+                gt_anchors_reg_deltas_i = self.get_deltas(
			
 
				+                    anchors_over_all_feature_maps, tgt_box[gt_matched_idxs])
			
 
				+
			
 
				+                # [M,]
			
 
				+                tgt_cls_i = tgt_cls[gt_matched_idxs]
			
 
				+                # anchors with area inf are treated as background.
			
 
				+                tgt_cls_i[positions_min_area == math.inf] = self.num_classes
			
 
				+
			
 
				+                # ground truth centerness
			
 
				+                left_right = gt_anchors_reg_deltas_i[:, [0, 2]]
			
 
				+                top_bottom = gt_anchors_reg_deltas_i[:, [1, 3]]
			
 
				+                # [M,]
			
 
				+                gt_centerness_i = torch.sqrt(
			
 
				+                    (left_right.min(dim=-1).values / left_right.max(dim=-1).values).clamp_(min=0)
			
 
				+                    * (top_bottom.min(dim=-1).values / top_bottom.max(dim=-1).values).clamp_(min=0)
			
 
				+                )
			
 
				+
			
 
				+                gt_classes.append(tgt_cls_i)
			
 
				+                gt_anchors_deltas.append(gt_anchors_reg_deltas_i)
			
 
				+                gt_centerness.append(gt_centerness_i)
			
 
				+
			
 
				+                del centers, center_boxes, deltas, max_deltas, center_deltas
			
 
				+
			
 
				+            else:
			
 
				+                tgt_cls_i = torch.zeros(anchors_over_all_feature_maps.shape[0], device=device) + self.num_classes
			
 
				+                gt_anchors_reg_deltas_i = torch.zeros([anchors_over_all_feature_maps.shape[0], 4], device=device)
			
 
				+                gt_centerness_i = torch.zeros(anchors_over_all_feature_maps.shape[0], device=device)
			
 
				+
			
 
				+                gt_classes.append(tgt_cls_i.long())
			
 
				+                gt_anchors_deltas.append(gt_anchors_reg_deltas_i.float())
			
 
				+                gt_centerness.append(gt_centerness_i.float())
			
 
				+
			
 
				+
			
 
				+        # [B, M], [B, M, 4], [B, M]
			
 
				+        return torch.stack(gt_classes), torch.stack(gt_anchors_deltas), torch.stack(gt_centerness)
			
 
				+
			
 
				+
			
 
				+class AlignedOTAMatcher(object):
			
 
				+    """
			
 
				+    This code referenced to https://github.com/open-mmlab/mmyolo/models/task_modules/assigners/batch_dsl_assigner.py
			
 
				+    """
			
 
				+    def __init__(self, num_classes, soft_center_radius=3.0, topk_candidates=13):
			
 
				+        self.num_classes = num_classes
			
 
				+        self.soft_center_radius = soft_center_radius
			
 
				+        self.topk_candidates = topk_candidates
			
 
				+
			
 
				+    @torch.no_grad()
			
 
				+    def __call__(self, 
			
 
				+                 fpn_strides, 
			
 
				+                 anchors, 
			
 
				+                 pred_cls, 
			
 
				+                 pred_box,
			
 
				+                 gt_labels,
			
 
				+                 gt_bboxes):
			
 
				+        # [M,]
			
 
				+        strides = torch.cat([torch.ones_like(anchor_i[:, 0]) * stride_i
			
 
				+                                for stride_i, anchor_i in zip(fpn_strides, anchors)], dim=-1)
			
 
				+        # List[F, M, 2] -> [M, 2]
			
 
				+        num_gt = len(gt_labels)
			
 
				+        anchors = torch.cat(anchors, dim=0)
			
 
				+
			
 
				+        # check gt
			
 
				+        if num_gt == 0 or gt_bboxes.max().item() == 0.:
			
 
				+            return {
			
 
				+                'assigned_labels': gt_labels.new_full(pred_cls[..., 0].shape,
			
 
				+                                                      self.num_classes,
			
 
				+                                                      dtype=torch.long),
			
 
				+                'assigned_bboxes': gt_bboxes.new_full(pred_box.shape, 0),
			
 
				+                'assign_metrics': gt_bboxes.new_full(pred_cls[..., 0].shape, 0)
			
 
				+            }
			
 
				+        
			
 
				+        # get inside points: [N, M]
			
 
				+        is_in_gt = self.find_inside_points(gt_bboxes, anchors)
			
 
				+        valid_mask = is_in_gt.sum(dim=0) > 0  # [M,]
			
 
				+
			
 
				+        # ----------------------------------- soft center prior -----------------------------------
			
 
				+        gt_center = (gt_bboxes[..., :2] + gt_bboxes[..., 2:]) / 2.0
			
 
				+        distance = (anchors.unsqueeze(0) - gt_center.unsqueeze(1)
			
 
				+                    ).pow(2).sum(-1).sqrt() / strides.unsqueeze(0)  # [N, M]
			
 
				+        distance = distance * valid_mask.unsqueeze(0)
			
 
				+        soft_center_prior = torch.pow(10, distance - self.soft_center_radius)
			
 
				+
			
 
				+        # ----------------------------------- regression cost -----------------------------------
			
 
				+        pair_wise_ious, _ = box_iou(gt_bboxes, pred_box)  # [N, M]
			
 
				+        pair_wise_ious_loss = -torch.log(pair_wise_ious + 1e-8) * 3.0
			
 
				+
			
 
				+        # ----------------------------------- classification cost -----------------------------------
			
 
				+        ## select the predicted scores corresponded to the gt_labels
			
 
				+        pairwise_pred_scores = pred_cls.permute(1, 0)  # [M, C] -> [C, M]
			
 
				+        pairwise_pred_scores = pairwise_pred_scores[gt_labels.long(), :].float()   # [N, M]
			
 
				+        ## scale factor
			
 
				+        scale_factor = (pair_wise_ious - pairwise_pred_scores.sigmoid()).abs().pow(2.0)
			
 
				+        ## cls cost
			
 
				+        pair_wise_cls_loss = F.binary_cross_entropy_with_logits(
			
 
				+            pairwise_pred_scores, pair_wise_ious,
			
 
				+            reduction="none") * scale_factor # [N, M]
			
 
				+            
			
 
				+        del pairwise_pred_scores
			
 
				+
			
 
				+        ## foreground cost matrix
			
 
				+        cost_matrix = pair_wise_cls_loss + pair_wise_ious_loss + soft_center_prior
			
 
				+        max_pad_value = torch.ones_like(cost_matrix) * 1e9
			
 
				+        cost_matrix = torch.where(valid_mask[None].repeat(num_gt, 1),   # [N, M]
			
 
				+                                  cost_matrix, max_pad_value)
			
 
				+
			
 
				+        # ----------------------------------- dynamic label assignment -----------------------------------
			
 
				+        matched_pred_ious, matched_gt_inds, fg_mask_inboxes = self.dynamic_k_matching(
			
 
				+            cost_matrix, pair_wise_ious, num_gt)
			
 
				+        del pair_wise_cls_loss, cost_matrix, pair_wise_ious, pair_wise_ious_loss
			
 
				+
			
 
				+        # -----------------------------------process assigned labels -----------------------------------
			
 
				+        assigned_labels = gt_labels.new_full(pred_cls[..., 0].shape,
			
 
				+                                             self.num_classes)  # [M,]
			
 
				+        assigned_labels[fg_mask_inboxes] = gt_labels[matched_gt_inds].squeeze(-1)
			
 
				+        assigned_labels = assigned_labels.long()  # [M,]
			
 
				+
			
 
				+        assigned_bboxes = gt_bboxes.new_full(pred_box.shape, 0)        # [M, 4]
			
 
				+        assigned_bboxes[fg_mask_inboxes] = gt_bboxes[matched_gt_inds]  # [M, 4]
			
 
				+
			
 
				+        assign_metrics = gt_bboxes.new_full(pred_cls[..., 0].shape, 0) # [M,]
			
 
				+        assign_metrics[fg_mask_inboxes] = matched_pred_ious            # [M,]
			
 
				+
			
 
				+        assigned_dict = dict(
			
 
				+            assigned_labels=assigned_labels,
			
 
				+            assigned_bboxes=assigned_bboxes,
			
 
				+            assign_metrics=assign_metrics
			
 
				+            )
			
 
				+        
			
 
				+        return assigned_dict
			
 
				+
			
 
				+    def find_inside_points(self, gt_bboxes, anchors):
			
 
				+        """
			
 
				+            gt_bboxes: Tensor -> [N, 2]
			
 
				+            anchors:   Tensor -> [M, 2]
			
 
				+        """
			
 
				+        num_anchors = anchors.shape[0]
			
 
				+        num_gt = gt_bboxes.shape[0]
			
 
				+
			
 
				+        anchors_expand = anchors.unsqueeze(0).repeat(num_gt, 1, 1)           # [N, M, 2]
			
 
				+        gt_bboxes_expand = gt_bboxes.unsqueeze(1).repeat(1, num_anchors, 1)  # [N, M, 4]
			
 
				+
			
 
				+        # offset
			
 
				+        lt = anchors_expand - gt_bboxes_expand[..., :2]
			
 
				+        rb = gt_bboxes_expand[..., 2:] - anchors_expand
			
 
				+        bbox_deltas = torch.cat([lt, rb], dim=-1)
			
 
				+
			
 
				+        is_in_gts = bbox_deltas.min(dim=-1).values > 0
			
 
				+
			
 
				+        return is_in_gts
			
 
				+    
			
 
				+    def dynamic_k_matching(self, cost_matrix, pairwise_ious, num_gt):
			
 
				+        """Use IoU and matching cost to calculate the dynamic top-k positive
			
 
				+        targets.
			
 
				+
			
 
				+        Args:
			
 
				+            cost_matrix (Tensor): Cost matrix.
			
 
				+            pairwise_ious (Tensor): Pairwise iou matrix.
			
 
				+            num_gt (int): Number of gt.
			
 
				+            valid_mask (Tensor): Mask for valid bboxes.
			
 
				+        Returns:
			
 
				+            tuple: matched ious and gt indexes.
			
 
				+        """
			
 
				+        matching_matrix = torch.zeros_like(cost_matrix, dtype=torch.uint8)
			
 
				+        # select candidate topk ious for dynamic-k calculation
			
 
				+        candidate_topk = min(self.topk_candidates, pairwise_ious.size(1))
			
 
				+        topk_ious, _ = torch.topk(pairwise_ious, candidate_topk, dim=1)
			
 
				+        # calculate dynamic k for each gt
			
 
				+        dynamic_ks = torch.clamp(topk_ious.sum(1).int(), min=1)
			
 
				+
			
 
				+        # sorting the batch cost matirx is faster than topk
			
 
				+        _, sorted_indices = torch.sort(cost_matrix, dim=1)
			
 
				+        for gt_idx in range(num_gt):
			
 
				+            topk_ids = sorted_indices[gt_idx, :dynamic_ks[gt_idx]]
			
 
				+            matching_matrix[gt_idx, :][topk_ids] = 1
			
 
				+
			
 
				+        del topk_ious, dynamic_ks, topk_ids
			
 
				+
			
 
				+        prior_match_gt_mask = matching_matrix.sum(0) > 1
			
 
				+        if prior_match_gt_mask.sum() > 0:
			
 
				+            cost_min, cost_argmin = torch.min(
			
 
				+                cost_matrix[:, prior_match_gt_mask], dim=0)
			
 
				+            matching_matrix[:, prior_match_gt_mask] *= 0
			
 
				+            matching_matrix[cost_argmin, prior_match_gt_mask] = 1
			
 
				+
			
 
				+        # get foreground mask inside box and center prior
			
 
				+        fg_mask_inboxes = matching_matrix.sum(0) > 0
			
 
				+        matched_pred_ious = (matching_matrix *
			
 
				+                             pairwise_ious).sum(0)[fg_mask_inboxes]
			
 
				+        matched_gt_inds = matching_matrix[:, fg_mask_inboxes].argmax(0)
			
 
				+
			
 
				+        return matched_pred_ious, matched_gt_inds, fg_mask_inboxes
			
 
				+        
			
--- a/yolo/models/fcos/modules.py
+++ b/yolo/models/fcos/modules.py
@@ -0,0 +1,148 @@
 
				+import torch
			
 
				+import torch.nn as nn
			
 
				+from typing import List
			
 
				+
			
 
				+
			
 
				+# --------------------- Basic modules ---------------------
			
 
				+def get_conv2d(c1, c2, k, p, s, d, g, bias=False):
			
 
				+    conv = nn.Conv2d(c1, c2, k, stride=s, padding=p, dilation=d, groups=g, bias=bias)
			
 
				+
			
 
				+    return conv
			
 
				+
			
 
				+def get_activation(act_type=None):
			
 
				+    if act_type == 'relu':
			
 
				+        return nn.ReLU(inplace=True)
			
 
				+    elif act_type == 'lrelu':
			
 
				+        return nn.LeakyReLU(0.1, inplace=True)
			
 
				+    elif act_type == 'mish':
			
 
				+        return nn.Mish(inplace=True)
			
 
				+    elif act_type == 'silu':
			
 
				+        return nn.SiLU(inplace=True)
			
 
				+    elif act_type is None:
			
 
				+        return nn.Identity()
			
 
				+    else:
			
 
				+        raise NotImplementedError
			
 
				+        
			
 
				+def get_norm(norm_type, dim):
			
 
				+    if norm_type == 'BN':
			
 
				+        return nn.BatchNorm2d(dim)
			
 
				+    elif norm_type == 'GN':
			
 
				+        return nn.GroupNorm(num_groups=32, num_channels=dim)
			
 
				+    elif norm_type is None:
			
 
				+        return nn.Identity()
			
 
				+    else:
			
 
				+        raise NotImplementedError
			
 
				+
			
 
				+class BasicConv(nn.Module):
			
 
				+    def __init__(self, 
			
 
				+                 in_dim,                   # in channels
			
 
				+                 out_dim,                  # out channels 
			
 
				+                 kernel_size=1,            # kernel size 
			
 
				+                 padding=0,                # padding
			
 
				+                 stride=1,                 # padding
			
 
				+                 dilation=1,               # dilation
			
 
				+                 act_type  :str = 'lrelu', # activation
			
 
				+                 norm_type :str = 'BN',    # normalization
			
 
				+                 depthwise :bool = False
			
 
				+                ):
			
 
				+        super(BasicConv, self).__init__()
			
 
				+        self.depthwise = depthwise
			
 
				+        use_bias = False if norm_type is not None else True
			
 
				+        if not depthwise:
			
 
				+            self.conv = get_conv2d(in_dim, out_dim, k=kernel_size, p=padding, s=stride, d=dilation, g=1, bias=use_bias)
			
 
				+            self.norm = get_norm(norm_type, out_dim)
			
 
				+        else:
			
 
				+            self.conv1 = get_conv2d(in_dim, in_dim, k=kernel_size, p=padding, s=stride, d=dilation, g=in_dim, bias=use_bias)
			
 
				+            self.norm1 = get_norm(norm_type, in_dim)
			
 
				+            self.conv2 = get_conv2d(in_dim, out_dim, k=1, p=0, s=1, d=1, g=1)
			
 
				+            self.norm2 = get_norm(norm_type, out_dim)
			
 
				+        self.act  = get_activation(act_type)
			
 
				+
			
 
				+    def forward(self, x):
			
 
				+        if not self.depthwise:
			
 
				+            return self.act(self.norm(self.conv(x)))
			
 
				+        else:
			
 
				+            # Depthwise conv
			
 
				+            x = self.norm1(self.conv1(x))
			
 
				+            # Pointwise conv
			
 
				+            x = self.act(self.norm2(self.conv2(x)))
			
 
				+            return x
			
 
				+
			
 
				+
			
 
				+# --------------------- ResNet modules ---------------------
			
 
				+def conv3x3(in_planes, out_planes, stride=1):
			
 
				+    """3x3 convolution with padding"""
			
 
				+    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
			
 
				+                     padding=1, bias=False)
			
 
				+
			
 
				+def conv1x1(in_planes, out_planes, stride=1):
			
 
				+    """1x1 convolution"""
			
 
				+    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
			
 
				+
			
 
				+class BasicBlock(nn.Module):
			
 
				+    expansion = 1
			
 
				+
			
 
				+    def __init__(self, inplanes, planes, stride=1, downsample=None):
			
 
				+        super(BasicBlock, self).__init__()
			
 
				+        self.conv1 = conv3x3(inplanes, planes, stride)
			
 
				+        self.bn1 = nn.BatchNorm2d(planes)
			
 
				+        self.relu = nn.ReLU(inplace=True)
			
 
				+        self.conv2 = conv3x3(planes, planes)
			
 
				+        self.bn2 = nn.BatchNorm2d(planes)
			
 
				+        self.downsample = downsample
			
 
				+        self.stride = stride
			
 
				+
			
 
				+    def forward(self, x):
			
 
				+        identity = x
			
 
				+
			
 
				+        out = self.conv1(x)
			
 
				+        out = self.bn1(out)
			
 
				+        out = self.relu(out)
			
 
				+
			
 
				+        out = self.conv2(out)
			
 
				+        out = self.bn2(out)
			
 
				+
			
 
				+        if self.downsample is not None:
			
 
				+            identity = self.downsample(x)
			
 
				+
			
 
				+        out += identity
			
 
				+        out = self.relu(out)
			
 
				+
			
 
				+        return out
			
 
				+
			
 
				+class Bottleneck(nn.Module):
			
 
				+    expansion = 4
			
 
				+
			
 
				+    def __init__(self, inplanes, planes, stride=1, downsample=None):
			
 
				+        super(Bottleneck, self).__init__()
			
 
				+        self.conv1 = conv1x1(inplanes, planes)
			
 
				+        self.bn1 = nn.BatchNorm2d(planes)
			
 
				+        self.conv2 = conv3x3(planes, planes, stride)
			
 
				+        self.bn2 = nn.BatchNorm2d(planes)
			
 
				+        self.conv3 = conv1x1(planes, planes * self.expansion)
			
 
				+        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
			
 
				+        self.relu = nn.ReLU(inplace=True)
			
 
				+        self.downsample = downsample
			
 
				+        self.stride = stride
			
 
				+
			
 
				+    def forward(self, x):
			
 
				+        identity = x
			
 
				+
			
 
				+        out = self.conv1(x)
			
 
				+        out = self.bn1(out)
			
 
				+        out = self.relu(out)
			
 
				+
			
 
				+        out = self.conv2(out)
			
 
				+        out = self.bn2(out)
			
 
				+        out = self.relu(out)
			
 
				+
			
 
				+        out = self.conv3(out)
			
 
				+        out = self.bn3(out)
			
 
				+
			
 
				+        if self.downsample is not None:
			
 
				+            identity = self.downsample(x)
			
 
				+
			
 
				+        out += identity
			
 
				+        out = self.relu(out)
			
 
				+
			
 
				+        return out
			
--- a/yolo/models/fcos/resnet.py
+++ b/yolo/models/fcos/resnet.py
@@ -0,0 +1,187 @@
 
				+import torch
			
 
				+import torch.nn as nn
			
 
				+import torch.utils.model_zoo as model_zoo
			
 
				+
			
 
				+try:
			
 
				+    from .modules import conv1x1, BasicBlock, Bottleneck
			
 
				+except:
			
 
				+    from  modules import conv1x1, BasicBlock, Bottleneck
			
 
				+
			
 
				+__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
			
 
				+           'resnet152']
			
 
				+
			
 
				+
			
 
				+model_urls = {
			
 
				+    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
			
 
				+    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
			
 
				+    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
			
 
				+    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
			
 
				+    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
			
 
				+}
			
 
				+
			
 
				+
			
 
				+# --------------------- ResNet -----------------------
			
 
				+class ResNet(nn.Module):
			
 
				+
			
 
				+    def __init__(self, block, layers, zero_init_residual=False):
			
 
				+        super(ResNet, self).__init__()
			
 
				+        self.inplanes = 64
			
 
				+        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
			
 
				+                               bias=False)
			
 
				+        self.bn1 = nn.BatchNorm2d(64)
			
 
				+        self.relu = nn.ReLU(inplace=True)
			
 
				+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
			
 
				+        self.layer1 = self._make_layer(block, 64, layers[0])
			
 
				+        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
			
 
				+        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
			
 
				+        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
			
 
				+
			
 
				+        for m in self.modules():
			
 
				+            if isinstance(m, nn.Conv2d):
			
 
				+                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
			
 
				+            elif isinstance(m, nn.BatchNorm2d):
			
 
				+                nn.init.constant_(m.weight, 1)
			
 
				+                nn.init.constant_(m.bias, 0)
			
 
				+
			
 
				+        # Zero-initialize the last BN in each residual branch,
			
 
				+        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
			
 
				+        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
			
 
				+        if zero_init_residual:
			
 
				+            for m in self.modules():
			
 
				+                if isinstance(m, Bottleneck):
			
 
				+                    nn.init.constant_(m.bn3.weight, 0)
			
 
				+                elif isinstance(m, BasicBlock):
			
 
				+                    nn.init.constant_(m.bn2.weight, 0)
			
 
				+
			
 
				+    def _make_layer(self, block, planes, blocks, stride=1):
			
 
				+        downsample = None
			
 
				+        if stride != 1 or self.inplanes != planes * block.expansion:
			
 
				+            downsample = nn.Sequential(
			
 
				+                conv1x1(self.inplanes, planes * block.expansion, stride),
			
 
				+                nn.BatchNorm2d(planes * block.expansion),
			
 
				+            )
			
 
				+
			
 
				+        layers = []
			
 
				+        layers.append(block(self.inplanes, planes, stride, downsample))
			
 
				+        self.inplanes = planes * block.expansion
			
 
				+        for _ in range(1, blocks):
			
 
				+            layers.append(block(self.inplanes, planes))
			
 
				+
			
 
				+        return nn.Sequential(*layers)
			
 
				+
			
 
				+    def forward(self, x):
			
 
				+        """
			
 
				+        Input:
			
 
				+            x: (Tensor) -> [B, C, H, W]
			
 
				+        Output:
			
 
				+            c5: (Tensor) -> [B, C, H/32, W/32]
			
 
				+        """
			
 
				+        c1 = self.conv1(x)     # [B, C, H/2, W/2]
			
 
				+        c1 = self.bn1(c1)      # [B, C, H/2, W/2]
			
 
				+        c1 = self.relu(c1)     # [B, C, H/2, W/2]
			
 
				+        c2 = self.maxpool(c1)  # [B, C, H/4, W/4]
			
 
				+
			
 
				+        c2 = self.layer1(c2)   # [B, C, H/4, W/4]
			
 
				+        c3 = self.layer2(c2)   # [B, C, H/8, W/8]
			
 
				+        c4 = self.layer3(c3)   # [B, C, H/16, W/16]
			
 
				+        c5 = self.layer4(c4)   # [B, C, H/32, W/32]
			
 
				+
			
 
				+        return c5
			
 
				+
			
 
				+
			
 
				+# --------------------- Functions -----------------------
			
 
				+def build_resnet(model_name="resnet18", pretrained=False):
			
 
				+    if model_name == 'resnet18':
			
 
				+        model = resnet18(pretrained)
			
 
				+        feat_dim = 512
			
 
				+    elif model_name == 'resnet34':
			
 
				+        model = resnet34(pretrained)
			
 
				+        feat_dim = 512
			
 
				+    elif model_name == 'resnet50':
			
 
				+        model = resnet50(pretrained)
			
 
				+        feat_dim = 2048
			
 
				+    elif model_name == 'resnet101':
			
 
				+        model = resnet34(pretrained)
			
 
				+        feat_dim = 2048
			
 
				+    else:
			
 
				+        raise NotImplementedError("Unknown resnet: {}".format(model_name))
			
 
				+    
			
 
				+    return model, feat_dim
			
 
				+
			
 
				+def resnet18(pretrained=False, **kwargs):
			
 
				+    """Constructs a ResNet-18 model.
			
 
				+
			
 
				+    Args:
			
 
				+        pretrained (bool): If True, returns a model pre-trained on ImageNet
			
 
				+    """
			
 
				+    model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
			
 
				+    if pretrained:
			
 
				+        # strict = False as we don't need fc layer params.
			
 
				+        model.load_state_dict(model_zoo.load_url(model_urls['resnet18']), strict=False)
			
 
				+    return model
			
 
				+
			
 
				+def resnet34(pretrained=False, **kwargs):
			
 
				+    """Constructs a ResNet-34 model.
			
 
				+
			
 
				+    Args:
			
 
				+        pretrained (bool): If True, returns a model pre-trained on ImageNet
			
 
				+    """
			
 
				+    model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
			
 
				+    if pretrained:
			
 
				+        model.load_state_dict(model_zoo.load_url(model_urls['resnet34']), strict=False)
			
 
				+    return model
			
 
				+
			
 
				+def resnet50(pretrained=False, **kwargs):
			
 
				+    """Constructs a ResNet-50 model.
			
 
				+
			
 
				+    Args:
			
 
				+        pretrained (bool): If True, returns a model pre-trained on ImageNet
			
 
				+    """
			
 
				+    model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
			
 
				+    if pretrained:
			
 
				+        model.load_state_dict(model_zoo.load_url(model_urls['resnet50']), strict=False)
			
 
				+    return model
			
 
				+
			
 
				+def resnet101(pretrained=False, **kwargs):
			
 
				+    """Constructs a ResNet-101 model.
			
 
				+
			
 
				+    Args:
			
 
				+        pretrained (bool): If True, returns a model pre-trained on ImageNet
			
 
				+    """
			
 
				+    model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
			
 
				+    if pretrained:
			
 
				+        model.load_state_dict(model_zoo.load_url(model_urls['resnet101']), strict=False)
			
 
				+    return model
			
 
				+
			
 
				+def resnet152(pretrained=False, **kwargs):
			
 
				+    """Constructs a ResNet-152 model.
			
 
				+
			
 
				+    Args:
			
 
				+        pretrained (bool): If True, returns a model pre-trained on ImageNet
			
 
				+    """
			
 
				+    model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
			
 
				+    if pretrained:
			
 
				+        model.load_state_dict(model_zoo.load_url(model_urls['resnet152']), strict=False)
			
 
				+    return model
			
 
				+
			
 
				+
			
 
				+if __name__=='__main__':
			
 
				+    import time
			
 
				+    from thop import profile
			
 
				+
			
 
				+    # Build backbone
			
 
				+    model, _ = build_resnet(model_name='resnet18')
			
 
				+
			
 
				+    # Inference
			
 
				+    x = torch.randn(1, 3, 640, 640)
			
 
				+    t0 = time.time()
			
 
				+    output = model(x)
			
 
				+    t1 = time.time()
			
 
				+    print('Time: ', t1 - t0)
			
 
				+    print(output.shape)
			
 
				+
			
 
				+    print('==============================')
			
 
				+    flops, params = profile(model, inputs=(x, ), verbose=False)
			
 
				+    print('==============================')
			
 
				+    print('GFLOPs : {:.2f}'.format(flops / 1e9 * 2))
			
 
				+    print('Params : {:.2f} M'.format(params / 1e6))    
			
--- a/yolo/models/gelan/build.py
+++ b/yolo/models/gelan/build.py
@@ -1,24 +0,0 @@
 
				-import torch.nn as nn
			
 
				-
			
 
				-from .loss import SetCriterion
			
 
				-from .gelan import GElan
			
 
				-
			
 
				-
			
 
				-# build object detector
			
 
				-def build_gelan(cfg, is_val=False):
			
 
				-    # -------------- Build YOLO --------------
			
 
				-    model = GElan(cfg, is_val, deploy=False)
			
 
				-
			
 
				-    # -------------- Initialize YOLO --------------
			
 
				-    for m in model.modules():
			
 
				-        if isinstance(m, nn.BatchNorm2d):
			
 
				-            m.eps = 1e-3
			
 
				-            m.momentum = 0.03    
			
 
				-            
			
 
				-    # -------------- Build criterion --------------
			
 
				-    criterion = None
			
 
				-    if is_val:
			
 
				-        # build criterion for training
			
 
				-        criterion = SetCriterion(cfg)
			
 
				-        
			
 
				-    return model, criterion
			
--- a/yolo/models/gelan/gelan.py
+++ b/yolo/models/gelan/gelan.py
@@ -1,165 +0,0 @@
 
				-# --------------- Torch components ---------------
			
 
				-import torch
			
 
				-import torch.nn as nn
			
 
				-
			
 
				-# --------------- Model components ---------------
			
 
				-from .gelan_backbone import build_backbone
			
 
				-from .gelan_neck     import SPPElan
			
 
				-from .gelan_pafpn    import GElanPaFPN
			
 
				-from .gelan_head     import GElanDetHead
			
 
				-from .gelan_pred     import GElanPredLayer
			
 
				-
			
 
				-# --------------- External components ---------------
			
 
				-from utils.misc import multiclass_nms
			
 
				-
			
 
				-
			
 
				-# G-ELAN proposed by YOLOv9
			
 
				-class GElan(nn.Module):
			
 
				-    def __init__(self,
			
 
				-                 cfg,
			
 
				-                 is_val = False,
			
 
				-                 deploy = False,
			
 
				-                 ) -> None:
			
 
				-        super(GElan, self).__init__()
			
 
				-        # ---------------------- Basic setting ----------------------
			
 
				-        self.cfg = cfg
			
 
				-        self.deploy = deploy
			
 
				-        self.num_classes = cfg.num_classes
			
 
				-        ## Post-process parameters
			
 
				-        self.topk_candidates = cfg.val_topk        if is_val else cfg.test_topk
			
 
				-        self.conf_thresh     = cfg.val_conf_thresh if is_val else cfg.test_conf_thresh
			
 
				-        self.nms_thresh      = cfg.val_nms_thresh  if is_val else cfg.test_nms_thresh
			
 
				-        self.no_multi_labels = False if is_val else True
			
 
				-        
			
 
				-        # ---------------------- Network Parameters ----------------------
			
 
				-        ## Backbone
			
 
				-        self.backbone = build_backbone(cfg)
			
 
				-        self.neck     = SPPElan(cfg, self.backbone.feat_dims[-1])
			
 
				-        self.backbone.feat_dims[-1] = self.neck.out_dim
			
 
				-        ## PaFPN
			
 
				-        self.fpn      = GElanPaFPN(cfg, self.backbone.feat_dims)
			
 
				-        ## Detection head
			
 
				-        self.head     = GElanDetHead(cfg, self.fpn.out_dims)
			
 
				-        self.pred     = GElanPredLayer(cfg, self.head.cls_head_dim, self.head.reg_head_dim)
			
 
				-
			
 
				-    def switch_to_deploy(self,):
			
 
				-        for m in self.modules():
			
 
				-            if hasattr(m, "fuse_convs"):
			
 
				-                m.fuse_convs()
			
 
				-
			
 
				-    def post_process(self, cls_preds, box_preds):
			
 
				-        """
			
 
				-        Input:
			
 
				-            cls_preds: List[torch.Tensor] -> [[B, M, C], ...], B=1
			
 
				-            box_preds: List[torch.Tensor] -> [[B, M, 4], ...], B=1
			
 
				-        Output:
			
 
				-            bboxes: np.array -> [N, 4]
			
 
				-            scores: np.array -> [N,]
			
 
				-            labels: np.array -> [N,]
			
 
				-        """
			
 
				-        all_scores = []
			
 
				-        all_labels = []
			
 
				-        all_bboxes = []
			
 
				-        
			
 
				-        for cls_pred_i, box_pred_i in zip(cls_preds, box_preds):
			
 
				-            cls_pred_i = cls_pred_i[0]
			
 
				-            box_pred_i = box_pred_i[0]
			
 
				-            if self.no_multi_labels:
			
 
				-                # [M,]
			
 
				-                scores, labels = torch.max(cls_pred_i.sigmoid(), dim=1)
			
 
				-
			
 
				-                # Keep top k top scoring indices only.
			
 
				-                num_topk = min(self.topk_candidates, box_pred_i.size(0))
			
 
				-
			
 
				-                # topk candidates
			
 
				-                predicted_prob, topk_idxs = scores.sort(descending=True)
			
 
				-                topk_scores = predicted_prob[:num_topk]
			
 
				-                topk_idxs = topk_idxs[:num_topk]
			
 
				-
			
 
				-                # filter out the proposals with low confidence score
			
 
				-                keep_idxs = topk_scores > self.conf_thresh
			
 
				-                scores = topk_scores[keep_idxs]
			
 
				-                topk_idxs = topk_idxs[keep_idxs]
			
 
				-
			
 
				-                labels = labels[topk_idxs]
			
 
				-                bboxes = box_pred_i[topk_idxs]
			
 
				-            else:
			
 
				-                # [M, C] -> [MC,]
			
 
				-                scores_i = cls_pred_i.sigmoid().flatten()
			
 
				-
			
 
				-                # Keep top k top scoring indices only.
			
 
				-                num_topk = min(self.topk_candidates, box_pred_i.size(0))
			
 
				-
			
 
				-                # torch.sort is actually faster than .topk (at least on GPUs)
			
 
				-                predicted_prob, topk_idxs = scores_i.sort(descending=True)
			
 
				-                topk_scores = predicted_prob[:num_topk]
			
 
				-                topk_idxs = topk_idxs[:num_topk]
			
 
				-
			
 
				-                # filter out the proposals with low confidence score
			
 
				-                keep_idxs = topk_scores > self.conf_thresh
			
 
				-                scores = topk_scores[keep_idxs]
			
 
				-                topk_idxs = topk_idxs[keep_idxs]
			
 
				-
			
 
				-                anchor_idxs = torch.div(topk_idxs, self.num_classes, rounding_mode='floor')
			
 
				-                labels = topk_idxs % self.num_classes
			
 
				-
			
 
				-                bboxes = box_pred_i[anchor_idxs]
			
 
				-
			
 
				-            all_scores.append(scores)
			
 
				-            all_labels.append(labels)
			
 
				-            all_bboxes.append(bboxes)
			
 
				-
			
 
				-        scores = torch.cat(all_scores, dim=0)
			
 
				-        labels = torch.cat(all_labels, dim=0)
			
 
				-        bboxes = torch.cat(all_bboxes, dim=0)
			
 
				-
			
 
				-        # to cpu & numpy
			
 
				-        scores = scores.cpu().numpy()
			
 
				-        labels = labels.cpu().numpy()
			
 
				-        bboxes = bboxes.cpu().numpy()
			
 
				-
			
 
				-        # nms
			
 
				-        scores, labels, bboxes = multiclass_nms(
			
 
				-            scores, labels, bboxes, self.nms_thresh, self.num_classes)
			
 
				-
			
 
				-        return bboxes, scores, labels
			
 
				-    
			
 
				-    def forward(self, x):
			
 
				-        # ---------------- Backbone ----------------
			
 
				-        pyramid_feats = self.backbone(x)
			
 
				-        # ---------------- Neck: SPP ----------------
			
 
				-        pyramid_feats[-1] = self.neck(pyramid_feats[-1])
			
 
				-
			
 
				-        # ---------------- Neck: PaFPN ----------------
			
 
				-        pyramid_feats = self.fpn(pyramid_feats)
			
 
				-
			
 
				-        # ---------------- Heads ----------------
			
 
				-        cls_feats, reg_feats = self.head(pyramid_feats)
			
 
				-
			
 
				-        # ---------------- Preds ----------------
			
 
				-        outputs = self.pred(cls_feats, reg_feats)
			
 
				-        outputs['image_size'] = [x.shape[2], x.shape[3]]
			
 
				-
			
 
				-        if not self.training:
			
 
				-            all_cls_preds = outputs['pred_cls']
			
 
				-            all_box_preds = outputs['pred_box']
			
 
				-
			
 
				-            if self.deploy:
			
 
				-                cls_preds = torch.cat(all_cls_preds, dim=1)[0]
			
 
				-                box_preds = torch.cat(all_box_preds, dim=1)[0]
			
 
				-                scores = cls_preds.sigmoid()
			
 
				-                bboxes = box_preds
			
 
				-                # [n_anchors_all, 4 + C]
			
 
				-                outputs = torch.cat([bboxes, scores], dim=-1)
			
 
				-
			
 
				-            else:
			
 
				-                # post process
			
 
				-                bboxes, scores, labels = self.post_process(all_cls_preds, all_box_preds)
			
 
				-                outputs = {
			
 
				-                    "scores": scores,
			
 
				-                    "labels": labels,
			
 
				-                    "bboxes": bboxes
			
 
				-                }
			
 
				-        
			
 
				-        return outputs
			
 
				-    
			
--- a/yolo/models/gelan/gelan_backbone.py
+++ b/yolo/models/gelan/gelan_backbone.py
@@ -1,198 +0,0 @@
 
				-import torch
			
 
				-import torch.nn as nn
			
 
				-
			
 
				-try:
			
 
				-    from .gelan_basic import BasicConv, RepGElanLayer, ADown
			
 
				-except:
			
 
				-    from  gelan_basic import BasicConv, RepGElanLayer, ADown
			
 
				-
			
 
				-# IN1K pretrained weight
			
 
				-pretrained_urls = {
			
 
				-    's': "https://github.com/yjh0410/YOLO-Tutorial-v2/releases/download/in1k_pretrained_weight/gelan_s_in1k_68.4.pth",
			
 
				-    'c': "https://github.com/yjh0410/YOLO-Tutorial-v2/releases/download/in1k_pretrained_weight/gelan_c_in1k_76.7.pth",
			
 
				-}
			
 
				-
			
 
				-# ----------------- GELAN backbone proposed by YOLOv9 -----------------
			
 
				-class GElanBackbone(nn.Module):
			
 
				-    def __init__(self, cfg):
			
 
				-        super(GElanBackbone, self).__init__()
			
 
				-        # ---------- Basic setting ----------
			
 
				-        self.model_scale = cfg.scale
			
 
				-        self.feat_dims = [cfg.backbone_feats["c1"][-1],  # 64
			
 
				-                          cfg.backbone_feats["c2"][-1],  # 128
			
 
				-                          cfg.backbone_feats["c3"][-1],  # 256
			
 
				-                          cfg.backbone_feats["c4"][-1],  # 512
			
 
				-                          cfg.backbone_feats["c5"][-1],  # 512
			
 
				-                          ]
			
 
				-        
			
 
				-        # ---------- Network setting ----------
			
 
				-        ## P1/2
			
 
				-        self.layer_1 = BasicConv(3, cfg.backbone_feats["c1"][0],
			
 
				-                                 kernel_size=3, padding=1, stride=2,
			
 
				-                                 act_type=cfg.bk_act, norm_type=cfg.bk_norm, depthwise=cfg.bk_depthwise)
			
 
				-        # P2/4
			
 
				-        self.layer_2 = nn.Sequential(
			
 
				-            BasicConv(cfg.backbone_feats["c1"][0], cfg.backbone_feats["c2"][0],
			
 
				-                      kernel_size=3, padding=1, stride=2,
			
 
				-                      act_type=cfg.bk_act, norm_type=cfg.bk_norm, depthwise=cfg.bk_depthwise),
			
 
				-            RepGElanLayer(in_dim     = cfg.backbone_feats["c2"][0],
			
 
				-                          inter_dims = cfg.backbone_feats["c2"][1],
			
 
				-                          out_dim    = cfg.backbone_feats["c2"][2],
			
 
				-                          num_blocks = cfg.backbone_depth,
			
 
				-                          shortcut   = True,
			
 
				-                          act_type   = cfg.bk_act,
			
 
				-                          norm_type  = cfg.bk_norm,
			
 
				-                          depthwise  = cfg.bk_depthwise)
			
 
				-        )
			
 
				-        # P3/8
			
 
				-        self.layer_3 = nn.Sequential(
			
 
				-            ADown(cfg.backbone_feats["c2"][2], cfg.backbone_feats["c3"][0],
			
 
				-                  act_type=cfg.bk_act, norm_type=cfg.bk_norm, depthwise=cfg.bk_depthwise),
			
 
				-            RepGElanLayer(in_dim     = cfg.backbone_feats["c3"][0],
			
 
				-                          inter_dims = cfg.backbone_feats["c3"][1],
			
 
				-                          out_dim    = cfg.backbone_feats["c3"][2],
			
 
				-                          num_blocks = cfg.backbone_depth,
			
 
				-                          shortcut   = True,
			
 
				-                          act_type   = cfg.bk_act,
			
 
				-                          norm_type  = cfg.bk_norm,
			
 
				-                          depthwise  = cfg.bk_depthwise)
			
 
				-        )
			
 
				-        # P4/16
			
 
				-        self.layer_4 = nn.Sequential(
			
 
				-            ADown(cfg.backbone_feats["c3"][2], cfg.backbone_feats["c4"][0],
			
 
				-                  act_type=cfg.bk_act, norm_type=cfg.bk_norm, depthwise=cfg.bk_depthwise),
			
 
				-            RepGElanLayer(in_dim     = cfg.backbone_feats["c4"][0],
			
 
				-                          inter_dims = cfg.backbone_feats["c4"][1],
			
 
				-                          out_dim    = cfg.backbone_feats["c4"][2],
			
 
				-                          num_blocks = cfg.backbone_depth,
			
 
				-                          shortcut   = True,
			
 
				-                          act_type   = cfg.bk_act,
			
 
				-                          norm_type  = cfg.bk_norm,
			
 
				-                          depthwise  = cfg.bk_depthwise)
			
 
				-        )
			
 
				-        # P5/32
			
 
				-        self.layer_5 = nn.Sequential(
			
 
				-            ADown(cfg.backbone_feats["c4"][2], cfg.backbone_feats["c5"][0],
			
 
				-                  act_type=cfg.bk_act, norm_type=cfg.bk_norm, depthwise=cfg.bk_depthwise),
			
 
				-            RepGElanLayer(in_dim     = cfg.backbone_feats["c5"][0],
			
 
				-                          inter_dims = cfg.backbone_feats["c5"][1],
			
 
				-                          out_dim    = cfg.backbone_feats["c5"][2],
			
 
				-                          num_blocks = cfg.backbone_depth,
			
 
				-                          shortcut   = True,
			
 
				-                          act_type   = cfg.bk_act,
			
 
				-                          norm_type  = cfg.bk_norm,
			
 
				-                          depthwise  = cfg.bk_depthwise)
			
 
				-        )
			
 
				-
			
 
				-        # Initialize all layers
			
 
				-        self.init_weights()
			
 
				-
			
 
				-        # Load imagenet pretrained weight
			
 
				-        if cfg.use_pretrained:
			
 
				-            self.load_pretrained()
			
 
				-
			
 
				-    def init_weights(self):
			
 
				-        """Initialize the parameters."""
			
 
				-        for m in self.modules():
			
 
				-            if isinstance(m, torch.nn.Conv2d):
			
 
				-                m.reset_parameters()
			
 
				-
			
 
				-    def load_pretrained(self):
			
 
				-        url = pretrained_urls[self.model_scale]
			
 
				-        if url is not None:
			
 
				-            print('Loading backbone pretrained weight from : {}'.format(url))
			
 
				-            # checkpoint state dict
			
 
				-            checkpoint = torch.hub.load_state_dict_from_url(
			
 
				-                url=url, map_location="cpu", check_hash=True)
			
 
				-            checkpoint_state_dict = checkpoint.pop("model")
			
 
				-            # model state dict
			
 
				-            model_state_dict = self.state_dict()
			
 
				-            # check
			
 
				-            for k in list(checkpoint_state_dict.keys()):
			
 
				-                if k in model_state_dict:
			
 
				-                    shape_model = tuple(model_state_dict[k].shape)
			
 
				-                    shape_checkpoint = tuple(checkpoint_state_dict[k].shape)
			
 
				-                    if shape_model != shape_checkpoint:
			
 
				-                        checkpoint_state_dict.pop(k)
			
 
				-                else:
			
 
				-                    checkpoint_state_dict.pop(k)
			
 
				-                    print('Unused key: ', k)
			
 
				-            # load the weight
			
 
				-            self.load_state_dict(checkpoint_state_dict)
			
 
				-        else:
			
 
				-            print('No pretrained weight for model scale: {}.'.format(self.model_scale))
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        c1 = self.layer_1(x)
			
 
				-        c2 = self.layer_2(c1)
			
 
				-        c3 = self.layer_3(c2)
			
 
				-        c4 = self.layer_4(c3)
			
 
				-        c5 = self.layer_5(c4)
			
 
				-        outputs = [c3, c4, c5]
			
 
				-
			
 
				-        return outputs
			
 
				-
			
 
				-
			
 
				-# ------------ Functions ------------
			
 
				-def build_backbone(cfg): 
			
 
				-    # model
			
 
				-    if   cfg.backbone == "gelan":
			
 
				-        backbone = GElanBackbone(cfg)
			
 
				-    else:
			
 
				-        raise NotImplementedError("Unknown gelan backbone: {}".format(cfg.backbone))
			
 
				-        
			
 
				-    return backbone
			
 
				-
			
 
				-
			
 
				-if __name__ == '__main__':
			
 
				-    import time
			
 
				-    from thop import profile
			
 
				-    class BaseConfig(object):
			
 
				-        def __init__(self) -> None:
			
 
				-            self.backbone = 'gelan'
			
 
				-            self.use_pretrained = True
			
 
				-            self.bk_act = 'silu'
			
 
				-            self.bk_norm = 'BN'
			
 
				-            self.bk_depthwise = False
			
 
				-            # # Gelan-C scale
			
 
				-            # self.backbone_feats = {
			
 
				-            #     "c1": [64],
			
 
				-            #     "c2": [128, [128, 64], 256],
			
 
				-            #     "c3": [256, [256, 128], 512],
			
 
				-            #     "c4": [512, [512, 256], 512],
			
 
				-            #     "c5": [512, [512, 256], 512],
			
 
				-            # }
			
 
				-            # self.scale = "l"
			
 
				-            # self.backbone_depth = 1
			
 
				-            # Gelan-S scale
			
 
				-            self.backbone_feats = {
			
 
				-                "c1": [32],
			
 
				-                "c2": [64,  [64, 32],   64],
			
 
				-                "c3": [64,  [64, 32],   128],
			
 
				-                "c4": [128, [128, 64],  256],
			
 
				-                "c5": [256, [256, 128], 256],
			
 
				-            }
			
 
				-            self.scale = "s"
			
 
				-            self.backbone_depth = 3
			
 
				-    # 定义模型配置文件
			
 
				-    cfg = BaseConfig()
			
 
				-
			
 
				-    # 构建GELAN主干网络
			
 
				-    model = build_backbone(cfg)
			
 
				-
			
 
				-    # 随机生成输入数据
			
 
				-    x = torch.randn(1, 3, 640, 640)
			
 
				-
			
 
				-    # 前向推理
			
 
				-    outputs = model(x)
			
 
				-
			
 
				-    # 打印输出中的shape
			
 
				-    for out in outputs:
			
 
				-        print(out.shape)
			
 
				-
			
 
				-    # 计算模型的参数量和理论计算量
			
 
				-    print('============ Params & FLOPs ============')
			
 
				-    flops, params = profile(model, inputs=(x, ), verbose=False)
			
 
				-    print('GFLOPs : {:.2f}'.format(flops / 1e9 * 2))
			
 
				-    print('Params : {:.2f} M'.format(params / 1e6))
			
 
				-    
			
--- a/yolo/models/gelan/gelan_basic.py
+++ b/yolo/models/gelan/gelan_basic.py
@@ -1,312 +0,0 @@
 
				-import numpy as np
			
 
				-import torch
			
 
				-import torch.nn as nn
			
 
				-from typing import List
			
 
				-
			
 
				-
			
 
				-# --------------------- Basic modules ---------------------
			
 
				-def get_conv2d(c1, c2, k, p, s, d, g, bias=False):
			
 
				-    conv = nn.Conv2d(c1, c2, k, stride=s, padding=p, dilation=d, groups=g, bias=bias)
			
 
				-
			
 
				-    return conv
			
 
				-
			
 
				-def get_activation(act_type=None):
			
 
				-    if act_type == 'relu':
			
 
				-        return nn.ReLU(inplace=True)
			
 
				-    elif act_type == 'lrelu':
			
 
				-        return nn.LeakyReLU(0.1, inplace=True)
			
 
				-    elif act_type == 'mish':
			
 
				-        return nn.Mish(inplace=True)
			
 
				-    elif act_type == 'silu':
			
 
				-        return nn.SiLU(inplace=True)
			
 
				-    elif act_type is None:
			
 
				-        return nn.Identity()
			
 
				-    else:
			
 
				-        raise NotImplementedError
			
 
				-        
			
 
				-def get_norm(norm_type, dim):
			
 
				-    if norm_type == 'BN':
			
 
				-        return nn.BatchNorm2d(dim)
			
 
				-    elif norm_type == 'GN':
			
 
				-        return nn.GroupNorm(num_groups=32, num_channels=dim)
			
 
				-    elif norm_type is None:
			
 
				-        return nn.Identity()
			
 
				-    else:
			
 
				-        raise NotImplementedError
			
 
				-
			
 
				-class BasicConv(nn.Module):
			
 
				-    def __init__(self, 
			
 
				-                 in_dim,                   # in channels
			
 
				-                 out_dim,                  # out channels 
			
 
				-                 kernel_size=1,            # kernel size 
			
 
				-                 padding=0,                # padding
			
 
				-                 stride=1,                 # padding
			
 
				-                 dilation=1,               # dilation
			
 
				-                 group=1,                  # group
			
 
				-                 act_type  :str = 'lrelu', # activation
			
 
				-                 norm_type :str = 'BN',    # normalization
			
 
				-                 depthwise :bool = False
			
 
				-                ):
			
 
				-        super(BasicConv, self).__init__()
			
 
				-        self.depthwise = depthwise
			
 
				-        if not depthwise:
			
 
				-            self.conv = get_conv2d(in_dim, out_dim, k=kernel_size, p=padding, s=stride, d=dilation, g=group)
			
 
				-            self.norm = get_norm(norm_type, out_dim)
			
 
				-        else:
			
 
				-            self.conv1 = get_conv2d(in_dim, in_dim, k=kernel_size, p=padding, s=stride, d=dilation, g=in_dim)
			
 
				-            self.norm1 = get_norm(norm_type, in_dim)
			
 
				-            self.conv2 = get_conv2d(in_dim, out_dim, k=1, p=0, s=1, d=1, g=1)
			
 
				-            self.norm2 = get_norm(norm_type, out_dim)
			
 
				-        self.act  = get_activation(act_type)
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        if not self.depthwise:
			
 
				-            return self.act(self.norm(self.conv(x)))
			
 
				-        else:
			
 
				-            # Depthwise conv
			
 
				-            x = self.norm1(self.conv1(x))
			
 
				-            # Pointwise conv
			
 
				-            x = self.act(self.norm2(self.conv2(x)))
			
 
				-            return x
			
 
				-
			
 
				-
			
 
				-# --------------------- GELAN modules (from yolov9) ---------------------
			
 
				-class ADown(nn.Module):
			
 
				-    def __init__(self, in_dim, out_dim, act_type="silu", norm_type="BN", depthwise=False):
			
 
				-        super().__init__()
			
 
				-        inter_dim = out_dim // 2
			
 
				-        self.conv_layer_1 = BasicConv(in_dim // 2, inter_dim,
			
 
				-                                    kernel_size=3, padding=1, stride=2,
			
 
				-                                    act_type=act_type, norm_type=norm_type, depthwise=depthwise)
			
 
				-        self.conv_layer_2 = BasicConv(in_dim // 2, inter_dim, kernel_size=1,
			
 
				-                                    act_type=act_type, norm_type=norm_type, depthwise=depthwise)
			
 
				-    def forward(self, x):
			
 
				-        x = torch.nn.functional.avg_pool2d(x, 2, 1, 0, False, True)
			
 
				-        x1,x2 = x.chunk(2, 1)
			
 
				-        x1 = self.conv_layer_1(x1)
			
 
				-        x2 = torch.nn.functional.max_pool2d(x2, 3, 2, 1)
			
 
				-        x2 = self.conv_layer_2(x2)
			
 
				-
			
 
				-        return torch.cat((x1, x2), 1)
			
 
				-
			
 
				-class RepConvN(nn.Module):
			
 
				-    """RepConv is a basic rep-style block, including training and deploy status
			
 
				-    This code is based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py
			
 
				-    """
			
 
				-    def __init__(self, in_dim, out_dim, k=3, s=1, p=1, g=1, act_type='silu', norm_type='BN', depthwise=False):
			
 
				-        super().__init__()
			
 
				-        assert k == 3 and p == 1
			
 
				-        self.g = g
			
 
				-        self.in_dim = in_dim
			
 
				-        self.out_dim = out_dim
			
 
				-        self.act = get_activation(act_type)
			
 
				-
			
 
				-        self.bn = None
			
 
				-        self.conv1 = BasicConv(in_dim, out_dim,
			
 
				-                               kernel_size=k, padding=p, stride=s, group=g,
			
 
				-                               act_type=None, norm_type=norm_type, depthwise=depthwise)
			
 
				-        self.conv2 = BasicConv(in_dim, out_dim,
			
 
				-                               kernel_size=1, padding=(p - k // 2), stride=s, group=g,
			
 
				-                               act_type=None, norm_type=norm_type, depthwise=depthwise)
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        """Forward process"""
			
 
				-        if hasattr(self, 'conv'):
			
 
				-            return self.forward_fuse(x)
			
 
				-        else:
			
 
				-            id_out = 0 if self.bn is None else self.bn(x)
			
 
				-            return self.act(self.conv1(x) + self.conv2(x) + id_out)
			
 
				-
			
 
				-    def forward_fuse(self, x):
			
 
				-        """Forward process"""
			
 
				-        return self.act(self.conv(x))
			
 
				-
			
 
				-    def get_equivalent_kernel_bias(self):
			
 
				-        kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1)
			
 
				-        kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2)
			
 
				-        kernelid, biasid = self._fuse_bn_tensor(self.bn)
			
 
				-        return kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid
			
 
				-
			
 
				-    def _avg_to_3x3_tensor(self, avgp):
			
 
				-        channels = self.in_dim
			
 
				-        groups = self.g
			
 
				-        kernel_size = avgp.kernel_size
			
 
				-        input_dim = channels // groups
			
 
				-        k = torch.zeros((channels, input_dim, kernel_size, kernel_size))
			
 
				-        k[np.arange(channels), np.tile(np.arange(input_dim), groups), :, :] = 1.0 / kernel_size ** 2
			
 
				-        return k
			
 
				-
			
 
				-    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
			
 
				-        if kernel1x1 is None:
			
 
				-            return 0
			
 
				-        else:
			
 
				-            return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1])
			
 
				-
			
 
				-    def _fuse_bn_tensor(self, branch):
			
 
				-        if branch is None:
			
 
				-            return 0, 0
			
 
				-        if isinstance(branch, BasicConv):
			
 
				-            kernel       = branch.conv.weight
			
 
				-            running_mean = branch.norm.running_mean
			
 
				-            running_var  = branch.norm.running_var
			
 
				-            gamma        = branch.norm.weight
			
 
				-            beta         = branch.norm.bias
			
 
				-            eps          = branch.norm.eps
			
 
				-        elif isinstance(branch, nn.BatchNorm2d):
			
 
				-            if not hasattr(self, 'id_tensor'):
			
 
				-                input_dim = self.in_dim // self.g
			
 
				-                kernel_value = np.zeros((self.in_dim, input_dim, 3, 3), dtype=np.float32)
			
 
				-                for i in range(self.in_dim):
			
 
				-                    kernel_value[i, i % input_dim, 1, 1] = 1
			
 
				-                self.id_tensor = torch.from_numpy(kernel_value).to(branch.weight.device)
			
 
				-            kernel       = self.id_tensor
			
 
				-            running_mean = branch.running_mean
			
 
				-            running_var  = branch.running_var
			
 
				-            gamma        = branch.weight
			
 
				-            beta         = branch.bias
			
 
				-            eps          = branch.eps
			
 
				-        std = (running_var + eps).sqrt()
			
 
				-        t = (gamma / std).reshape(-1, 1, 1, 1)
			
 
				-        return kernel * t, beta - running_mean * gamma / std
			
 
				-
			
 
				-    def fuse_convs(self):
			
 
				-        if hasattr(self, 'conv'):
			
 
				-            return
			
 
				-        kernel, bias = self.get_equivalent_kernel_bias()
			
 
				-        self.conv = nn.Conv2d(in_channels  = self.conv1.conv.in_channels,
			
 
				-                              out_channels = self.conv1.conv.out_channels,
			
 
				-                              kernel_size  = self.conv1.conv.kernel_size,
			
 
				-                              stride       = self.conv1.conv.stride,
			
 
				-                              padding      = self.conv1.conv.padding,
			
 
				-                              dilation     = self.conv1.conv.dilation,
			
 
				-                              groups       = self.conv1.conv.groups,
			
 
				-                              bias         = True).requires_grad_(False)
			
 
				-        self.conv.weight.data = kernel
			
 
				-        self.conv.bias.data = bias
			
 
				-        for para in self.parameters():
			
 
				-            para.detach_()
			
 
				-        self.__delattr__('conv1')
			
 
				-        self.__delattr__('conv2')
			
 
				-        if hasattr(self, 'nm'):
			
 
				-            self.__delattr__('nm')
			
 
				-        if hasattr(self, 'bn'):
			
 
				-            self.__delattr__('bn')
			
 
				-        if hasattr(self, 'id_tensor'):
			
 
				-            self.__delattr__('id_tensor')
			
 
				-
			
 
				-class RepNBottleneck(nn.Module):
			
 
				-    def __init__(self,
			
 
				-                 in_dim,
			
 
				-                 out_dim,
			
 
				-                 shortcut=True,
			
 
				-                 kernel_size=(3, 3),
			
 
				-                 expansion=0.5,
			
 
				-                 act_type='silu',
			
 
				-                 norm_type='BN',
			
 
				-                 depthwise=False
			
 
				-                 ):
			
 
				-        super().__init__()
			
 
				-        inter_dim = round(out_dim * expansion)
			
 
				-        self.conv_layer_1 = RepConvN(in_dim, inter_dim, kernel_size[0], p=kernel_size[0]//2, s=1, act_type=act_type, norm_type=norm_type)
			
 
				-        self.conv_layer_2 = BasicConv(inter_dim, out_dim, kernel_size[1], padding=kernel_size[1]//2, stride=1, act_type=act_type, norm_type=norm_type)
			
 
				-        self.add = shortcut and in_dim == out_dim
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        h = self.conv_layer_2(self.conv_layer_1(x))
			
 
				-        return x + h if self.add else h
			
 
				-
			
 
				-class RepNCSP(nn.Module):
			
 
				-    def __init__(self,
			
 
				-                 in_dim,
			
 
				-                 out_dim,
			
 
				-                 num_blocks=1,
			
 
				-                 shortcut=True,
			
 
				-                 expansion=0.5,
			
 
				-                 act_type='silu',
			
 
				-                 norm_type='BN',
			
 
				-                 depthwise=False
			
 
				-                 ):
			
 
				-        super().__init__()
			
 
				-        inter_dim = int(out_dim * expansion)
			
 
				-        self.conv_layer_1 = BasicConv(in_dim, inter_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
			
 
				-        self.conv_layer_2 = BasicConv(in_dim, inter_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
			
 
				-        self.conv_layer_3 = BasicConv(2 * inter_dim, out_dim, kernel_size=1)
			
 
				-        self.module       = nn.Sequential(*(RepNBottleneck(inter_dim,
			
 
				-                                                           inter_dim,
			
 
				-                                                           kernel_size = [3, 3],
			
 
				-                                                           shortcut    = shortcut,
			
 
				-                                                           expansion   = 1.0,
			
 
				-                                                           act_type    = act_type,
			
 
				-                                                           norm_type   = norm_type,
			
 
				-                                                           depthwise   = depthwise)
			
 
				-                                                           for _ in range(num_blocks)))
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        x1 = self.conv_layer_1(x)
			
 
				-        x2 = self.module(self.conv_layer_2(x))
			
 
				-
			
 
				-        return self.conv_layer_3(torch.cat([x1, x2], dim=1))
			
 
				-
			
 
				-class RepGElanLayer(nn.Module):
			
 
				-    """YOLOv9's GELAN module"""
			
 
				-    def __init__(self,
			
 
				-                 in_dim     :int,
			
 
				-                 inter_dims :List,
			
 
				-                 out_dim    :int,
			
 
				-                 num_blocks :int   = 1,
			
 
				-                 shortcut   :bool  = False,
			
 
				-                 act_type   :str   = 'silu',
			
 
				-                 norm_type  :str   = 'BN',
			
 
				-                 depthwise  :bool  = False,
			
 
				-                 ) -> None:
			
 
				-        super(RepGElanLayer, self).__init__()
			
 
				-        # ----------- Basic parameters -----------
			
 
				-        self.in_dim = in_dim
			
 
				-        self.inter_dims = inter_dims
			
 
				-        self.out_dim = out_dim
			
 
				-
			
 
				-        # ----------- Network parameters -----------
			
 
				-        self.conv_layer_1  = BasicConv(in_dim, inter_dims[0], kernel_size=1, act_type=act_type, norm_type=norm_type)
			
 
				-        self.elan_module_1 = nn.Sequential(
			
 
				-             RepNCSP(inter_dims[0]//2,
			
 
				-                     inter_dims[1],
			
 
				-                     num_blocks  = num_blocks,
			
 
				-                     shortcut    = shortcut,
			
 
				-                     expansion   = 0.5,
			
 
				-                     act_type    = act_type,
			
 
				-                     norm_type   = norm_type,
			
 
				-                     depthwise   = depthwise),
			
 
				-            BasicConv(inter_dims[1], inter_dims[1],
			
 
				-                      kernel_size=3, padding=1,
			
 
				-                      act_type=act_type, norm_type=norm_type, depthwise=depthwise)
			
 
				-        )
			
 
				-        self.elan_module_2 = nn.Sequential(
			
 
				-             RepNCSP(inter_dims[1],
			
 
				-                     inter_dims[1],
			
 
				-                     num_blocks  = num_blocks,
			
 
				-                     shortcut    = shortcut,
			
 
				-                     expansion   = 0.5,
			
 
				-                     act_type    = act_type,
			
 
				-                     norm_type   = norm_type,
			
 
				-                     depthwise   = depthwise),
			
 
				-            BasicConv(inter_dims[1], inter_dims[1],
			
 
				-                      kernel_size=3, padding=1,
			
 
				-                      act_type=act_type, norm_type=norm_type, depthwise=depthwise)
			
 
				-        )
			
 
				-        self.conv_layer_2 = BasicConv(inter_dims[0] + 2*self.inter_dims[1], out_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
			
 
				-
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        # Input proj
			
 
				-        x1, x2 = torch.chunk(self.conv_layer_1(x), 2, dim=1)
			
 
				-        out = list([x1, x2])
			
 
				-
			
 
				-        # ELAN module
			
 
				-        out.append(self.elan_module_1(out[-1]))
			
 
				-        out.append(self.elan_module_2(out[-1]))
			
 
				-
			
 
				-        # Output proj
			
 
				-        out = self.conv_layer_2(torch.cat(out, dim=1))
			
 
				-
			
 
				-        return out
			
 
				-    
			
--- a/yolo/models/gelan/gelan_head.py
+++ b/yolo/models/gelan/gelan_head.py
@@ -1,176 +0,0 @@
 
				-import torch
			
 
				-import torch.nn as nn
			
 
				-
			
 
				-try:
			
 
				-    from .gelan_basic import BasicConv
			
 
				-except:
			
 
				-    from  gelan_basic import BasicConv
			
 
				-    
			
 
				-
			
 
				-# Single-level Head
			
 
				-class SingleLevelHead(nn.Module):
			
 
				-    def __init__(self,
			
 
				-                 in_dim       :int  = 256,
			
 
				-                 cls_head_dim :int  = 256,
			
 
				-                 reg_head_dim :int  = 256,
			
 
				-                 num_cls_head :int  = 2,
			
 
				-                 num_reg_head :int  = 2,
			
 
				-                 act_type     :str  = "silu",
			
 
				-                 norm_type    :str  = "BN",
			
 
				-                 depthwise    :bool = False):
			
 
				-        super().__init__()
			
 
				-        # --------- Basic Parameters ----------
			
 
				-        self.in_dim = in_dim
			
 
				-        self.num_cls_head = num_cls_head
			
 
				-        self.num_reg_head = num_reg_head
			
 
				-        self.act_type = act_type
			
 
				-        self.norm_type = norm_type
			
 
				-        self.depthwise = depthwise
			
 
				-        
			
 
				-        # --------- Network Parameters ----------
			
 
				-        ## cls head
			
 
				-        cls_feats = []
			
 
				-        self.cls_head_dim = cls_head_dim
			
 
				-        for i in range(num_cls_head):
			
 
				-            if i == 0:
			
 
				-                cls_feats.append(
			
 
				-                    BasicConv(in_dim, self.cls_head_dim,
			
 
				-                              kernel_size=3, padding=1, stride=1, 
			
 
				-                              act_type=act_type,
			
 
				-                              norm_type=norm_type,
			
 
				-                              depthwise=depthwise)
			
 
				-                              )
			
 
				-            else:
			
 
				-                cls_feats.append(
			
 
				-                    BasicConv(self.cls_head_dim, self.cls_head_dim,
			
 
				-                              kernel_size=3, padding=1, stride=1, 
			
 
				-                              act_type=act_type,
			
 
				-                              norm_type=norm_type,
			
 
				-                              depthwise=depthwise)
			
 
				-                              )
			
 
				-        ## reg head
			
 
				-        reg_feats = []
			
 
				-        self.reg_head_dim = reg_head_dim
			
 
				-        for i in range(num_reg_head):
			
 
				-            if i == 0:
			
 
				-                reg_feats.append(
			
 
				-                    BasicConv(in_dim, self.reg_head_dim,
			
 
				-                              kernel_size=3, padding=1, stride=1, 
			
 
				-                              act_type=act_type,
			
 
				-                              norm_type=norm_type,
			
 
				-                              depthwise=depthwise)
			
 
				-                              )
			
 
				-            else:
			
 
				-                reg_feats.append(
			
 
				-                    BasicConv(self.reg_head_dim, self.reg_head_dim,
			
 
				-                              kernel_size=3, padding=1, stride=1, group=4,
			
 
				-                              act_type=act_type,
			
 
				-                              norm_type=norm_type,
			
 
				-                              depthwise=depthwise)
			
 
				-                              )
			
 
				-        self.cls_feats = nn.Sequential(*cls_feats)
			
 
				-        self.reg_feats = nn.Sequential(*reg_feats)
			
 
				-
			
 
				-        self.init_weights()
			
 
				-        
			
 
				-    def init_weights(self):
			
 
				-        """Initialize the parameters."""
			
 
				-        for m in self.modules():
			
 
				-            if isinstance(m, torch.nn.Conv2d):
			
 
				-                # In order to be consistent with the source code,
			
 
				-                # reset the Conv2d initialization parameters
			
 
				-                m.reset_parameters()
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        """
			
 
				-            in_feats: (Tensor) [B, C, H, W]
			
 
				-        """
			
 
				-        cls_feats = self.cls_feats(x)
			
 
				-        reg_feats = self.reg_feats(x)
			
 
				-
			
 
				-        return cls_feats, reg_feats
			
 
				-    
			
 
				-# Multi-level Head
			
 
				-class GElanDetHead(nn.Module):
			
 
				-    def __init__(self, cfg, in_dims):
			
 
				-        super().__init__()
			
 
				-        ## ----------- Network Parameters -----------
			
 
				-        self.multi_level_heads = nn.ModuleList(
			
 
				-            [SingleLevelHead(in_dim       = in_dims[level],
			
 
				-                             cls_head_dim = max(in_dims[0], min(cfg.num_classes * 2, 128)),
			
 
				-                             reg_head_dim = max(in_dims[0]//4, 16, 4*cfg.reg_max),
			
 
				-                             num_cls_head = cfg.num_cls_head,
			
 
				-                             num_reg_head = cfg.num_reg_head,
			
 
				-                             act_type     = cfg.head_act,
			
 
				-                             norm_type    = cfg.head_norm,
			
 
				-                             depthwise    = cfg.head_depthwise)
			
 
				-                             for level in range(cfg.num_levels)
			
 
				-                             ])
			
 
				-        # --------- Basic Parameters ----------
			
 
				-        self.in_dims = in_dims
			
 
				-        self.cls_head_dim = self.multi_level_heads[0].cls_head_dim
			
 
				-        self.reg_head_dim = self.multi_level_heads[0].reg_head_dim
			
 
				-
			
 
				-
			
 
				-    def forward(self, feats):
			
 
				-        """
			
 
				-            feats: List[(Tensor)] [[B, C, H, W], ...]
			
 
				-        """
			
 
				-        cls_feats = []
			
 
				-        reg_feats = []
			
 
				-        for feat, head in zip(feats, self.multi_level_heads):
			
 
				-            # ---------------- Pred ----------------
			
 
				-            cls_feat, reg_feat = head(feat)
			
 
				-
			
 
				-            cls_feats.append(cls_feat)
			
 
				-            reg_feats.append(reg_feat)
			
 
				-
			
 
				-        return cls_feats, reg_feats
			
 
				-    
			
 
				-
			
 
				-
			
 
				-if __name__=='__main__':
			
 
				-    import time
			
 
				-    from thop import profile
			
 
				-    # Model config
			
 
				-    
			
 
				-    # GElan-Base config
			
 
				-    class GElanBaseConfig(object):
			
 
				-        def __init__(self) -> None:
			
 
				-            # ---------------- Model config ----------------
			
 
				-            self.reg_max  = 16
			
 
				-            self.out_stride = [8, 16, 32]
			
 
				-            self.max_stride = 32
			
 
				-            self.num_levels = 3
			
 
				-            ## Head
			
 
				-            self.head_act  = 'lrelu'
			
 
				-            self.head_norm = 'BN'
			
 
				-            self.head_depthwise = False
			
 
				-            self.num_cls_head   = 2
			
 
				-            self.num_reg_head   = 2
			
 
				-
			
 
				-    cfg = GElanBaseConfig()
			
 
				-    cfg.num_classes = 20
			
 
				-
			
 
				-    # Build a head
			
 
				-    fpn_dims = [128, 256, 256]
			
 
				-    pyramid_feats = [torch.randn(1, fpn_dims[0], 80, 80),
			
 
				-                     torch.randn(1, fpn_dims[1], 40, 40),
			
 
				-                     torch.randn(1, fpn_dims[2], 20, 20)]
			
 
				-    head = GElanDetHead(cfg, fpn_dims)
			
 
				-
			
 
				-
			
 
				-    # Inference
			
 
				-    t0 = time.time()
			
 
				-    cls_feats, reg_feats = head(pyramid_feats)
			
 
				-    t1 = time.time()
			
 
				-    print('Time: ', t1 - t0)
			
 
				-    print("====== GElan Head output ======")
			
 
				-    for level, (cls_f, reg_f) in enumerate(zip(cls_feats, reg_feats)):
			
 
				-        print("- Level-{} : ".format(level), cls_f.shape, reg_f.shape)
			
 
				-
			
 
				-    flops, params = profile(head, inputs=(pyramid_feats, ), verbose=False)
			
 
				-    print('==============================')
			
 
				-    print('GFLOPs : {:.2f}'.format(flops / 1e9 * 2))
			
 
				-    print('Params : {:.2f} M'.format(params / 1e6))
			
 
				-    
			
--- a/yolo/models/gelan/gelan_neck.py
+++ b/yolo/models/gelan/gelan_neck.py
@@ -1,76 +0,0 @@
 
				-import torch
			
 
				-import torch.nn as nn
			
 
				-
			
 
				-from .gelan_basic import BasicConv
			
 
				-
			
 
				-
			
 
				-# SPPF (from yolov5)
			
 
				-class SPPF(nn.Module):
			
 
				-    """
			
 
				-        This code referenced to https://github.com/ultralytics/yolov5
			
 
				-    """
			
 
				-    def __init__(self, cfg, in_dim, out_dim):
			
 
				-        super().__init__()
			
 
				-        ## ----------- Basic Parameters -----------
			
 
				-        inter_dim = round(in_dim * cfg.neck_expand_ratio)
			
 
				-        self.out_dim = out_dim
			
 
				-        ## ----------- Network Parameters -----------
			
 
				-        self.cv1 = BasicConv(in_dim, inter_dim,
			
 
				-                             kernel_size=1, padding=0, stride=1,
			
 
				-                             act_type=cfg.neck_act, norm_type=cfg.neck_norm)
			
 
				-        self.cv2 = BasicConv(inter_dim * 4, out_dim,
			
 
				-                             kernel_size=1, padding=0, stride=1,
			
 
				-                             act_type=cfg.neck_act, norm_type=cfg.neck_norm)
			
 
				-        self.m = nn.MaxPool2d(kernel_size=cfg.spp_pooling_size,
			
 
				-                              stride=1,
			
 
				-                              padding=cfg.spp_pooling_size // 2)
			
 
				-
			
 
				-        # Initialize all layers
			
 
				-        self.init_weights()
			
 
				-
			
 
				-    def init_weights(self):
			
 
				-        """Initialize the parameters."""
			
 
				-        for m in self.modules():
			
 
				-            if isinstance(m, torch.nn.Conv2d):
			
 
				-                # In order to be consistent with the source code,
			
 
				-                # reset the Conv2d initialization parameters
			
 
				-                m.reset_parameters()
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        x = self.cv1(x)
			
 
				-        y1 = self.m(x)
			
 
				-        y2 = self.m(y1)
			
 
				-
			
 
				-        return self.cv2(torch.cat((x, y1, y2, self.m(y2)), 1))
			
 
				-
			
 
				-# SPP-ELAN (from yolov9)
			
 
				-class SPPElan(nn.Module):
			
 
				-    def __init__(self, cfg, in_dim):
			
 
				-        """SPPElan looks like the SPPF."""
			
 
				-        super().__init__()
			
 
				-        ## ----------- Basic Parameters -----------
			
 
				-        self.in_dim = in_dim
			
 
				-        self.inter_dim = cfg.spp_inter_dim
			
 
				-        self.out_dim   = cfg.spp_out_dim
			
 
				-        ## ----------- Network Parameters -----------
			
 
				-        self.conv_layer_1 = BasicConv(in_dim, self.inter_dim, kernel_size=1, act_type=cfg.neck_act, norm_type=cfg.neck_norm)
			
 
				-        self.conv_layer_2 = BasicConv(self.inter_dim * 4, self.out_dim, kernel_size=1, act_type=cfg.neck_act, norm_type=cfg.neck_norm)
			
 
				-        self.pool_layer   = nn.MaxPool2d(kernel_size=cfg.spp_pooling_size, stride=1, padding=cfg.spp_pooling_size // 2)
			
 
				-
			
 
				-        # Initialize all layers
			
 
				-        self.init_weights()
			
 
				-
			
 
				-    def init_weights(self):
			
 
				-        """Initialize the parameters."""
			
 
				-        for m in self.modules():
			
 
				-            if isinstance(m, torch.nn.Conv2d):
			
 
				-                # In order to be consistent with the source code,
			
 
				-                # reset the Conv2d initialization parameters
			
 
				-                m.reset_parameters()
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        y = [self.conv_layer_1(x)]
			
 
				-        y.extend(self.pool_layer(y[-1]) for _ in range(3))
			
 
				-        
			
 
				-        return self.conv_layer_2(torch.cat(y, 1))
			
 
				-    
			
--- a/yolo/models/gelan/gelan_pafpn.py
+++ b/yolo/models/gelan/gelan_pafpn.py
@@ -1,158 +0,0 @@
 
				-import torch
			
 
				-import torch.nn as nn
			
 
				-import torch.nn.functional as F
			
 
				-from typing import List
			
 
				-
			
 
				-try:
			
 
				-    from .gelan_basic import RepGElanLayer, ADown
			
 
				-except:
			
 
				-    from  gelan_basic import RepGElanLayer, ADown
			
 
				-
			
 
				-
			
 
				-# PaFPN-ELAN
			
 
				-class GElanPaFPN(nn.Module):
			
 
				-    def __init__(self,
			
 
				-                 cfg,
			
 
				-                 in_dims :List = [256, 512, 256],
			
 
				-                 ) -> None:
			
 
				-        super(GElanPaFPN, self).__init__()
			
 
				-        print('==============================')
			
 
				-        print('FPN: {}'.format("GELAN PaFPN"))
			
 
				-        # --------------------------- Basic Parameters ---------------------------
			
 
				-        self.in_dims = in_dims[::-1]
			
 
				-        self.out_dims = [cfg.fpn_feats_td["p3"][1], cfg.fpn_feats_bu["p4"][1], cfg.fpn_feats_bu["p5"][1]]
			
 
				-
			
 
				-        # ---------------- Top dwon ----------------
			
 
				-        ## P5 -> P4
			
 
				-        self.top_down_layer_1 = RepGElanLayer(in_dim     = self.in_dims[0] + self.in_dims[1],
			
 
				-                                              inter_dims = cfg.fpn_feats_td["p4"][0],
			
 
				-                                              out_dim    = cfg.fpn_feats_td["p4"][1],
			
 
				-                                              num_blocks = cfg.fpn_depth,
			
 
				-                                              shortcut   = False,
			
 
				-                                              act_type   = cfg.fpn_act,
			
 
				-                                              norm_type  = cfg.fpn_norm,
			
 
				-                                              depthwise  = cfg.fpn_depthwise,
			
 
				-                                              )
			
 
				-        ## P4 -> P3
			
 
				-        self.top_down_layer_2 = RepGElanLayer(in_dim     = cfg.fpn_feats_td["p4"][1] + self.in_dims[2],
			
 
				-                                              inter_dims = cfg.fpn_feats_td["p3"][0],
			
 
				-                                              out_dim    = cfg.fpn_feats_td["p3"][1],
			
 
				-                                              num_blocks = cfg.fpn_depth,
			
 
				-                                              shortcut   = False,
			
 
				-                                              act_type   = cfg.fpn_act,
			
 
				-                                              norm_type  = cfg.fpn_norm,
			
 
				-                                              depthwise  = cfg.fpn_depthwise,
			
 
				-                                              )
			
 
				-        # ---------------- Bottom up ----------------
			
 
				-        ## P3 -> P4
			
 
				-        self.dowmsample_layer_1 = ADown(cfg.fpn_feats_td["p3"][1], cfg.fpn_feats_td["p3"][1],
			
 
				-                                        act_type=cfg.fpn_act, norm_type=cfg.fpn_norm, depthwise=cfg.fpn_depthwise)
			
 
				-        self.bottom_up_layer_1  = RepGElanLayer(in_dim     = cfg.fpn_feats_td["p3"][1] + cfg.fpn_feats_td["p4"][1],
			
 
				-                                                inter_dims = cfg.fpn_feats_bu["p4"][0],
			
 
				-                                                out_dim    = cfg.fpn_feats_bu["p4"][1],
			
 
				-                                                num_blocks = cfg.fpn_depth,
			
 
				-                                                shortcut   = False,
			
 
				-                                                act_type   = cfg.fpn_act,
			
 
				-                                                norm_type  = cfg.fpn_norm,
			
 
				-                                                depthwise  = cfg.fpn_depthwise,
			
 
				-                                                )
			
 
				-        ## P4 -> P5
			
 
				-        self.dowmsample_layer_2 = ADown(cfg.fpn_feats_bu["p4"][1], cfg.fpn_feats_bu["p4"][1],
			
 
				-                                        act_type=cfg.fpn_act, norm_type=cfg.fpn_norm, depthwise=cfg.fpn_depthwise)
			
 
				-        self.bottom_up_layer_2  = RepGElanLayer(in_dim     = cfg.fpn_feats_td["p4"][1] + self.in_dims[0],
			
 
				-                                                inter_dims = cfg.fpn_feats_bu["p5"][0],
			
 
				-                                                out_dim    = cfg.fpn_feats_bu["p5"][1],
			
 
				-                                                num_blocks = cfg.fpn_depth,
			
 
				-                                                shortcut   = False,
			
 
				-                                                act_type   = cfg.fpn_act,
			
 
				-                                                norm_type  = cfg.fpn_norm,
			
 
				-                                                depthwise  = cfg.fpn_depthwise,
			
 
				-                                                )
			
 
				-        
			
 
				-        self.init_weights()
			
 
				-        
			
 
				-    def init_weights(self):
			
 
				-        """Initialize the parameters."""
			
 
				-        for m in self.modules():
			
 
				-            if isinstance(m, torch.nn.Conv2d):
			
 
				-                # In order to be consistent with the source code,
			
 
				-                # reset the Conv2d initialization parameters
			
 
				-                m.reset_parameters()
			
 
				-
			
 
				-    def forward(self, features):
			
 
				-        c3, c4, c5 = features
			
 
				-
			
 
				-        # ------------------ Top down FPN ------------------
			
 
				-        ## P5 -> P4
			
 
				-        p5_up = F.interpolate(c5, scale_factor=2.0)
			
 
				-        p4 = self.top_down_layer_1(torch.cat([p5_up, c4], dim=1))
			
 
				-
			
 
				-        ## P4 -> P3
			
 
				-        p4_up = F.interpolate(p4, scale_factor=2.0)
			
 
				-        p3 = self.top_down_layer_2(torch.cat([p4_up, c3], dim=1))
			
 
				-
			
 
				-        # ------------------ Bottom up FPN ------------------
			
 
				-        ## p3 -> P4
			
 
				-        p3_ds = self.dowmsample_layer_1(p3)
			
 
				-        p4 = self.bottom_up_layer_1(torch.cat([p3_ds, p4], dim=1))
			
 
				-
			
 
				-        ## P4 -> 5
			
 
				-        p4_ds = self.dowmsample_layer_2(p4)
			
 
				-        p5 = self.bottom_up_layer_2(torch.cat([p4_ds, c5], dim=1))
			
 
				-
			
 
				-        out_feats = [p3, p4, p5] # [P3, P4, P5]
			
 
				-
			
 
				-        return out_feats
			
 
				-
			
 
				-
			
 
				-if __name__=='__main__':
			
 
				-    import time
			
 
				-    from thop import profile
			
 
				-    # Model config
			
 
				-    
			
 
				-    # GElan-Base config
			
 
				-    class GElanBaseConfig(object):
			
 
				-        def __init__(self) -> None:
			
 
				-            # ---------------- Model config ----------------
			
 
				-            self.width    = 0.50
			
 
				-            self.depth    = 0.34
			
 
				-            self.ratio    = 2.0
			
 
				-            self.out_stride = [8, 16, 32]
			
 
				-            self.max_stride = 32
			
 
				-            self.num_levels = 3
			
 
				-            ## FPN
			
 
				-            self.fpn      = 'gelan_pafpn'
			
 
				-            self.fpn_act  = 'silu'
			
 
				-            self.fpn_norm = 'BN'
			
 
				-            self.fpn_depthwise = False
			
 
				-            self.fpn_depth    = 3
			
 
				-            self.fpn_feats_td = {
			
 
				-                "p4": [[256, 128], 256],
			
 
				-                "p3": [[128, 64],  128],
			
 
				-            }
			
 
				-            self.fpn_feats_bu = {
			
 
				-                "p4": [[256, 128], 256],
			
 
				-                "p5": [[256, 128], 256],
			
 
				-            }
			
 
				-
			
 
				-    cfg = GElanBaseConfig()
			
 
				-    # Build a head
			
 
				-    in_dims  = [128, 256, 256]
			
 
				-    fpn = GElanPaFPN(cfg, in_dims)
			
 
				-
			
 
				-    # Inference
			
 
				-    x = [torch.randn(1, in_dims[0], 80, 80),
			
 
				-         torch.randn(1, in_dims[1], 40, 40),
			
 
				-         torch.randn(1, in_dims[2], 20, 20)]
			
 
				-    t0 = time.time()
			
 
				-    output = fpn(x)
			
 
				-    t1 = time.time()
			
 
				-    print('Time: ', t1 - t0)
			
 
				-    print('====== FPN output ====== ')
			
 
				-    for level, feat in enumerate(output):
			
 
				-        print("- Level-{} : ".format(level), feat.shape)
			
 
				-
			
 
				-    flops, params = profile(fpn, inputs=(x, ), verbose=False)
			
 
				-    print('==============================')
			
 
				-    print('GFLOPs : {:.2f}'.format(flops / 1e9 * 2))
			
 
				-    print('Params : {:.2f} M'.format(params / 1e6))
			
--- a/yolo/models/gelan/gelan_pred.py
+++ b/yolo/models/gelan/gelan_pred.py
@@ -1,155 +0,0 @@
 
				-import math
			
 
				-import torch
			
 
				-import torch.nn as nn
			
 
				-import torch.nn.functional as F
			
 
				-
			
 
				-
			
 
				-# Single-level pred layer
			
 
				-class SingleLevelPredLayer(nn.Module):
			
 
				-    def __init__(self,
			
 
				-                 cls_dim     :int = 256,
			
 
				-                 reg_dim     :int = 256,
			
 
				-                 stride      :int = 32,
			
 
				-                 reg_max     :int = 16,
			
 
				-                 num_classes :int = 80,
			
 
				-                 num_coords  :int = 4):
			
 
				-        super().__init__()
			
 
				-        # --------- Basic Parameters ----------
			
 
				-        self.stride = stride
			
 
				-        self.cls_dim = cls_dim
			
 
				-        self.reg_dim = reg_dim
			
 
				-        self.reg_max = reg_max
			
 
				-        self.num_classes = num_classes
			
 
				-        self.num_coords = num_coords
			
 
				-
			
 
				-        # --------- Network Parameters ----------
			
 
				-        self.cls_pred = nn.Conv2d(cls_dim, num_classes, kernel_size=1)
			
 
				-        self.reg_pred = nn.Conv2d(reg_dim, num_coords, kernel_size=1, groups=4)                
			
 
				-
			
 
				-        self.init_bias()
			
 
				-        
			
 
				-    def init_bias(self):
			
 
				-        # cls pred bias
			
 
				-        b = self.cls_pred.bias.view(1, -1)
			
 
				-        b.data.fill_(math.log(5 / self.num_classes / (640. / self.stride) ** 2))
			
 
				-        self.cls_pred.bias = torch.nn.Parameter(b.view(-1), requires_grad=True)
			
 
				-        # reg pred bias
			
 
				-        b = self.reg_pred.bias.view(-1, )
			
 
				-        b.data.fill_(1.0)
			
 
				-        self.reg_pred.bias = torch.nn.Parameter(b.view(-1), requires_grad=True)
			
 
				-        w = self.reg_pred.weight
			
 
				-        w.data.fill_(0.)
			
 
				-        self.reg_pred.weight = torch.nn.Parameter(w, requires_grad=True)
			
 
				-
			
 
				-    def generate_anchors(self, fmp_size):
			
 
				-        """
			
 
				-            fmp_size: (List) [H, W]
			
 
				-        """
			
 
				-        # generate grid cells
			
 
				-        fmp_h, fmp_w = fmp_size
			
 
				-        anchor_y, anchor_x = torch.meshgrid([torch.arange(fmp_h), torch.arange(fmp_w)])
			
 
				-        # [H, W, 2] -> [HW, 2]
			
 
				-        anchors = torch.stack([anchor_x, anchor_y], dim=-1).float().view(-1, 2)
			
 
				-        anchors += 0.5  # add center offset
			
 
				-        anchors *= self.stride
			
 
				-
			
 
				-        return anchors
			
 
				-        
			
 
				-    def forward(self, cls_feat, reg_feat):
			
 
				-        # pred
			
 
				-        cls_pred = self.cls_pred(cls_feat)
			
 
				-        reg_pred = self.reg_pred(reg_feat)
			
 
				-
			
 
				-        # generate anchor boxes: [M, 4]
			
 
				-        B, _, H, W = cls_pred.size()
			
 
				-        fmp_size = [H, W]
			
 
				-        anchors = self.generate_anchors(fmp_size)
			
 
				-        anchors = anchors.to(cls_pred.device)
			
 
				-        # stride tensor: [M, 1]
			
 
				-        stride_tensor = torch.ones_like(anchors[..., :1]) * self.stride
			
 
				-        
			
 
				-        # [B, C, H, W] -> [B, H, W, C] -> [B, M, C]
			
 
				-        cls_pred = cls_pred.permute(0, 2, 3, 1).contiguous().view(B, -1, self.num_classes)
			
 
				-        reg_pred = reg_pred.permute(0, 2, 3, 1).contiguous().view(B, -1, 4*self.reg_max)
			
 
				-        
			
 
				-        # output dict
			
 
				-        outputs = {"pred_cls": cls_pred,            # List(Tensor) [B, M, C]
			
 
				-                   "pred_reg": reg_pred,            # List(Tensor) [B, M, 4*(reg_max)]
			
 
				-                   "anchors": anchors,              # List(Tensor) [M, 2]
			
 
				-                   "strides": self.stride,          # List(Int) = [8, 16, 32]
			
 
				-                   "stride_tensor": stride_tensor   # List(Tensor) [M, 1]
			
 
				-                   }
			
 
				-
			
 
				-        return outputs
			
 
				-
			
 
				-# Multi-level pred layer
			
 
				-class GElanPredLayer(nn.Module):
			
 
				-    def __init__(self,
			
 
				-                 cfg,
			
 
				-                 cls_dim,
			
 
				-                 reg_dim,
			
 
				-                 ):
			
 
				-        super().__init__()
			
 
				-        # --------- Basic Parameters ----------
			
 
				-        self.cfg = cfg
			
 
				-        self.cls_dim = cls_dim
			
 
				-        self.reg_dim = reg_dim
			
 
				-
			
 
				-        # ----------- Network Parameters -----------
			
 
				-        ## pred layers
			
 
				-        self.multi_level_preds = nn.ModuleList(
			
 
				-            [SingleLevelPredLayer(cls_dim     = cls_dim,
			
 
				-                                  reg_dim     = reg_dim,
			
 
				-                                  stride      = cfg.out_stride[level],
			
 
				-                                  reg_max     = cfg.reg_max,
			
 
				-                                  num_classes = cfg.num_classes,
			
 
				-                                  num_coords  = 4 * cfg.reg_max)
			
 
				-                                  for level in range(cfg.num_levels)
			
 
				-                                  ])
			
 
				-        ## proj conv
			
 
				-        proj_init = torch.arange(cfg.reg_max, dtype=torch.float)
			
 
				-        self.proj_conv = nn.Conv2d(cfg.reg_max, 1, kernel_size=1, bias=False).requires_grad_(False)
			
 
				-        self.proj_conv.weight.data[:] = nn.Parameter(proj_init.view([1, cfg.reg_max, 1, 1]), requires_grad=False)
			
 
				-
			
 
				-    def forward(self, cls_feats, reg_feats):
			
 
				-        all_anchors = []
			
 
				-        all_strides = []
			
 
				-        all_cls_preds = []
			
 
				-        all_reg_preds = []
			
 
				-        all_box_preds = []
			
 
				-        for level in range(self.cfg.num_levels):
			
 
				-            # -------------- Single-level prediction --------------
			
 
				-            outputs = self.multi_level_preds[level](cls_feats[level], reg_feats[level])
			
 
				-
			
 
				-            # -------------- Decode bbox --------------
			
 
				-            B, M = outputs["pred_reg"].shape[:2]
			
 
				-            # [B, M, 4*(reg_max)] -> [B, M, 4, reg_max]
			
 
				-            delta_pred = outputs["pred_reg"].reshape([B, M, 4, self.cfg.reg_max])
			
 
				-            # [B, M, 4, reg_max] -> [B, reg_max, 4, M]
			
 
				-            delta_pred = delta_pred.permute(0, 3, 2, 1).contiguous()
			
 
				-            # [B, reg_max, 4, M] -> [B, 1, 4, M]
			
 
				-            delta_pred = self.proj_conv(F.softmax(delta_pred, dim=1))
			
 
				-            # [B, 1, 4, M] -> [B, 4, M] -> [B, M, 4]
			
 
				-            delta_pred = delta_pred.view(B, 4, M).permute(0, 2, 1).contiguous()
			
 
				-            ## tlbr -> xyxy
			
 
				-            x1y1_pred = outputs["anchors"][None] - delta_pred[..., :2] * self.cfg.out_stride[level]
			
 
				-            x2y2_pred = outputs["anchors"][None] + delta_pred[..., 2:] * self.cfg.out_stride[level]
			
 
				-            box_pred = torch.cat([x1y1_pred, x2y2_pred], dim=-1)
			
 
				-
			
 
				-            # collect results
			
 
				-            all_cls_preds.append(outputs["pred_cls"])
			
 
				-            all_reg_preds.append(outputs["pred_reg"])
			
 
				-            all_box_preds.append(box_pred)
			
 
				-            all_anchors.append(outputs["anchors"])
			
 
				-            all_strides.append(outputs["stride_tensor"])
			
 
				-        
			
 
				-        # output dict
			
 
				-        outputs = {"pred_cls":      all_cls_preds,         # List(Tensor) [B, M, C]
			
 
				-                   "pred_reg":      all_reg_preds,         # List(Tensor) [B, M, 4*(reg_max)]
			
 
				-                   "pred_box":      all_box_preds,         # List(Tensor) [B, M, 4]
			
 
				-                   "anchors":       all_anchors,           # List(Tensor) [M, 2]
			
 
				-                   "stride_tensor": all_strides,           # List(Tensor) [M, 1]
			
 
				-                   "strides":       self.cfg.out_stride,   # List(Int) = [8, 16, 32]
			
 
				-                   }
			
 
				-
			
 
				-        return outputs
			
--- a/yolo/models/gelan/loss.py
+++ b/yolo/models/gelan/loss.py
@@ -1,187 +0,0 @@
 
				-import torch
			
 
				-import torch.nn as nn
			
 
				-import torch.nn.functional as F
			
 
				-
			
 
				-from utils.box_ops import bbox2dist, bbox_iou
			
 
				-from utils.distributed_utils import get_world_size, is_dist_avail_and_initialized
			
 
				-
			
 
				-from .matcher import TaskAlignedAssigner
			
 
				-
			
 
				-
			
 
				-class SetCriterion(object):
			
 
				-    def __init__(self, cfg):
			
 
				-        # --------------- Basic parameters ---------------
			
 
				-        self.cfg = cfg
			
 
				-        self.reg_max = cfg.reg_max
			
 
				-        self.num_classes = cfg.num_classes
			
 
				-        # --------------- Loss config ---------------
			
 
				-        self.loss_cls_weight = cfg.loss_cls
			
 
				-        self.loss_box_weight = cfg.loss_box
			
 
				-        self.loss_dfl_weight = cfg.loss_dfl
			
 
				-        # --------------- Matcher config ---------------
			
 
				-        self.matcher = TaskAlignedAssigner(num_classes     = cfg.num_classes,
			
 
				-                                           topk_candidates = cfg.tal_topk_candidates,
			
 
				-                                           alpha           = cfg.tal_alpha,
			
 
				-                                           beta            = cfg.tal_beta
			
 
				-                                           )
			
 
				-
			
 
				-    def loss_classes(self, pred_cls, gt_score):
			
 
				-        # compute bce loss
			
 
				-        loss_cls = F.binary_cross_entropy_with_logits(pred_cls, gt_score, reduction='none')
			
 
				-
			
 
				-        return loss_cls
			
 
				-    
			
 
				-    def loss_bboxes(self, pred_box, gt_box, bbox_weight):
			
 
				-        # regression loss
			
 
				-        ious = bbox_iou(pred_box, gt_box, xywh=False, CIoU=True)
			
 
				-        loss_box = (1.0 - ious.squeeze(-1)) * bbox_weight
			
 
				-
			
 
				-        return loss_box
			
 
				-    
			
 
				-    def loss_dfl(self, pred_reg, gt_box, anchor, stride, bbox_weight=None):
			
 
				-        # rescale coords by stride
			
 
				-        gt_box_s = gt_box / stride
			
 
				-        anchor_s = anchor / stride
			
 
				-
			
 
				-        # compute deltas
			
 
				-        gt_ltrb_s = bbox2dist(anchor_s, gt_box_s, self.reg_max - 1)
			
 
				-
			
 
				-        gt_left = gt_ltrb_s.to(torch.long)
			
 
				-        gt_right = gt_left + 1
			
 
				-
			
 
				-        weight_left = gt_right.to(torch.float) - gt_ltrb_s
			
 
				-        weight_right = 1 - weight_left
			
 
				-
			
 
				-        # loss left
			
 
				-        loss_left = F.cross_entropy(
			
 
				-            pred_reg.view(-1, self.reg_max),
			
 
				-            gt_left.view(-1),
			
 
				-            reduction='none').view(gt_left.shape) * weight_left
			
 
				-        # loss right
			
 
				-        loss_right = F.cross_entropy(
			
 
				-            pred_reg.view(-1, self.reg_max),
			
 
				-            gt_right.view(-1),
			
 
				-            reduction='none').view(gt_left.shape) * weight_right
			
 
				-
			
 
				-        loss_dfl = (loss_left + loss_right).mean(-1)
			
 
				-        
			
 
				-        if bbox_weight is not None:
			
 
				-            loss_dfl *= bbox_weight
			
 
				-
			
 
				-        return loss_dfl
			
 
				-
			
 
				-    def __call__(self, outputs, targets):        
			
 
				-        """
			
 
				-            outputs['pred_cls']: List(Tensor) [B, M, C]
			
 
				-            outputs['pred_reg']: List(Tensor) [B, M, 4*(reg_max+1)]
			
 
				-            outputs['pred_box']: List(Tensor) [B, M, 4]
			
 
				-            outputs['anchors']: List(Tensor) [M, 2]
			
 
				-            outputs['strides']: List(Int) [8, 16, 32] output stride
			
 
				-            outputs['stride_tensor']: List(Tensor) [M, 1]
			
 
				-            targets: (List) [dict{'boxes': [...], 
			
 
				-                                 'labels': [...], 
			
 
				-                                 'orig_size': ...}, ...]
			
 
				-        """
			
 
				-        # preds: [B, M, C]
			
 
				-        cls_preds = torch.cat(outputs['pred_cls'], dim=1)
			
 
				-        reg_preds = torch.cat(outputs['pred_reg'], dim=1)
			
 
				-        box_preds = torch.cat(outputs['pred_box'], dim=1)
			
 
				-        bs, num_anchors = cls_preds.shape[:2]
			
 
				-        device = cls_preds.device
			
 
				-        anchors = torch.cat(outputs['anchors'], dim=0)
			
 
				-        
			
 
				-        # --------------- label assignment ---------------
			
 
				-        gt_score_targets = []
			
 
				-        gt_bbox_targets = []
			
 
				-        fg_masks = []
			
 
				-        for batch_idx in range(bs):
			
 
				-            tgt_labels = targets[batch_idx]["labels"].to(device)     # [Mp,]
			
 
				-            tgt_boxs = targets[batch_idx]["boxes"].to(device)        # [Mp, 4]
			
 
				-
			
 
				-            if self.cfg.normalize_coords:
			
 
				-                img_h, img_w = outputs['image_size']
			
 
				-                tgt_boxs[..., [0, 2]] *= img_w
			
 
				-                tgt_boxs[..., [1, 3]] *= img_h
			
 
				-            
			
 
				-            if self.cfg.box_format == 'xywh':
			
 
				-                tgt_boxs_x1y1 = tgt_boxs[..., :2] - 0.5 * tgt_boxs[..., 2:]
			
 
				-                tgt_boxs_x2y2 = tgt_boxs[..., :2] + 0.5 * tgt_boxs[..., 2:]
			
 
				-                tgt_boxs = torch.cat([tgt_boxs_x1y1, tgt_boxs_x2y2], dim=-1)
			
 
				-
			
 
				-            # check target
			
 
				-            if len(tgt_labels) == 0 or tgt_boxs.max().item() == 0.:
			
 
				-                # There is no valid gt
			
 
				-                fg_mask  = cls_preds.new_zeros(1, num_anchors).bool()               #[1, M,]
			
 
				-                gt_score = cls_preds.new_zeros((1, num_anchors, self.num_classes)) #[1, M, C]
			
 
				-                gt_box   = cls_preds.new_zeros((1, num_anchors, 4))                  #[1, M, 4]
			
 
				-            else:
			
 
				-                tgt_labels = tgt_labels[None, :, None]      # [1, Mp, 1]
			
 
				-                tgt_boxs = tgt_boxs[None]                   # [1, Mp, 4]
			
 
				-                (
			
 
				-                    _,
			
 
				-                    gt_box,     # [1, M, 4]
			
 
				-                    gt_score,   # [1, M, C]
			
 
				-                    fg_mask,    # [1, M,]
			
 
				-                    _
			
 
				-                ) = self.matcher(
			
 
				-                    pd_scores = cls_preds[batch_idx:batch_idx+1].detach().sigmoid(), 
			
 
				-                    pd_bboxes = box_preds[batch_idx:batch_idx+1].detach(),
			
 
				-                    anc_points = anchors,
			
 
				-                    gt_labels = tgt_labels,
			
 
				-                    gt_bboxes = tgt_boxs
			
 
				-                    )
			
 
				-            gt_score_targets.append(gt_score)
			
 
				-            gt_bbox_targets.append(gt_box)
			
 
				-            fg_masks.append(fg_mask)
			
 
				-
			
 
				-        # List[B, 1, M, C] -> Tensor[B, M, C] -> Tensor[BM, C]
			
 
				-        fg_masks = torch.cat(fg_masks, 0).view(-1)                                    # [BM,]
			
 
				-        gt_score_targets = torch.cat(gt_score_targets, 0).view(-1, self.num_classes)  # [BM, C]
			
 
				-        gt_bbox_targets = torch.cat(gt_bbox_targets, 0).view(-1, 4)                   # [BM, 4]
			
 
				-        num_fgs = gt_score_targets.sum()
			
 
				-        
			
 
				-        # Average loss normalizer across all the GPUs
			
 
				-        if is_dist_avail_and_initialized():
			
 
				-            torch.distributed.all_reduce(num_fgs)
			
 
				-        num_fgs = (num_fgs / get_world_size()).clamp(1.0)
			
 
				-
			
 
				-        # ------------------ Classification loss ------------------
			
 
				-        cls_preds = cls_preds.view(-1, self.num_classes)
			
 
				-        loss_cls = self.loss_classes(cls_preds, gt_score_targets)
			
 
				-        loss_cls = loss_cls.sum() / num_fgs
			
 
				-
			
 
				-        # ------------------ Regression loss ------------------
			
 
				-        box_preds_pos = box_preds.view(-1, 4)[fg_masks]
			
 
				-        box_targets_pos = gt_bbox_targets.view(-1, 4)[fg_masks]
			
 
				-        bbox_weight = gt_score_targets[fg_masks].sum(-1)
			
 
				-        loss_box = self.loss_bboxes(box_preds_pos, box_targets_pos, bbox_weight)
			
 
				-        loss_box = loss_box.sum() / num_fgs
			
 
				-
			
 
				-        # ------------------ Distribution focal loss  ------------------
			
 
				-        ## process anchors
			
 
				-        anchors = anchors[None].repeat(bs, 1, 1).view(-1, 2)
			
 
				-        ## process stride tensors
			
 
				-        strides = torch.cat(outputs['stride_tensor'], dim=0)
			
 
				-        strides = strides.unsqueeze(0).repeat(bs, 1, 1).view(-1, 1)
			
 
				-        ## fg preds
			
 
				-        reg_preds_pos = reg_preds.view(-1, 4*self.reg_max)[fg_masks]
			
 
				-        anchors_pos = anchors[fg_masks]
			
 
				-        strides_pos = strides[fg_masks]
			
 
				-        ## compute dfl
			
 
				-        loss_dfl = self.loss_dfl(reg_preds_pos, box_targets_pos, anchors_pos, strides_pos, bbox_weight)
			
 
				-        loss_dfl = loss_dfl.sum() / num_fgs
			
 
				-
			
 
				-        # total loss
			
 
				-        losses = loss_cls * self.loss_cls_weight + loss_box * self.loss_box_weight + loss_dfl * self.loss_dfl_weight
			
 
				-        loss_dict = dict(
			
 
				-                loss_cls = loss_cls,
			
 
				-                loss_box = loss_box,
			
 
				-                loss_dfl = loss_dfl,
			
 
				-                losses = losses
			
 
				-        )
			
 
				-
			
 
				-        return loss_dict
			
 
				-    
			
 
				-
			
 
				-if __name__ == "__main__":
			
 
				-    pass
			
--- a/yolo/models/gelan/matcher.py
+++ b/yolo/models/gelan/matcher.py
@@ -1,199 +0,0 @@
 
				-import torch
			
 
				-import torch.nn as nn
			
 
				-import torch.nn.functional as F
			
 
				-from utils.box_ops import bbox_iou
			
 
				-
			
 
				-
			
 
				-# -------------------------- Task Aligned Assigner --------------------------
			
 
				-class TaskAlignedAssigner(nn.Module):
			
 
				-    def __init__(self,
			
 
				-                 num_classes     = 80,
			
 
				-                 topk_candidates = 10,
			
 
				-                 alpha           = 0.5,
			
 
				-                 beta            = 6.0, 
			
 
				-                 eps             = 1e-9):
			
 
				-        super(TaskAlignedAssigner, self).__init__()
			
 
				-        self.topk_candidates = topk_candidates
			
 
				-        self.num_classes = num_classes
			
 
				-        self.bg_idx = num_classes
			
 
				-        self.alpha = alpha
			
 
				-        self.beta = beta
			
 
				-        self.eps = eps
			
 
				-
			
 
				-    @torch.no_grad()
			
 
				-    def forward(self,
			
 
				-                pd_scores,
			
 
				-                pd_bboxes,
			
 
				-                anc_points,
			
 
				-                gt_labels,
			
 
				-                gt_bboxes):
			
 
				-        self.bs = pd_scores.size(0)
			
 
				-        self.n_max_boxes = gt_bboxes.size(1)
			
 
				-
			
 
				-        mask_pos, align_metric, overlaps = self.get_pos_mask(
			
 
				-            pd_scores, pd_bboxes, gt_labels, gt_bboxes, anc_points)
			
 
				-
			
 
				-        target_gt_idx, fg_mask, mask_pos = select_highest_overlaps(
			
 
				-            mask_pos, overlaps, self.n_max_boxes)
			
 
				-
			
 
				-        # Assigned target
			
 
				-        target_labels, target_bboxes, target_scores = self.get_targets(
			
 
				-            gt_labels, gt_bboxes, target_gt_idx, fg_mask)
			
 
				-
			
 
				-        # normalize
			
 
				-        align_metric *= mask_pos
			
 
				-        pos_align_metrics = align_metric.amax(axis=-1, keepdim=True)  # b, max_num_obj
			
 
				-        pos_overlaps = (overlaps * mask_pos).amax(axis=-1, keepdim=True)  # b, max_num_obj
			
 
				-        norm_align_metric = (align_metric * pos_overlaps / (pos_align_metrics + self.eps)).amax(-2).unsqueeze(-1)
			
 
				-        target_scores = target_scores * norm_align_metric
			
 
				-
			
 
				-        return target_labels, target_bboxes, target_scores, fg_mask.bool(), target_gt_idx
			
 
				-
			
 
				-    def get_pos_mask(self, pd_scores, pd_bboxes, gt_labels, gt_bboxes, anc_points):
			
 
				-        # get in_gts mask, (b, max_num_obj, h*w)
			
 
				-        mask_in_gts = select_candidates_in_gts(anc_points, gt_bboxes)
			
 
				-        # get anchor_align metric, (b, max_num_obj, h*w)
			
 
				-        align_metric, overlaps = self.get_box_metrics(pd_scores, pd_bboxes, gt_labels, gt_bboxes, mask_in_gts)
			
 
				-        # get topk_metric mask, (b, max_num_obj, h*w)
			
 
				-        mask_topk = self.select_topk_candidates(align_metric)
			
 
				-        # merge all mask to a final mask, (b, max_num_obj, h*w)
			
 
				-        mask_pos = mask_topk * mask_in_gts
			
 
				-
			
 
				-        return mask_pos, align_metric, overlaps
			
 
				-
			
 
				-    def get_box_metrics(self, pd_scores, pd_bboxes, gt_labels, gt_bboxes, mask_in_gts):
			
 
				-        """Compute alignment metric given predicted and ground truth bounding boxes."""
			
 
				-        na = pd_bboxes.shape[-2]
			
 
				-        mask_in_gts = mask_in_gts.bool()  # b, max_num_obj, h*w
			
 
				-        overlaps = torch.zeros([self.bs, self.n_max_boxes, na], dtype=pd_bboxes.dtype, device=pd_bboxes.device)
			
 
				-        bbox_scores = torch.zeros([self.bs, self.n_max_boxes, na], dtype=pd_scores.dtype, device=pd_scores.device)
			
 
				-
			
 
				-        ind = torch.zeros([2, self.bs, self.n_max_boxes], dtype=torch.long)  # 2, b, max_num_obj
			
 
				-        ind[0] = torch.arange(end=self.bs).view(-1, 1).expand(-1, self.n_max_boxes)  # b, max_num_obj
			
 
				-        ind[1] = gt_labels.squeeze(-1)  # b, max_num_obj
			
 
				-        # Get the scores of each grid for each gt cls
			
 
				-        bbox_scores[mask_in_gts] = pd_scores[ind[0], :, ind[1]][mask_in_gts]  # b, max_num_obj, h*w
			
 
				-
			
 
				-        # (b, max_num_obj, 1, 4), (b, 1, h*w, 4)
			
 
				-        pd_boxes = pd_bboxes.unsqueeze(1).expand(-1, self.n_max_boxes, -1, -1)[mask_in_gts]
			
 
				-        gt_boxes = gt_bboxes.unsqueeze(2).expand(-1, -1, na, -1)[mask_in_gts]
			
 
				-        overlaps[mask_in_gts] = bbox_iou(gt_boxes, pd_boxes, xywh=False, CIoU=True).squeeze(-1).clamp_(0)
			
 
				-
			
 
				-        align_metric = bbox_scores.pow(self.alpha) * overlaps.pow(self.beta)
			
 
				-        return align_metric, overlaps
			
 
				-
			
 
				-    def select_topk_candidates(self, metrics, largest=True):
			
 
				-        """
			
 
				-        Args:
			
 
				-            metrics: (b, max_num_obj, h*w).
			
 
				-            topk_mask: (b, max_num_obj, topk) or None
			
 
				-        """
			
 
				-        # (b, max_num_obj, topk)
			
 
				-        topk_metrics, topk_idxs = torch.topk(metrics, self.topk_candidates, dim=-1, largest=largest)
			
 
				-        topk_mask = (topk_metrics.max(-1, keepdim=True)[0] > self.eps).expand_as(topk_idxs)
			
 
				-        # (b, max_num_obj, topk)
			
 
				-        topk_idxs.masked_fill_(~topk_mask, 0)
			
 
				-
			
 
				-        # (b, max_num_obj, topk, h*w) -> (b, max_num_obj, h*w)
			
 
				-        count_tensor = torch.zeros(metrics.shape, dtype=torch.int8, device=topk_idxs.device)
			
 
				-        ones = torch.ones_like(topk_idxs[:, :, :1], dtype=torch.int8, device=topk_idxs.device)
			
 
				-        for k in range(self.topk_candidates):
			
 
				-            # Expand topk_idxs for each value of k and add 1 at the specified positions
			
 
				-            count_tensor.scatter_add_(-1, topk_idxs[:, :, k:k + 1], ones)
			
 
				-        # count_tensor.scatter_add_(-1, topk_idxs, torch.ones_like(topk_idxs, dtype=torch.int8, device=topk_idxs.device))
			
 
				-        # Filter invalid bboxes
			
 
				-        count_tensor.masked_fill_(count_tensor > 1, 0)
			
 
				-
			
 
				-        return count_tensor.to(metrics.dtype)
			
 
				-
			
 
				-    def get_targets(self, gt_labels, gt_bboxes, target_gt_idx, fg_mask):
			
 
				-        # Assigned target labels, (b, 1)
			
 
				-        batch_ind = torch.arange(end=self.bs, dtype=torch.int64, device=gt_labels.device)[..., None]
			
 
				-        target_gt_idx = target_gt_idx + batch_ind * self.n_max_boxes  # (b, h*w)
			
 
				-        target_labels = gt_labels.long().flatten()[target_gt_idx]  # (b, h*w)
			
 
				-
			
 
				-        # Assigned target boxes, (b, max_num_obj, 4) -> (b, h*w, 4)
			
 
				-        target_bboxes = gt_bboxes.view(-1, 4)[target_gt_idx]
			
 
				-
			
 
				-        # Assigned target scores
			
 
				-        target_labels.clamp_(0)
			
 
				-
			
 
				-        # 10x faster than F.one_hot()
			
 
				-        target_scores = torch.zeros((target_labels.shape[0], target_labels.shape[1], self.num_classes),
			
 
				-                                    dtype=torch.int64,
			
 
				-                                    device=target_labels.device)  # (b, h*w, 80)
			
 
				-        target_scores.scatter_(2, target_labels.unsqueeze(-1), 1)
			
 
				-
			
 
				-        fg_scores_mask = fg_mask[:, :, None].repeat(1, 1, self.num_classes)  # (b, h*w, 80)
			
 
				-        target_scores = torch.where(fg_scores_mask > 0, target_scores, 0)
			
 
				-
			
 
				-        return target_labels, target_bboxes, target_scores
			
 
				-    
			
 
				-
			
 
				-# -------------------------- Basic Functions --------------------------
			
 
				-def select_candidates_in_gts(xy_centers, gt_bboxes, eps=1e-9):
			
 
				-    """select the positive anchors's center in gt
			
 
				-    Args:
			
 
				-        xy_centers (Tensor): shape(bs*n_max_boxes, num_total_anchors, 4)
			
 
				-        gt_bboxes (Tensor): shape(bs, n_max_boxes, 4)
			
 
				-    Return:
			
 
				-        (Tensor): shape(bs, n_max_boxes, num_total_anchors)
			
 
				-    """
			
 
				-    n_anchors = xy_centers.size(0)
			
 
				-    bs, n_max_boxes, _ = gt_bboxes.size()
			
 
				-    _gt_bboxes = gt_bboxes.reshape([-1, 4])
			
 
				-    xy_centers = xy_centers.unsqueeze(0).repeat(bs * n_max_boxes, 1, 1)
			
 
				-    gt_bboxes_lt = _gt_bboxes[:, 0:2].unsqueeze(1).repeat(1, n_anchors, 1)
			
 
				-    gt_bboxes_rb = _gt_bboxes[:, 2:4].unsqueeze(1).repeat(1, n_anchors, 1)
			
 
				-    b_lt = xy_centers - gt_bboxes_lt
			
 
				-    b_rb = gt_bboxes_rb - xy_centers
			
 
				-    bbox_deltas = torch.cat([b_lt, b_rb], dim=-1)
			
 
				-    bbox_deltas = bbox_deltas.reshape([bs, n_max_boxes, n_anchors, -1])
			
 
				-    return (bbox_deltas.min(axis=-1)[0] > eps).to(gt_bboxes.dtype)
			
 
				-
			
 
				-def select_highest_overlaps(mask_pos, overlaps, n_max_boxes):
			
 
				-    """if an anchor box is assigned to multiple gts,
			
 
				-        the one with the highest iou will be selected.
			
 
				-    Args:
			
 
				-        mask_pos (Tensor): shape(bs, n_max_boxes, num_total_anchors)
			
 
				-        overlaps (Tensor): shape(bs, n_max_boxes, num_total_anchors)
			
 
				-    Return:
			
 
				-        target_gt_idx (Tensor): shape(bs, num_total_anchors)
			
 
				-        fg_mask (Tensor): shape(bs, num_total_anchors)
			
 
				-        mask_pos (Tensor): shape(bs, n_max_boxes, num_total_anchors)
			
 
				-    """
			
 
				-    fg_mask = mask_pos.sum(-2)
			
 
				-    if fg_mask.max() > 1:  # one anchor is assigned to multiple gt_bboxes
			
 
				-        mask_multi_gts = (fg_mask.unsqueeze(1) > 1).expand(-1, n_max_boxes, -1)  # (b, n_max_boxes, h*w)
			
 
				-        max_overlaps_idx = overlaps.argmax(1)  # (b, h*w)
			
 
				-
			
 
				-        is_max_overlaps = torch.zeros(mask_pos.shape, dtype=mask_pos.dtype, device=mask_pos.device)
			
 
				-        is_max_overlaps.scatter_(1, max_overlaps_idx.unsqueeze(1), 1)
			
 
				-
			
 
				-        mask_pos = torch.where(mask_multi_gts, is_max_overlaps, mask_pos).float()  # (b, n_max_boxes, h*w)
			
 
				-        fg_mask = mask_pos.sum(-2)
			
 
				-    # Find each grid serve which gt(index)
			
 
				-    target_gt_idx = mask_pos.argmax(-2)  # (b, h*w)
			
 
				-
			
 
				-    return target_gt_idx, fg_mask, mask_pos
			
 
				-
			
 
				-def iou_calculator(box1, box2, eps=1e-9):
			
 
				-    """Calculate iou for batch
			
 
				-    Args:
			
 
				-        box1 (Tensor): shape(bs, n_max_boxes, 1, 4)
			
 
				-        box2 (Tensor): shape(bs, 1, num_total_anchors, 4)
			
 
				-    Return:
			
 
				-        (Tensor): shape(bs, n_max_boxes, num_total_anchors)
			
 
				-    """
			
 
				-    box1 = box1.unsqueeze(2)  # [N, M1, 4] -> [N, M1, 1, 4]
			
 
				-    box2 = box2.unsqueeze(1)  # [N, M2, 4] -> [N, 1, M2, 4]
			
 
				-    px1y1, px2y2 = box1[:, :, :, 0:2], box1[:, :, :, 2:4]
			
 
				-    gx1y1, gx2y2 = box2[:, :, :, 0:2], box2[:, :, :, 2:4]
			
 
				-    x1y1 = torch.maximum(px1y1, gx1y1)
			
 
				-    x2y2 = torch.minimum(px2y2, gx2y2)
			
 
				-    overlap = (x2y2 - x1y1).clip(0).prod(-1)
			
 
				-    area1 = (px2y2 - px1y1).clip(0).prod(-1)
			
 
				-    area2 = (gx2y2 - gx1y1).clip(0).prod(-1)
			
 
				-    union = area1 + area2 - overlap + eps
			
 
				-
			
 
				-    return overlap / union
			
--- a/yolo/models/yolof/build.py
+++ b/yolo/models/yolof/build.py
--- a/yolo/models/yolof/loss.py
+++ b/yolo/models/yolof/loss.py
@@ -0,0 +1,144 @@
 
				+import torch
			
 
				+import torch.nn as nn
			
 
				+import torch.nn.functional as F
			
 
				+from utils.box_ops import *
			
 
				+from utils.misc import sigmoid_focal_loss
			
 
				+from utils.distributed_utils import get_world_size, is_dist_avail_and_initialized
			
 
				+
			
 
				+from .matcher import UniformMatcher
			
 
				+
			
 
				+
			
 
				+class SetCriterion(nn.Module):
			
 
				+    """
			
 
				+        This code referenced to https://github.com/megvii-model/YOLOF/blob/main/playground/detection/coco/yolof/yolof_base/yolof.py
			
 
				+    """
			
 
				+    def __init__(self, cfg):
			
 
				+        super().__init__()
			
 
				+        # ------------- Basic parameters -------------
			
 
				+        self.cfg = cfg
			
 
				+        self.num_classes = cfg.num_classes
			
 
				+        # ------------- Focal loss -------------
			
 
				+        self.alpha = cfg.focal_loss_alpha
			
 
				+        self.gamma = cfg.focal_loss_gamma
			
 
				+        # ------------- Loss weight -------------
			
 
				+        self.weight_dict = {'loss_cls': cfg.loss_cls_weight,
			
 
				+                            'loss_reg': cfg.loss_reg_weight}
			
 
				+        # ------------- Matcher -------------
			
 
				+        self.matcher_cfg = cfg.matcher_hpy
			
 
				+        self.matcher = UniformMatcher(self.matcher_cfg['topk_candidates'])
			
 
				+
			
 
				+    def loss_labels(self, pred_cls, tgt_cls, num_boxes):
			
 
				+        """
			
 
				+            pred_cls: (Tensor) [N, C]
			
 
				+            tgt_cls:  (Tensor) [N, C]
			
 
				+        """
			
 
				+        # cls loss: [V, C]
			
 
				+        loss_cls = sigmoid_focal_loss(pred_cls, tgt_cls, self.alpha, self.gamma)
			
 
				+
			
 
				+        return loss_cls.sum() / num_boxes
			
 
				+
			
 
				+    def loss_bboxes(self, pred_box, tgt_box, num_boxes):
			
 
				+        """
			
 
				+            pred_box: (Tensor) [N, 4]
			
 
				+            tgt_box:  (Tensor) [N, 4]
			
 
				+        """
			
 
				+        # giou
			
 
				+        pred_giou = generalized_box_iou(pred_box, tgt_box)  # [N, M]
			
 
				+        # giou loss
			
 
				+        loss_reg = 1. - torch.diag(pred_giou)
			
 
				+
			
 
				+        return loss_reg.sum() / num_boxes
			
 
				+
			
 
				+    def forward(self, outputs, targets):
			
 
				+        """
			
 
				+            outputs['pred_cls']: (Tensor) [B, M, C]
			
 
				+            outputs['pred_box']: (Tensor) [B, M, 4]
			
 
				+            targets: (List) [dict{'boxes': [...], 
			
 
				+                                 'labels': [...], 
			
 
				+                                 'orig_size': ...}, ...]
			
 
				+        """
			
 
				+        # -------------------- Pre-process --------------------
			
 
				+        pred_box = outputs['pred_box']
			
 
				+        pred_cls = outputs['pred_cls'].reshape(-1, self.num_classes)
			
 
				+        anchor_boxes = outputs['anchors']
			
 
				+        masks = ~outputs['mask']
			
 
				+        device = pred_box.device
			
 
				+        B = len(targets)
			
 
				+
			
 
				+        # -------------------- Label assignment --------------------
			
 
				+        indices = self.matcher(pred_box, anchor_boxes, targets)
			
 
				+
			
 
				+        # [M, 4] -> [1, M, 4] -> [B, M, 4]
			
 
				+        anchor_boxes = box_cxcywh_to_xyxy(anchor_boxes)
			
 
				+        anchor_boxes = anchor_boxes[None].repeat(B, 1, 1)
			
 
				+
			
 
				+        ious = []
			
 
				+        pos_ious = []
			
 
				+        for i in range(B):
			
 
				+            src_idx, tgt_idx = indices[i]
			
 
				+            # iou between predbox and tgt box
			
 
				+            iou, _ = box_iou(pred_box[i, ...], (targets[i]['boxes']).clone())
			
 
				+            if iou.numel() == 0:
			
 
				+                max_iou = iou.new_full((iou.size(0),), 0)
			
 
				+            else:
			
 
				+                max_iou = iou.max(dim=1)[0]
			
 
				+            # iou between anchorbox and tgt box
			
 
				+            a_iou, _ = box_iou(anchor_boxes[i], (targets[i]['boxes']).clone())
			
 
				+            if a_iou.numel() == 0:
			
 
				+                pos_iou = a_iou.new_full((0,), 0)
			
 
				+            else:
			
 
				+                pos_iou = a_iou[src_idx, tgt_idx]
			
 
				+            ious.append(max_iou)
			
 
				+            pos_ious.append(pos_iou)
			
 
				+
			
 
				+        ious = torch.cat(ious)
			
 
				+        ignore_idx = ious > self.matcher_cfg['ignore_thresh']
			
 
				+        pos_ious = torch.cat(pos_ious)
			
 
				+        pos_ignore_idx = pos_ious < self.matcher_cfg['iou_thresh']
			
 
				+
			
 
				+        src_idx = torch.cat(
			
 
				+            [src + idx * anchor_boxes[0].shape[0] for idx, (src, _) in
			
 
				+             enumerate(indices)])
			
 
				+        # [BM,]
			
 
				+        gt_cls = torch.full(pred_cls.shape[:1],
			
 
				+                                self.num_classes,
			
 
				+                                dtype=torch.int64,
			
 
				+                                device=device)
			
 
				+        gt_cls[ignore_idx] = -1
			
 
				+        tgt_cls_o = torch.cat([t['labels'][J] for t, (_, J) in zip(targets, indices)])
			
 
				+        tgt_cls_o[pos_ignore_idx] = -1
			
 
				+
			
 
				+        gt_cls[src_idx] = tgt_cls_o.to(device)
			
 
				+
			
 
				+        foreground_idxs = (gt_cls >= 0) & (gt_cls != self.num_classes)
			
 
				+        num_foreground = foreground_idxs.sum()
			
 
				+
			
 
				+        if is_dist_avail_and_initialized():
			
 
				+            torch.distributed.all_reduce(num_foreground)
			
 
				+        num_foreground = torch.clamp(num_foreground / get_world_size(), min=1).item()
			
 
				+
			
 
				+        # -------------------- Classification loss --------------------
			
 
				+        gt_cls_target = torch.zeros_like(pred_cls)
			
 
				+        gt_cls_target[foreground_idxs, gt_cls[foreground_idxs]] = 1
			
 
				+        valid_idxs = (gt_cls >= 0) & masks
			
 
				+        loss_labels = self.loss_labels(pred_cls[valid_idxs], gt_cls_target[valid_idxs], num_foreground)
			
 
				+
			
 
				+        # -------------------- Regression loss --------------------
			
 
				+        tgt_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0).to(device)
			
 
				+        tgt_boxes = tgt_boxes[~pos_ignore_idx]
			
 
				+        matched_pred_box = pred_box.reshape(-1, 4)[src_idx[~pos_ignore_idx.cpu()]]
			
 
				+        loss_bboxes = self.loss_bboxes(matched_pred_box, tgt_boxes, num_foreground)
			
 
				+
			
 
				+        total_loss = loss_labels * self.weight_dict["loss_cls"] + \
			
 
				+                     loss_bboxes * self.weight_dict["loss_reg"]
			
 
				+        loss_dict = dict(
			
 
				+                loss_cls = loss_labels,
			
 
				+                loss_reg = loss_bboxes,
			
 
				+                losses   = total_loss,
			
 
				+        )
			
 
				+
			
 
				+        return loss_dict
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    pass
			
--- a/yolo/models/yolof/matcher.py
+++ b/yolo/models/yolof/matcher.py
@@ -0,0 +1,103 @@
 
				+import numpy as np
			
 
				+import torch
			
 
				+from torch import nn
			
 
				+from utils.box_ops import *
			
 
				+
			
 
				+
			
 
				+class UniformMatcher(nn.Module):
			
 
				+    """
			
 
				+    This code is referenced to https://github.com/megvii-model/YOLOF/blob/main/playground/detection/coco/yolof/yolof_base/uniform_matcher.py
			
 
				+    """
			
 
				+    def __init__(self, match_times: int = 4):
			
 
				+        super().__init__()
			
 
				+        self.match_times = match_times
			
 
				+
			
 
				+    @torch.no_grad()
			
 
				+    def forward(self, pred_boxes, anchor_boxes, targets):
			
 
				+        """
			
 
				+            pred_boxes:   (Tensor) -> [B, num_queries, 4]
			
 
				+            anchor_boxes: (Tensor) -> [num_queries, 4]
			
 
				+            targets:      (Dict)   -> dict{'boxes': [...], 'labels': [...]}
			
 
				+        """
			
 
				+
			
 
				+        bs, num_queries = pred_boxes.shape[:2]
			
 
				+
			
 
				+        # We flatten to compute the cost matrices in a batch
			
 
				+        # [B, num_queries, 4] -> [M, 4]
			
 
				+        out_bbox = pred_boxes.flatten(0, 1)
			
 
				+        # [num_queries, 4] -> [1, num_queries, 4] -> [B, num_queries, 4] -> [M, 4]
			
 
				+        anchor_boxes = anchor_boxes[None].repeat(bs, 1, 1)
			
 
				+        anchor_boxes = anchor_boxes.flatten(0, 1)
			
 
				+
			
 
				+        # Also concat the target boxes
			
 
				+        tgt_bbox = torch.cat([v['boxes'] for v in targets])
			
 
				+
			
 
				+        # Compute the L1 cost between boxes
			
 
				+        # Note that we use anchors and predict boxes both
			
 
				+        cost_bbox = torch.cdist(box_xyxy_to_cxcywh(out_bbox), 
			
 
				+                                box_xyxy_to_cxcywh(tgt_bbox), 
			
 
				+                                p=1)
			
 
				+        cost_bbox_anchors = torch.cdist(anchor_boxes, 
			
 
				+                                        box_xyxy_to_cxcywh(tgt_bbox), 
			
 
				+                                        p=1)
			
 
				+
			
 
				+        # Final cost matrix: [B, M, N], M=num_queries, N=num_tgt
			
 
				+        C = cost_bbox
			
 
				+        C = C.view(bs, num_queries, -1).cpu()
			
 
				+        C1 = cost_bbox_anchors
			
 
				+        C1 = C1.view(bs, num_queries, -1).cpu()
			
 
				+
			
 
				+        sizes = [len(v['boxes']) for v in targets]  # the number of object instances in each image
			
 
				+        all_indices_list = [[] for _ in range(bs)]
			
 
				+        # positive indices when matching predict boxes and gt boxes
			
 
				+        # len(indices) = batch size
			
 
				+        # len(tupe) = topk
			
 
				+        indices = [
			
 
				+            tuple(
			
 
				+                torch.topk(
			
 
				+                    c[i],
			
 
				+                    k=self.match_times,
			
 
				+                    dim=0,
			
 
				+                    largest=False)[1].numpy().tolist()
			
 
				+            )
			
 
				+            for i, c in enumerate(C.split(sizes, -1))
			
 
				+        ]
			
 
				+        # positive indices when matching anchor boxes and gt boxes
			
 
				+        indices1 = [
			
 
				+            tuple(
			
 
				+                torch.topk(
			
 
				+                    c[i],
			
 
				+                    k=self.match_times,
			
 
				+                    dim=0,
			
 
				+                    largest=False)[1].numpy().tolist())
			
 
				+            for i, c in enumerate(C1.split(sizes, -1))]
			
 
				+
			
 
				+        # concat the indices according to image ids
			
 
				+        # img_id = batch_id
			
 
				+        for img_id, (idx, idx1) in enumerate(zip(indices, indices1)):
			
 
				+            img_idx_i = [
			
 
				+                np.array(idx_ + idx1_)
			
 
				+                for (idx_, idx1_) in zip(idx, idx1)
			
 
				+            ] # 'i' is the index of queris
			
 
				+            img_idx_j = [
			
 
				+                np.array(list(range(len(idx_))) + list(range(len(idx1_))))
			
 
				+                for (idx_, idx1_) in zip(idx, idx1)
			
 
				+            ] # 'j' is the index of tgt
			
 
				+            all_indices_list[img_id] = [*zip(img_idx_i, img_idx_j)]
			
 
				+
			
 
				+        # re-organize the positive indices
			
 
				+        all_indices = []
			
 
				+        for img_id in range(bs):
			
 
				+            all_idx_i = []
			
 
				+            all_idx_j = []
			
 
				+            for idx_list in all_indices_list[img_id]:
			
 
				+                idx_i, idx_j = idx_list
			
 
				+                all_idx_i.append(idx_i)
			
 
				+                all_idx_j.append(idx_j)
			
 
				+            all_idx_i = np.hstack(all_idx_i)
			
 
				+            all_idx_j = np.hstack(all_idx_j)
			
 
				+            all_indices.append((all_idx_i, all_idx_j))
			
 
				+
			
 
				+
			
 
				+        return [(torch.as_tensor(i, dtype=torch.int64), 
			
 
				+                 torch.as_tensor(j, dtype=torch.int64)) for i, j in all_indices]
			
--- a/yolo/models/yolof/modules.py
+++ b/yolo/models/yolof/modules.py
@@ -0,0 +1,148 @@
 
				+import torch
			
 
				+import torch.nn as nn
			
 
				+from typing import List
			
 
				+
			
 
				+
			
 
				+# --------------------- Basic modules ---------------------
			
 
				+def get_conv2d(c1, c2, k, p, s, d, g, bias=False):
			
 
				+    conv = nn.Conv2d(c1, c2, k, stride=s, padding=p, dilation=d, groups=g, bias=bias)
			
 
				+
			
 
				+    return conv
			
 
				+
			
 
				+def get_activation(act_type=None):
			
 
				+    if act_type == 'relu':
			
 
				+        return nn.ReLU(inplace=True)
			
 
				+    elif act_type == 'lrelu':
			
 
				+        return nn.LeakyReLU(0.1, inplace=True)
			
 
				+    elif act_type == 'mish':
			
 
				+        return nn.Mish(inplace=True)
			
 
				+    elif act_type == 'silu':
			
 
				+        return nn.SiLU(inplace=True)
			
 
				+    elif act_type is None:
			
 
				+        return nn.Identity()
			
 
				+    else:
			
 
				+        raise NotImplementedError
			
 
				+        
			
 
				+def get_norm(norm_type, dim):
			
 
				+    if norm_type == 'BN':
			
 
				+        return nn.BatchNorm2d(dim)
			
 
				+    elif norm_type == 'GN':
			
 
				+        return nn.GroupNorm(num_groups=32, num_channels=dim)
			
 
				+    elif norm_type is None:
			
 
				+        return nn.Identity()
			
 
				+    else:
			
 
				+        raise NotImplementedError
			
 
				+
			
 
				+class BasicConv(nn.Module):
			
 
				+    def __init__(self, 
			
 
				+                 in_dim,                   # in channels
			
 
				+                 out_dim,                  # out channels 
			
 
				+                 kernel_size=1,            # kernel size 
			
 
				+                 padding=0,                # padding
			
 
				+                 stride=1,                 # padding
			
 
				+                 dilation=1,               # dilation
			
 
				+                 act_type  :str = 'lrelu', # activation
			
 
				+                 norm_type :str = 'BN',    # normalization
			
 
				+                 depthwise :bool = False
			
 
				+                ):
			
 
				+        super(BasicConv, self).__init__()
			
 
				+        self.depthwise = depthwise
			
 
				+        use_bias = False if norm_type is not None else True
			
 
				+        if not depthwise:
			
 
				+            self.conv = get_conv2d(in_dim, out_dim, k=kernel_size, p=padding, s=stride, d=dilation, g=1, bias=use_bias)
			
 
				+            self.norm = get_norm(norm_type, out_dim)
			
 
				+        else:
			
 
				+            self.conv1 = get_conv2d(in_dim, in_dim, k=kernel_size, p=padding, s=stride, d=dilation, g=in_dim, bias=use_bias)
			
 
				+            self.norm1 = get_norm(norm_type, in_dim)
			
 
				+            self.conv2 = get_conv2d(in_dim, out_dim, k=1, p=0, s=1, d=1, g=1)
			
 
				+            self.norm2 = get_norm(norm_type, out_dim)
			
 
				+        self.act  = get_activation(act_type)
			
 
				+
			
 
				+    def forward(self, x):
			
 
				+        if not self.depthwise:
			
 
				+            return self.act(self.norm(self.conv(x)))
			
 
				+        else:
			
 
				+            # Depthwise conv
			
 
				+            x = self.norm1(self.conv1(x))
			
 
				+            # Pointwise conv
			
 
				+            x = self.act(self.norm2(self.conv2(x)))
			
 
				+            return x
			
 
				+
			
 
				+
			
 
				+# --------------------- ResNet modules ---------------------
			
 
				+def conv3x3(in_planes, out_planes, stride=1):
			
 
				+    """3x3 convolution with padding"""
			
 
				+    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
			
 
				+                     padding=1, bias=False)
			
 
				+
			
 
				+def conv1x1(in_planes, out_planes, stride=1):
			
 
				+    """1x1 convolution"""
			
 
				+    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
			
 
				+
			
 
				+class BasicBlock(nn.Module):
			
 
				+    expansion = 1
			
 
				+
			
 
				+    def __init__(self, inplanes, planes, stride=1, downsample=None):
			
 
				+        super(BasicBlock, self).__init__()
			
 
				+        self.conv1 = conv3x3(inplanes, planes, stride)
			
 
				+        self.bn1 = nn.BatchNorm2d(planes)
			
 
				+        self.relu = nn.ReLU(inplace=True)
			
 
				+        self.conv2 = conv3x3(planes, planes)
			
 
				+        self.bn2 = nn.BatchNorm2d(planes)
			
 
				+        self.downsample = downsample
			
 
				+        self.stride = stride
			
 
				+
			
 
				+    def forward(self, x):
			
 
				+        identity = x
			
 
				+
			
 
				+        out = self.conv1(x)
			
 
				+        out = self.bn1(out)
			
 
				+        out = self.relu(out)
			
 
				+
			
 
				+        out = self.conv2(out)
			
 
				+        out = self.bn2(out)
			
 
				+
			
 
				+        if self.downsample is not None:
			
 
				+            identity = self.downsample(x)
			
 
				+
			
 
				+        out += identity
			
 
				+        out = self.relu(out)
			
 
				+
			
 
				+        return out
			
 
				+
			
 
				+class Bottleneck(nn.Module):
			
 
				+    expansion = 4
			
 
				+
			
 
				+    def __init__(self, inplanes, planes, stride=1, downsample=None):
			
 
				+        super(Bottleneck, self).__init__()
			
 
				+        self.conv1 = conv1x1(inplanes, planes)
			
 
				+        self.bn1 = nn.BatchNorm2d(planes)
			
 
				+        self.conv2 = conv3x3(planes, planes, stride)
			
 
				+        self.bn2 = nn.BatchNorm2d(planes)
			
 
				+        self.conv3 = conv1x1(planes, planes * self.expansion)
			
 
				+        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
			
 
				+        self.relu = nn.ReLU(inplace=True)
			
 
				+        self.downsample = downsample
			
 
				+        self.stride = stride
			
 
				+
			
 
				+    def forward(self, x):
			
 
				+        identity = x
			
 
				+
			
 
				+        out = self.conv1(x)
			
 
				+        out = self.bn1(out)
			
 
				+        out = self.relu(out)
			
 
				+
			
 
				+        out = self.conv2(out)
			
 
				+        out = self.bn2(out)
			
 
				+        out = self.relu(out)
			
 
				+
			
 
				+        out = self.conv3(out)
			
 
				+        out = self.bn3(out)
			
 
				+
			
 
				+        if self.downsample is not None:
			
 
				+            identity = self.downsample(x)
			
 
				+
			
 
				+        out += identity
			
 
				+        out = self.relu(out)
			
 
				+
			
 
				+        return out
			
--- a/yolo/models/yolof/resnet.py
+++ b/yolo/models/yolof/resnet.py
@@ -0,0 +1,187 @@
 
				+import torch
			
 
				+import torch.nn as nn
			
 
				+import torch.utils.model_zoo as model_zoo
			
 
				+
			
 
				+try:
			
 
				+    from .modules import conv1x1, BasicBlock, Bottleneck
			
 
				+except:
			
 
				+    from  modules import conv1x1, BasicBlock, Bottleneck
			
 
				+
			
 
				+__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
			
 
				+           'resnet152']
			
 
				+
			
 
				+
			
 
				+model_urls = {
			
 
				+    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
			
 
				+    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
			
 
				+    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
			
 
				+    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
			
 
				+    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
			
 
				+}
			
 
				+
			
 
				+
			
 
				+# --------------------- ResNet -----------------------
			
 
				+class ResNet(nn.Module):
			
 
				+
			
 
				+    def __init__(self, block, layers, zero_init_residual=False):
			
 
				+        super(ResNet, self).__init__()
			
 
				+        self.inplanes = 64
			
 
				+        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
			
 
				+                               bias=False)
			
 
				+        self.bn1 = nn.BatchNorm2d(64)
			
 
				+        self.relu = nn.ReLU(inplace=True)
			
 
				+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
			
 
				+        self.layer1 = self._make_layer(block, 64, layers[0])
			
 
				+        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
			
 
				+        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
			
 
				+        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
			
 
				+
			
 
				+        for m in self.modules():
			
 
				+            if isinstance(m, nn.Conv2d):
			
 
				+                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
			
 
				+            elif isinstance(m, nn.BatchNorm2d):
			
 
				+                nn.init.constant_(m.weight, 1)
			
 
				+                nn.init.constant_(m.bias, 0)
			
 
				+
			
 
				+        # Zero-initialize the last BN in each residual branch,
			
 
				+        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
			
 
				+        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
			
 
				+        if zero_init_residual:
			
 
				+            for m in self.modules():
			
 
				+                if isinstance(m, Bottleneck):
			
 
				+                    nn.init.constant_(m.bn3.weight, 0)
			
 
				+                elif isinstance(m, BasicBlock):
			
 
				+                    nn.init.constant_(m.bn2.weight, 0)
			
 
				+
			
 
				+    def _make_layer(self, block, planes, blocks, stride=1):
			
 
				+        downsample = None
			
 
				+        if stride != 1 or self.inplanes != planes * block.expansion:
			
 
				+            downsample = nn.Sequential(
			
 
				+                conv1x1(self.inplanes, planes * block.expansion, stride),
			
 
				+                nn.BatchNorm2d(planes * block.expansion),
			
 
				+            )
			
 
				+
			
 
				+        layers = []
			
 
				+        layers.append(block(self.inplanes, planes, stride, downsample))
			
 
				+        self.inplanes = planes * block.expansion
			
 
				+        for _ in range(1, blocks):
			
 
				+            layers.append(block(self.inplanes, planes))
			
 
				+
			
 
				+        return nn.Sequential(*layers)
			
 
				+
			
 
				+    def forward(self, x):
			
 
				+        """
			
 
				+        Input:
			
 
				+            x: (Tensor) -> [B, C, H, W]
			
 
				+        Output:
			
 
				+            c5: (Tensor) -> [B, C, H/32, W/32]
			
 
				+        """
			
 
				+        c1 = self.conv1(x)     # [B, C, H/2, W/2]
			
 
				+        c1 = self.bn1(c1)      # [B, C, H/2, W/2]
			
 
				+        c1 = self.relu(c1)     # [B, C, H/2, W/2]
			
 
				+        c2 = self.maxpool(c1)  # [B, C, H/4, W/4]
			
 
				+
			
 
				+        c2 = self.layer1(c2)   # [B, C, H/4, W/4]
			
 
				+        c3 = self.layer2(c2)   # [B, C, H/8, W/8]
			
 
				+        c4 = self.layer3(c3)   # [B, C, H/16, W/16]
			
 
				+        c5 = self.layer4(c4)   # [B, C, H/32, W/32]
			
 
				+
			
 
				+        return c5
			
 
				+
			
 
				+
			
 
				+# --------------------- Functions -----------------------
			
 
				+def build_resnet(model_name="resnet18", pretrained=False):
			
 
				+    if model_name == 'resnet18':
			
 
				+        model = resnet18(pretrained)
			
 
				+        feat_dim = 512
			
 
				+    elif model_name == 'resnet34':
			
 
				+        model = resnet34(pretrained)
			
 
				+        feat_dim = 512
			
 
				+    elif model_name == 'resnet50':
			
 
				+        model = resnet50(pretrained)
			
 
				+        feat_dim = 2048
			
 
				+    elif model_name == 'resnet101':
			
 
				+        model = resnet34(pretrained)
			
 
				+        feat_dim = 2048
			
 
				+    else:
			
 
				+        raise NotImplementedError("Unknown resnet: {}".format(model_name))
			
 
				+    
			
 
				+    return model, feat_dim
			
 
				+
			
 
				+def resnet18(pretrained=False, **kwargs):
			
 
				+    """Constructs a ResNet-18 model.
			
 
				+
			
 
				+    Args:
			
 
				+        pretrained (bool): If True, returns a model pre-trained on ImageNet
			
 
				+    """
			
 
				+    model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
			
 
				+    if pretrained:
			
 
				+        # strict = False as we don't need fc layer params.
			
 
				+        model.load_state_dict(model_zoo.load_url(model_urls['resnet18']), strict=False)
			
 
				+    return model
			
 
				+
			
 
				+def resnet34(pretrained=False, **kwargs):
			
 
				+    """Constructs a ResNet-34 model.
			
 
				+
			
 
				+    Args:
			
 
				+        pretrained (bool): If True, returns a model pre-trained on ImageNet
			
 
				+    """
			
 
				+    model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
			
 
				+    if pretrained:
			
 
				+        model.load_state_dict(model_zoo.load_url(model_urls['resnet34']), strict=False)
			
 
				+    return model
			
 
				+
			
 
				+def resnet50(pretrained=False, **kwargs):
			
 
				+    """Constructs a ResNet-50 model.
			
 
				+
			
 
				+    Args:
			
 
				+        pretrained (bool): If True, returns a model pre-trained on ImageNet
			
 
				+    """
			
 
				+    model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
			
 
				+    if pretrained:
			
 
				+        model.load_state_dict(model_zoo.load_url(model_urls['resnet50']), strict=False)
			
 
				+    return model
			
 
				+
			
 
				+def resnet101(pretrained=False, **kwargs):
			
 
				+    """Constructs a ResNet-101 model.
			
 
				+
			
 
				+    Args:
			
 
				+        pretrained (bool): If True, returns a model pre-trained on ImageNet
			
 
				+    """
			
 
				+    model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
			
 
				+    if pretrained:
			
 
				+        model.load_state_dict(model_zoo.load_url(model_urls['resnet101']), strict=False)
			
 
				+    return model
			
 
				+
			
 
				+def resnet152(pretrained=False, **kwargs):
			
 
				+    """Constructs a ResNet-152 model.
			
 
				+
			
 
				+    Args:
			
 
				+        pretrained (bool): If True, returns a model pre-trained on ImageNet
			
 
				+    """
			
 
				+    model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
			
 
				+    if pretrained:
			
 
				+        model.load_state_dict(model_zoo.load_url(model_urls['resnet152']), strict=False)
			
 
				+    return model
			
 
				+
			
 
				+
			
 
				+if __name__=='__main__':
			
 
				+    import time
			
 
				+    from thop import profile
			
 
				+
			
 
				+    # Build backbone
			
 
				+    model, _ = build_resnet(model_name='resnet18')
			
 
				+
			
 
				+    # Inference
			
 
				+    x = torch.randn(1, 3, 640, 640)
			
 
				+    t0 = time.time()
			
 
				+    output = model(x)
			
 
				+    t1 = time.time()
			
 
				+    print('Time: ', t1 - t0)
			
 
				+    print(output.shape)
			
 
				+
			
 
				+    print('==============================')
			
 
				+    flops, params = profile(model, inputs=(x, ), verbose=False)
			
 
				+    print('==============================')
			
 
				+    print('GFLOPs : {:.2f}'.format(flops / 1e9 * 2))
			
 
				+    print('Params : {:.2f} M'.format(params / 1e6))    
			
--- a/yolo/models/yolof/yolof.py
+++ b/yolo/models/yolof/yolof.py
--- a/yolo/models/yolof/yolof_backbone.py
+++ b/yolo/models/yolof/yolof_backbone.py
--- a/yolo/models/yolof/yolof_decoder.py
+++ b/yolo/models/yolof/yolof_decoder.py
@@ -0,0 +1,185 @@
 
				+import math
			
 
				+import torch
			
 
				+import torch.nn as nn
			
 
				+
			
 
				+from .modules import BasicConv
			
 
				+
			
 
				+
			
 
				+class YolofHead(nn.Module):
			
 
				+    def __init__(self, cfg, in_dim, out_dim,):
			
 
				+        super().__init__()
			
 
				+        self.fmp_size = None
			
 
				+        self.ctr_clamp = cfg.center_clamp
			
 
				+        self.DEFAULT_EXP_CLAMP = math.log(1e8)
			
 
				+        self.DEFAULT_SCALE_CLAMP = math.log(1000.0 / 16)
			
 
				+        # ------------------ Basic parameters -------------------
			
 
				+        self.cfg = cfg
			
 
				+        self.in_dim = in_dim
			
 
				+        self.stride       = cfg.out_stride
			
 
				+        self.num_classes  = cfg.num_classes
			
 
				+        self.num_cls_head = cfg.num_cls_head
			
 
				+        self.num_reg_head = cfg.num_reg_head
			
 
				+        self.act_type     = cfg.head_act
			
 
				+        self.norm_type    = cfg.head_norm
			
 
				+        # Anchor config
			
 
				+        self.anchor_size = torch.as_tensor(cfg.anchor_size)
			
 
				+        self.num_anchors = len(cfg.anchor_size)
			
 
				+
			
 
				+        # ------------------ Network parameters -------------------
			
 
				+        ## cls head
			
 
				+        cls_heads = []
			
 
				+        self.cls_head_dim = out_dim
			
 
				+        for i in range(self.num_cls_head):
			
 
				+            if i == 0:
			
 
				+                cls_heads.append(
			
 
				+                    BasicConv(in_dim, self.cls_head_dim,
			
 
				+                              kernel_size=3, padding=1, stride=1, 
			
 
				+                              act_type=self.act_type, norm_type=self.norm_type)
			
 
				+                              )
			
 
				+            else:
			
 
				+                cls_heads.append(
			
 
				+                    BasicConv(self.cls_head_dim, self.cls_head_dim,
			
 
				+                              kernel_size=3, padding=1, stride=1, 
			
 
				+                              act_type=self.act_type, norm_type=self.norm_type)
			
 
				+                              )
			
 
				+        ## reg head
			
 
				+        reg_heads = []
			
 
				+        self.reg_head_dim = out_dim
			
 
				+        for i in range(self.num_reg_head):
			
 
				+            if i == 0:
			
 
				+                reg_heads.append(
			
 
				+                    BasicConv(in_dim, self.reg_head_dim,
			
 
				+                              kernel_size=3, padding=1, stride=1, 
			
 
				+                              act_type=self.act_type, norm_type=self.norm_type)
			
 
				+                              )
			
 
				+            else:
			
 
				+                reg_heads.append(
			
 
				+                    BasicConv(self.reg_head_dim, self.reg_head_dim,
			
 
				+                              kernel_size=3, padding=1, stride=1, 
			
 
				+                              act_type=self.act_type, norm_type=self.norm_type)
			
 
				+                              )
			
 
				+        self.cls_heads = nn.Sequential(*cls_heads)
			
 
				+        self.reg_heads = nn.Sequential(*reg_heads)
			
 
				+
			
 
				+        # pred layer
			
 
				+        self.obj_pred = nn.Conv2d(self.reg_head_dim, 1 * self.num_anchors, kernel_size=3, padding=1)
			
 
				+        self.cls_pred = nn.Conv2d(self.cls_head_dim, self.num_classes * self.num_anchors, kernel_size=3, padding=1)
			
 
				+        self.reg_pred = nn.Conv2d(self.reg_head_dim, 4 * self.num_anchors, kernel_size=3, padding=1)
			
 
				+
			
 
				+        # init bias
			
 
				+        self._init_pred_layers()
			
 
				+
			
 
				+    def _init_pred_layers(self):  
			
 
				+        # init cls pred
			
 
				+        nn.init.normal_(self.cls_pred.weight, mean=0, std=0.01)
			
 
				+        init_prob = 0.01
			
 
				+        bias_value = -torch.log(torch.tensor((1. - init_prob) / init_prob))
			
 
				+        nn.init.constant_(self.cls_pred.bias, bias_value)
			
 
				+        # init reg pred
			
 
				+        nn.init.normal_(self.reg_pred.weight, mean=0, std=0.01)
			
 
				+        nn.init.constant_(self.reg_pred.bias, 0.0)
			
 
				+        # init obj pred
			
 
				+        nn.init.normal_(self.obj_pred.weight, mean=0, std=0.01)
			
 
				+        nn.init.constant_(self.obj_pred.bias, 0.0)
			
 
				+
			
 
				+    def get_anchors(self, fmp_size):
			
 
				+        """fmp_size: list -> [H, W] \n
			
 
				+           stride: int -> output stride
			
 
				+        """
			
 
				+        # check anchor boxes
			
 
				+        if self.fmp_size is not None and self.fmp_size == fmp_size:
			
 
				+            return self.anchor_boxes
			
 
				+        else:
			
 
				+            # generate grid cells
			
 
				+            fmp_h, fmp_w = fmp_size
			
 
				+            anchor_y, anchor_x = torch.meshgrid([torch.arange(fmp_h), torch.arange(fmp_w)])
			
 
				+            # [H, W, 2] -> [HW, 2]
			
 
				+            anchor_xy = torch.stack([anchor_x, anchor_y], dim=-1).float().view(-1, 2) + 0.5
			
 
				+            # [HW, 2] -> [HW, 1, 2] -> [HW, KA, 2] 
			
 
				+            anchor_xy = anchor_xy[:, None, :].repeat(1, self.num_anchors, 1)
			
 
				+            anchor_xy *= self.stride
			
 
				+
			
 
				+            # [KA, 2] -> [1, KA, 2] -> [HW, KA, 2]
			
 
				+            anchor_wh = self.anchor_size[None, :, :].repeat(fmp_h*fmp_w, 1, 1)
			
 
				+
			
 
				+            # [HW, KA, 4] -> [M, 4]
			
 
				+            anchor_boxes = torch.cat([anchor_xy, anchor_wh], dim=-1)
			
 
				+            anchor_boxes = anchor_boxes.view(-1, 4)
			
 
				+
			
 
				+            self.anchor_boxes = anchor_boxes
			
 
				+            self.fmp_size = fmp_size
			
 
				+
			
 
				+            return anchor_boxes
			
 
				+        
			
 
				+    def decode_boxes(self, anchor_boxes, pred_reg):
			
 
				+        """
			
 
				+            anchor_boxes: (List[tensor]) [1, M, 4]
			
 
				+            pred_reg: (List[tensor]) [B, M, 4]
			
 
				+        """
			
 
				+        # x = x_anchor + dx * w_anchor
			
 
				+        # y = y_anchor + dy * h_anchor
			
 
				+        pred_ctr_offset = pred_reg[..., :2] * anchor_boxes[..., 2:]
			
 
				+        pred_ctr_offset = torch.clamp(pred_ctr_offset, min=-self.ctr_clamp, max=self.ctr_clamp)
			
 
				+        pred_ctr_xy = anchor_boxes[..., :2] + pred_ctr_offset
			
 
				+
			
 
				+        # w = w_anchor * exp(tw)
			
 
				+        # h = h_anchor * exp(th)
			
 
				+        pred_dwdh = pred_reg[..., 2:]
			
 
				+        pred_dwdh = torch.clamp(pred_dwdh, max=self.DEFAULT_SCALE_CLAMP)
			
 
				+        pred_wh = anchor_boxes[..., 2:] * pred_dwdh.exp()
			
 
				+
			
 
				+        # convert [x, y, w, h] -> [x1, y1, x2, y2]
			
 
				+        pred_x1y1 = pred_ctr_xy - 0.5 * pred_wh
			
 
				+        pred_x2y2 = pred_ctr_xy + 0.5 * pred_wh
			
 
				+        pred_box = torch.cat([pred_x1y1, pred_x2y2], dim=-1)
			
 
				+
			
 
				+        return pred_box
			
 
				+
			
 
				+    def forward(self, x, mask=None):
			
 
				+        # ------------------- Decoupled head -------------------
			
 
				+        cls_feats = self.cls_heads(x)
			
 
				+        reg_feats = self.reg_heads(x)
			
 
				+
			
 
				+        # ------------------- Generate anchor box -------------------
			
 
				+        fmp_size = cls_feats.shape[2:]
			
 
				+        anchor_boxes = self.get_anchors(fmp_size)   # [M, 4]
			
 
				+        anchor_boxes = anchor_boxes.to(cls_feats.device)
			
 
				+
			
 
				+        # ------------------- Predict -------------------
			
 
				+        obj_pred = self.obj_pred(reg_feats)
			
 
				+        cls_pred = self.cls_pred(cls_feats)
			
 
				+        reg_pred = self.reg_pred(reg_feats)
			
 
				+
			
 
				+        # ------------------- Precoess preds -------------------
			
 
				+        ## implicit objectness
			
 
				+        B, _, H, W = obj_pred.size()
			
 
				+        obj_pred = obj_pred.view(B, -1, 1, H, W)
			
 
				+        cls_pred = cls_pred.view(B, -1, self.num_classes, H, W)
			
 
				+        normalized_cls_pred = cls_pred + obj_pred - torch.log(
			
 
				+                1. + 
			
 
				+                torch.clamp(cls_pred, max=self.DEFAULT_EXP_CLAMP).exp() + 
			
 
				+                torch.clamp(obj_pred, max=self.DEFAULT_EXP_CLAMP).exp())
			
 
				+        # [B, KA, C, H, W] -> [B, H, W, KA, C] -> [B, M, C], M = HxWxKA
			
 
				+        normalized_cls_pred = normalized_cls_pred.permute(0, 3, 4, 1, 2).contiguous()
			
 
				+        normalized_cls_pred = normalized_cls_pred.view(B, -1, self.num_classes)
			
 
				+        # [B, KA*4, H, W] -> [B, KA, 4, H, W] -> [B, H, W, KA, 4] -> [B, M, 4]
			
 
				+        reg_pred = reg_pred.view(B, -1, 4, H, W).permute(0, 3, 4, 1, 2).contiguous()
			
 
				+        reg_pred = reg_pred.view(B, -1, 4)
			
 
				+        ## Decode bbox
			
 
				+        box_pred = self.decode_boxes(anchor_boxes[None], reg_pred)  # [B, M, 4]
			
 
				+        ## adjust mask
			
 
				+        if mask is not None:
			
 
				+            # [B, H, W]
			
 
				+            mask = torch.nn.functional.interpolate(mask[None].float(), size=fmp_size).bool()[0]
			
 
				+            # [B, H, W] -> [B, HW]
			
 
				+            mask = mask.flatten(1)
			
 
				+            # [B, HW] -> [B, HW, KA] -> [BM,], M= HW x KA
			
 
				+            mask = mask[..., None].repeat(1, 1, self.num_anchors).flatten()
			
 
				+
			
 
				+        outputs = {"pred_cls": normalized_cls_pred,
			
 
				+                   "pred_reg": reg_pred,
			
 
				+                   "pred_box": box_pred,
			
 
				+                   "anchors": anchor_boxes,
			
 
				+                   "mask": mask}
			
 
				+
			
 
				+        return outputs 
			
--- a/yolo/models/yolof/yolof_encoder.py
+++ b/yolo/models/yolof/yolof_encoder.py
@@ -0,0 +1,72 @@
 
				+import torch.nn as nn
			
 
				+from utils import weight_init
			
 
				+
			
 
				+from .modules import BasicConv
			
 
				+
			
 
				+
			
 
				+# BottleNeck
			
 
				+class Bottleneck(nn.Module):
			
 
				+    def __init__(self, in_dim, dilation, expand_ratio, act_type='relu', norm_type='BN'):
			
 
				+        super(Bottleneck, self).__init__()
			
 
				+        # ------------------ Basic parameters -------------------
			
 
				+        self.in_dim = in_dim
			
 
				+        self.dilation = dilation
			
 
				+        self.expand_ratio = expand_ratio
			
 
				+        inter_dim = round(in_dim * expand_ratio)
			
 
				+        # ------------------ Network parameters -------------------
			
 
				+        self.branch = nn.Sequential(
			
 
				+            BasicConv(in_dim, inter_dim, kernel_size=1, act_type=act_type, norm_type=norm_type),
			
 
				+            BasicConv(inter_dim, inter_dim, kernel_size=3, padding=dilation, dilation=dilation, act_type=act_type, norm_type=norm_type),
			
 
				+            BasicConv(inter_dim, in_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
			
 
				+        )
			
 
				+
			
 
				+    def forward(self, x):
			
 
				+        return x + self.branch(x)
			
 
				+
			
 
				+# Dilated Encoder
			
 
				+class DilatedEncoder(nn.Module):
			
 
				+    def __init__(self, cfg, in_dim, out_dim):
			
 
				+        super(DilatedEncoder, self).__init__()
			
 
				+        # ------------------ Basic parameters -------------------
			
 
				+        self.in_dim = in_dim
			
 
				+        self.out_dim = out_dim
			
 
				+        self.expand_ratio = cfg.neck_expand_ratio
			
 
				+        self.dilations    = cfg.neck_dilations
			
 
				+        self.act_type     = cfg.neck_act
			
 
				+        self.norm_type    = cfg.neck_norm
			
 
				+        # ------------------ Network parameters -------------------
			
 
				+        ## proj layer
			
 
				+        self.projector = nn.Sequential(
			
 
				+            BasicConv(in_dim, out_dim, kernel_size=1, act_type=None, norm_type=self.norm_type),
			
 
				+            BasicConv(out_dim, out_dim, kernel_size=3, padding=1, act_type=None, norm_type=self.norm_type)
			
 
				+        )
			
 
				+        ## encoder layers
			
 
				+        self.encoders = nn.Sequential(
			
 
				+            *[Bottleneck(out_dim, d, self.expand_ratio, self.act_type, self.norm_type) for d in self.dilations])
			
 
				+
			
 
				+        self._init_weight()
			
 
				+
			
 
				+    def _init_weight(self):
			
 
				+        for m in self.projector:
			
 
				+            if isinstance(m, nn.Conv2d):
			
 
				+                weight_init.c2_xavier_fill(m)
			
 
				+                weight_init.c2_xavier_fill(m)
			
 
				+            if isinstance(m, (nn.GroupNorm, nn.BatchNorm2d, nn.SyncBatchNorm)):
			
 
				+                nn.init.constant_(m.weight, 1)
			
 
				+                nn.init.constant_(m.bias, 0)
			
 
				+
			
 
				+        for m in self.encoders.modules():
			
 
				+            if isinstance(m, nn.Conv2d):
			
 
				+                nn.init.normal_(m.weight, mean=0, std=0.01)
			
 
				+                if hasattr(m, 'bias') and m.bias is not None:
			
 
				+                    nn.init.constant_(m.bias, 0)
			
 
				+
			
 
				+            if isinstance(m, (nn.GroupNorm, nn.BatchNorm2d, nn.SyncBatchNorm)):
			
 
				+                nn.init.constant_(m.weight, 1)
			
 
				+                nn.init.constant_(m.bias, 0)
			
 
				+
			
 
				+    def forward(self, x):
			
 
				+        x = self.projector(x)
			
 
				+        x = self.encoders(x)
			
 
				+
			
 
				+        return x
			
--- a/yolo/models/yolov10/README.md
+++ b/yolo/models/yolov10/README.md
@@ -0,0 +1,56 @@
 
				+# YOLOv7:
			
 
				+
			
 
				+|    Model    |   Backbone    | Batch | Scale | AP<sup>val<br>0.5:0.95 | AP<sup>val<br>0.5 | FLOPs<br><sup>(G) | Params<br><sup>(M) | Weight |
			
 
				+|-------------|---------------|-------|-------|------------------------|-------------------|-------------------|--------------------|--------|
			
 
				+| YOLOv7-Tiny | ELANNet-Tiny  | 8xb16 |  640  |         39.5           |       58.5        |   22.6            |   7.9              | [ckpt](https://github.com/yjh0410/RT-ODLab/releases/download/yolo_tutorial_ckpt/yolov7_tiny_coco.pth) |
			
 
				+| YOLOv7      | ELANNet-Large | 8xb16 |  640  |         49.5           |       68.8        |   144.6           |   44.0             | [ckpt](https://github.com/yjh0410/RT-ODLab/releases/download/yolo_tutorial_ckpt/yolov7_coco.pth) |
			
 
				+| YOLOv7-X    | ELANNet-Huge  |       |  640  |                        |                   |                   |                    |  |
			
 
				+
			
 
				+- For training, we train `YOLOv7` and `YOLOv7-Tiny` with 300 epochs on 8 GPUs.
			
 
				+- For data augmentation, we use the [YOLOX-style](https://github.com/Megvii-BaseDetection/YOLOX) augmentation including the large scale jitter (LSJ), Mosaic augmentation and Mixup augmentation.
			
 
				+- For optimizer, we use `AdamW` with weight decay 0.05 and per image learning rate 0.001 / 64.
			
 
				+- For learning rate scheduler, we use Cosine decay scheduler.
			
 
				+- For YOLOv7's structure, we replace the coupled head with the YOLOX-style decoupled head.
			
 
				+- I think YOLOv7 uses too many training tricks, such as `anchor box`, `AuxiliaryHead`, `RepConv`, `Mosaic9x` and so on, making the picture of YOLO too complicated, which is against the development concept of the YOLO series. Otherwise, why don't we use the DETR series? It's nothing more than doing some acceleration optimization on DETR. Therefore, I was faithful to my own technical aesthetics and realized a cleaner and simpler YOLOv7, but without the blessing of so many tricks, I did not reproduce all the performance, which is a pity.
			
 
				+- I have no more GPUs to train my `YOLOv7-X`.
			
 
				+
			
 
				+## Train YOLOv7
			
 
				+### Single GPU
			
 
				+Taking training YOLOv7-Tiny on COCO as the example,
			
 
				+```Shell
			
 
				+python train.py --cuda -d coco --root path/to/coco -m yolov7_tiny -bs 16 -size 640 --wp_epoch 3 --max_epoch 300 --eval_epoch 10 --no_aug_epoch 20 --ema --fp16 --multi_scale 
			
 
				+```
			
 
				+
			
 
				+### Multi GPU
			
 
				+Taking training YOLOv7-Tiny on COCO as the example,
			
 
				+```Shell
			
 
				+python -m torch.distributed.run --nproc_per_node=8 train.py --cuda -dist -d coco --root /data/datasets/ -m yolov7_tiny -bs 128 -size 640 --wp_epoch 3 --max_epoch 300  --eval_epoch 10 --no_aug_epoch 20 --ema --fp16 --sybn --multi_scale --save_folder weights/ 
			
 
				+```
			
 
				+
			
 
				+## Test YOLOv7
			
 
				+Taking testing YOLOv7-Tiny on COCO-val as the example,
			
 
				+```Shell
			
 
				+python test.py --cuda -d coco --root path/to/coco -m yolov7_tiny --weight path/to/yolov7_tiny.pth -size 640 -vt 0.4 --show 
			
 
				+```
			
 
				+
			
 
				+## Evaluate YOLOv7
			
 
				+Taking evaluating YOLOv7-Tiny on COCO-val as the example,
			
 
				+```Shell
			
 
				+python eval.py --cuda -d coco-val --root path/to/coco -m yolov7_tiny --weight path/to/yolov7_tiny.pth 
			
 
				+```
			
 
				+
			
 
				+## Demo
			
 
				+### Detect with Image
			
 
				+```Shell
			
 
				+python demo.py --mode image --path_to_img path/to/image_dirs/ --cuda -m yolov7_tiny --weight path/to/weight -size 640 -vt 0.4 --show
			
 
				+```
			
 
				+
			
 
				+### Detect with Video
			
 
				+```Shell
			
 
				+python demo.py --mode video --path_to_vid path/to/video --cuda -m yolov7_tiny --weight path/to/weight -size 640 -vt 0.4 --show --gif
			
 
				+```
			
 
				+
			
 
				+### Detect with Camera
			
 
				+```Shell
			
 
				+python demo.py --mode camera --cuda -m yolov7_tiny --weight path/to/weight -size 640 -vt 0.4 --show --gif
			
 
				+```
			
--- a/yolo/models/yolov10/build.py
+++ b/yolo/models/yolov10/build.py
@@ -0,0 +1,66 @@
 
				+#!/usr/bin/env python3
			
 
				+# -*- coding:utf-8 -*-
			
 
				+
			
 
				+import torch
			
 
				+import torch.nn as nn
			
 
				+
			
 
				+from .loss import build_criterion
			
 
				+from .yolov10 import YOLOv7
			
 
				+
			
 
				+
			
 
				+# build object detector
			
 
				+def build_yolov7(args, cfg, device, num_classes=80, trainable=False, deploy=False):
			
 
				+    print('==============================')
			
 
				+    print('Build {} ...'.format(args.model.upper()))
			
 
				+    
			
 
				+    print('==============================')
			
 
				+    print('Model Configuration: \n', cfg)
			
 
				+    
			
 
				+    # -------------- Build YOLO --------------
			
 
				+    model = YOLOv7(cfg                = cfg,
			
 
				+                   device             = device, 
			
 
				+                   num_classes        = num_classes,
			
 
				+                   trainable          = trainable,
			
 
				+                   conf_thresh        = args.conf_thresh,
			
 
				+                   nms_thresh         = args.nms_thresh,
			
 
				+                   topk               = args.topk,
			
 
				+                   deploy             = deploy,
			
 
				+                   no_multi_labels    = args.no_multi_labels,
			
 
				+                   nms_class_agnostic = args.nms_class_agnostic
			
 
				+                   )
			
 
				+
			
 
				+    # -------------- Initialize YOLO --------------
			
 
				+    for m in model.modules():
			
 
				+        if isinstance(m, nn.BatchNorm2d):
			
 
				+            m.eps = 1e-3
			
 
				+            m.momentum = 0.03    
			
 
				+    # Init bias
			
 
				+    init_prob = 0.01
			
 
				+    bias_value = -torch.log(torch.tensor((1. - init_prob) / init_prob))
			
 
				+    # obj pred
			
 
				+    for obj_pred in model.obj_preds:
			
 
				+        b = obj_pred.bias.view(1, -1)
			
 
				+        b.data.fill_(bias_value.item())
			
 
				+        obj_pred.bias = torch.nn.Parameter(b.view(-1), requires_grad=True)
			
 
				+    # cls pred
			
 
				+    for cls_pred in model.cls_preds:
			
 
				+        b = cls_pred.bias.view(1, -1)
			
 
				+        b.data.fill_(bias_value.item())
			
 
				+        cls_pred.bias = torch.nn.Parameter(b.view(-1), requires_grad=True)
			
 
				+    # reg pred
			
 
				+    for reg_pred in model.reg_preds:
			
 
				+        b = reg_pred.bias.view(-1, )
			
 
				+        b.data.fill_(1.0)
			
 
				+        reg_pred.bias = torch.nn.Parameter(b.view(-1), requires_grad=True)
			
 
				+        w = reg_pred.weight
			
 
				+        w.data.fill_(0.)
			
 
				+        reg_pred.weight = torch.nn.Parameter(w, requires_grad=True)
			
 
				+
			
 
				+
			
 
				+    # -------------- Build criterion --------------
			
 
				+    criterion = None
			
 
				+    if trainable:
			
 
				+        # build criterion for training
			
 
				+        criterion = build_criterion(args, cfg, device, num_classes)
			
 
				+
			
 
				+    return model, criterion
			
--- a/yolo/models/yolov10/loss.py
+++ b/yolo/models/yolov10/loss.py
@@ -0,0 +1,212 @@
 
				+import torch
			
 
				+import torch.nn.functional as F
			
 
				+from .matcher import SimOTA
			
 
				+from utils.box_ops import get_ious
			
 
				+from utils.distributed_utils import get_world_size, is_dist_avail_and_initialized
			
 
				+
			
 
				+
			
 
				+
			
 
				+class Criterion(object):
			
 
				+    def __init__(self,
			
 
				+                 args,
			
 
				+                 cfg, 
			
 
				+                 device, 
			
 
				+                 num_classes=80):
			
 
				+        self.args = args
			
 
				+        self.cfg = cfg
			
 
				+        self.device = device
			
 
				+        self.num_classes = num_classes
			
 
				+        self.max_epoch = args.max_epoch
			
 
				+        self.no_aug_epoch = args.no_aug_epoch
			
 
				+        self.aux_bbox_loss = False
			
 
				+        # loss weight
			
 
				+        self.loss_obj_weight = cfg['loss_obj_weight']
			
 
				+        self.loss_cls_weight = cfg['loss_cls_weight']
			
 
				+        self.loss_box_weight = cfg['loss_box_weight']
			
 
				+        # matcher
			
 
				+        matcher_config = cfg['matcher']
			
 
				+        self.matcher = SimOTA(
			
 
				+            num_classes=num_classes,
			
 
				+            center_sampling_radius=matcher_config['center_sampling_radius'],
			
 
				+            topk_candidate=matcher_config['topk_candicate']
			
 
				+            )
			
 
				+
			
 
				+
			
 
				+    def loss_objectness(self, pred_obj, gt_obj):
			
 
				+        loss_obj = F.binary_cross_entropy_with_logits(pred_obj, gt_obj, reduction='none')
			
 
				+
			
 
				+        return loss_obj
			
 
				+    
			
 
				+
			
 
				+    def loss_classes(self, pred_cls, gt_label):
			
 
				+        loss_cls = F.binary_cross_entropy_with_logits(pred_cls, gt_label, reduction='none')
			
 
				+
			
 
				+        return loss_cls
			
 
				+
			
 
				+
			
 
				+    def loss_bboxes(self, pred_box, gt_box):
			
 
				+        # regression loss
			
 
				+        ious = get_ious(pred_box, gt_box, "xyxy", 'giou')
			
 
				+        loss_box = 1.0 - ious
			
 
				+
			
 
				+        return loss_box
			
 
				+
			
 
				+
			
 
				+    def loss_bboxes_aux(self, pred_reg, gt_box, anchors, stride_tensors):
			
 
				+        # xyxy -> cxcy&bwbh
			
 
				+        gt_cxcy = (gt_box[..., :2] + gt_box[..., 2:]) * 0.5
			
 
				+        gt_bwbh = gt_box[..., 2:] - gt_box[..., :2]
			
 
				+        # encode gt box
			
 
				+        gt_cxcy_encode = (gt_cxcy - anchors) / stride_tensors
			
 
				+        gt_bwbh_encode = torch.log(gt_bwbh / stride_tensors)
			
 
				+        gt_box_encode = torch.cat([gt_cxcy_encode, gt_bwbh_encode], dim=-1)
			
 
				+        # l1 loss
			
 
				+        loss_box_aux = F.l1_loss(pred_reg, gt_box_encode, reduction='none')
			
 
				+
			
 
				+        return loss_box_aux
			
 
				+
			
 
				+
			
 
				+    def __call__(self, outputs, targets, epoch=0):        
			
 
				+        """
			
 
				+            outputs['pred_obj']: List(Tensor) [B, M, 1]
			
 
				+            outputs['pred_cls']: List(Tensor) [B, M, C]
			
 
				+            outputs['pred_box']: List(Tensor) [B, M, 4]
			
 
				+            outputs['pred_box']: List(Tensor) [B, M, 4]
			
 
				+            outputs['strides']: List(Int) [8, 16, 32] output stride
			
 
				+            targets: (List) [dict{'boxes': [...], 
			
 
				+                                 'labels': [...], 
			
 
				+                                 'orig_size': ...}, ...]
			
 
				+        """
			
 
				+        bs = outputs['pred_cls'][0].shape[0]
			
 
				+        device = outputs['pred_cls'][0].device
			
 
				+        fpn_strides = outputs['strides']
			
 
				+        anchors = outputs['anchors']
			
 
				+        # preds: [B, M, C]
			
 
				+        obj_preds = torch.cat(outputs['pred_obj'], dim=1)
			
 
				+        cls_preds = torch.cat(outputs['pred_cls'], dim=1)
			
 
				+        box_preds = torch.cat(outputs['pred_box'], dim=1)
			
 
				+
			
 
				+        # label assignment
			
 
				+        cls_targets = []
			
 
				+        box_targets = []
			
 
				+        obj_targets = []
			
 
				+        fg_masks = []
			
 
				+
			
 
				+        for batch_idx in range(bs):
			
 
				+            tgt_labels = targets[batch_idx]["labels"].to(device)
			
 
				+            tgt_bboxes = targets[batch_idx]["boxes"].to(device)
			
 
				+
			
 
				+            # check target
			
 
				+            if len(tgt_labels) == 0 or tgt_bboxes.max().item() == 0.:
			
 
				+                num_anchors = sum([ab.shape[0] for ab in anchors])
			
 
				+                # There is no valid gt
			
 
				+                cls_target = obj_preds.new_zeros((0, self.num_classes))
			
 
				+                box_target = obj_preds.new_zeros((0, 4))
			
 
				+                obj_target = obj_preds.new_zeros((num_anchors, 1))
			
 
				+                fg_mask = obj_preds.new_zeros(num_anchors).bool()
			
 
				+            else:
			
 
				+                (
			
 
				+                    fg_mask,
			
 
				+                    assigned_labels,
			
 
				+                    assigned_ious,
			
 
				+                    assigned_indexs
			
 
				+                ) = self.matcher(
			
 
				+                    fpn_strides = fpn_strides,
			
 
				+                    anchors = anchors,
			
 
				+                    pred_obj = obj_preds[batch_idx],
			
 
				+                    pred_cls = cls_preds[batch_idx], 
			
 
				+                    pred_box = box_preds[batch_idx],
			
 
				+                    tgt_labels = tgt_labels,
			
 
				+                    tgt_bboxes = tgt_bboxes
			
 
				+                    )
			
 
				+
			
 
				+                obj_target = fg_mask.unsqueeze(-1)
			
 
				+                cls_target = F.one_hot(assigned_labels.long(), self.num_classes)
			
 
				+                cls_target = cls_target * assigned_ious.unsqueeze(-1)
			
 
				+                box_target = tgt_bboxes[assigned_indexs]
			
 
				+
			
 
				+            cls_targets.append(cls_target)
			
 
				+            box_targets.append(box_target)
			
 
				+            obj_targets.append(obj_target)
			
 
				+            fg_masks.append(fg_mask)
			
 
				+
			
 
				+        cls_targets = torch.cat(cls_targets, 0)
			
 
				+        box_targets = torch.cat(box_targets, 0)
			
 
				+        obj_targets = torch.cat(obj_targets, 0)
			
 
				+        fg_masks = torch.cat(fg_masks, 0)
			
 
				+        num_fgs = fg_masks.sum()
			
 
				+
			
 
				+        if is_dist_avail_and_initialized():
			
 
				+            torch.distributed.all_reduce(num_fgs)
			
 
				+        num_fgs = (num_fgs / get_world_size()).clamp(1.0)
			
 
				+
			
 
				+        # ------------------ Objecntness loss ------------------
			
 
				+        loss_obj = self.loss_objectness(obj_preds.view(-1, 1), obj_targets.float())
			
 
				+        loss_obj = loss_obj.sum() / num_fgs
			
 
				+        
			
 
				+        # ------------------ Classification loss ------------------
			
 
				+        cls_preds_pos = cls_preds.view(-1, self.num_classes)[fg_masks]
			
 
				+        loss_cls = self.loss_classes(cls_preds_pos, cls_targets)
			
 
				+        loss_cls = loss_cls.sum() / num_fgs
			
 
				+
			
 
				+        # ------------------ Regression loss ------------------
			
 
				+        box_preds_pos = box_preds.view(-1, 4)[fg_masks]
			
 
				+        loss_box = self.loss_bboxes(box_preds_pos, box_targets)
			
 
				+        loss_box = loss_box.sum() / num_fgs
			
 
				+
			
 
				+        # total loss
			
 
				+        losses = self.loss_obj_weight * loss_obj + \
			
 
				+                 self.loss_cls_weight * loss_cls + \
			
 
				+                 self.loss_box_weight * loss_box
			
 
				+
			
 
				+        # ------------------ Aux regression loss ------------------
			
 
				+        loss_box_aux = None
			
 
				+        if epoch >= (self.max_epoch - self.no_aug_epoch - 1):
			
 
				+            ## reg_preds
			
 
				+            reg_preds = torch.cat(outputs['pred_reg'], dim=1)
			
 
				+            reg_preds_pos = reg_preds.view(-1, 4)[fg_masks]
			
 
				+            ## anchor tensors
			
 
				+            anchors_tensors = torch.cat(outputs['anchors'], dim=0)[None].repeat(bs, 1, 1)
			
 
				+            anchors_tensors_pos = anchors_tensors.view(-1, 2)[fg_masks]
			
 
				+            ## stride tensors
			
 
				+            stride_tensors = torch.cat(outputs['stride_tensors'], dim=0)[None].repeat(bs, 1, 1)
			
 
				+            stride_tensors_pos = stride_tensors.view(-1, 1)[fg_masks]
			
 
				+            ## aux loss
			
 
				+            loss_box_aux = self.loss_bboxes_aux(reg_preds_pos, box_targets, anchors_tensors_pos, stride_tensors_pos)
			
 
				+            loss_box_aux = loss_box_aux.sum() / num_fgs
			
 
				+
			
 
				+            losses += loss_box_aux
			
 
				+
			
 
				+        # Loss dict
			
 
				+        if loss_box_aux is None:
			
 
				+            loss_dict = dict(
			
 
				+                    loss_obj = loss_obj,
			
 
				+                    loss_cls = loss_cls,
			
 
				+                    loss_box = loss_box,
			
 
				+                    losses = losses
			
 
				+            )
			
 
				+        else:
			
 
				+            loss_dict = dict(
			
 
				+                    loss_obj = loss_obj,
			
 
				+                    loss_cls = loss_cls,
			
 
				+                    loss_box = loss_box,
			
 
				+                    loss_box_aux = loss_box_aux,
			
 
				+                    losses = losses
			
 
				+                    )
			
 
				+
			
 
				+        return loss_dict
			
 
				+    
			
 
				+
			
 
				+def build_criterion(args, cfg, device, num_classes):
			
 
				+    criterion = Criterion(
			
 
				+        args=args,
			
 
				+        cfg=cfg,
			
 
				+        device=device,
			
 
				+        num_classes=num_classes
			
 
				+        )
			
 
				+
			
 
				+    return criterion
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    pass
			
--- a/yolo/models/yolov10/matcher.py
+++ b/yolo/models/yolov10/matcher.py
@@ -0,0 +1,187 @@
 
				+# ---------------------------------------------------------------------
			
 
				+# Copyright (c) Megvii Inc. All rights reserved.
			
 
				+# ---------------------------------------------------------------------
			
 
				+
			
 
				+
			
 
				+import torch
			
 
				+import torch.nn.functional as F
			
 
				+from utils.box_ops import *
			
 
				+
			
 
				+
			
 
				+class SimOTA(object):
			
 
				+    """
			
 
				+        This code referenced to https://github.com/Megvii-BaseDetection/YOLOX/blob/main/yolox/models/yolo_head.py
			
 
				+    """
			
 
				+    def __init__(self, num_classes, center_sampling_radius, topk_candidate ):
			
 
				+        self.num_classes = num_classes
			
 
				+        self.center_sampling_radius = center_sampling_radius
			
 
				+        self.topk_candidate = topk_candidate
			
 
				+
			
 
				+
			
 
				+    @torch.no_grad()
			
 
				+    def __call__(self, 
			
 
				+                 fpn_strides, 
			
 
				+                 anchors, 
			
 
				+                 pred_obj, 
			
 
				+                 pred_cls, 
			
 
				+                 pred_box, 
			
 
				+                 tgt_labels,
			
 
				+                 tgt_bboxes):
			
 
				+        # [M,]
			
 
				+        strides_tensor = torch.cat([torch.ones_like(anchor_i[:, 0]) * stride_i
			
 
				+                                for stride_i, anchor_i in zip(fpn_strides, anchors)], dim=-1)
			
 
				+        # List[F, M, 2] -> [M, 2]
			
 
				+        anchors = torch.cat(anchors, dim=0)
			
 
				+        num_anchor = anchors.shape[0]        
			
 
				+        num_gt = len(tgt_labels)
			
 
				+
			
 
				+        # ----------------------- Find inside points -----------------------
			
 
				+        fg_mask, is_in_boxes_and_center = self.get_in_boxes_info(
			
 
				+            tgt_bboxes, anchors, strides_tensor, num_anchor, num_gt)
			
 
				+        obj_preds = pred_obj[fg_mask].float()   # [Mp, 1]
			
 
				+        cls_preds = pred_cls[fg_mask].float()   # [Mp, C]
			
 
				+        box_preds = pred_box[fg_mask].float()   # [Mp, 4]
			
 
				+
			
 
				+        # ----------------------- Reg cost -----------------------
			
 
				+        pair_wise_ious, _ = box_iou(tgt_bboxes, box_preds)      # [N, Mp]
			
 
				+        reg_cost = -torch.log(pair_wise_ious + 1e-8)            # [N, Mp]
			
 
				+
			
 
				+        # ----------------------- Cls cost -----------------------
			
 
				+        with torch.cuda.amp.autocast(enabled=False):
			
 
				+            # [Mp, C]
			
 
				+            score_preds = torch.sqrt(obj_preds.sigmoid_()* cls_preds.sigmoid_())
			
 
				+            # [N, Mp, C]
			
 
				+            score_preds = score_preds.unsqueeze(0).repeat(num_gt, 1, 1)
			
 
				+            # prepare cls_target
			
 
				+            cls_targets = F.one_hot(tgt_labels.long(), self.num_classes).float()
			
 
				+            cls_targets = cls_targets.unsqueeze(1).repeat(1, score_preds.size(1), 1)
			
 
				+            # [N, Mp]
			
 
				+            cls_cost = F.binary_cross_entropy(score_preds, cls_targets, reduction="none").sum(-1)
			
 
				+        del score_preds
			
 
				+
			
 
				+        #----------------------- Dynamic K-Matching -----------------------
			
 
				+        cost_matrix = (
			
 
				+            cls_cost
			
 
				+            + 3.0 * reg_cost
			
 
				+            + 100000.0 * (~is_in_boxes_and_center)
			
 
				+        ) # [N, Mp]
			
 
				+
			
 
				+        (
			
 
				+            assigned_labels,         # [num_fg,]
			
 
				+            assigned_ious,           # [num_fg,]
			
 
				+            assigned_indexs,         # [num_fg,]
			
 
				+        ) = self.dynamic_k_matching(
			
 
				+            cost_matrix,
			
 
				+            pair_wise_ious,
			
 
				+            tgt_labels,
			
 
				+            num_gt,
			
 
				+            fg_mask
			
 
				+            )
			
 
				+        del cls_cost, cost_matrix, pair_wise_ious, reg_cost
			
 
				+
			
 
				+        return fg_mask, assigned_labels, assigned_ious, assigned_indexs
			
 
				+
			
 
				+
			
 
				+    def get_in_boxes_info(
			
 
				+        self,
			
 
				+        gt_bboxes,   # [N, 4]
			
 
				+        anchors,     # [M, 2]
			
 
				+        strides,     # [M,]
			
 
				+        num_anchors, # M
			
 
				+        num_gt,      # N
			
 
				+        ):
			
 
				+        # anchor center
			
 
				+        x_centers = anchors[:, 0]
			
 
				+        y_centers = anchors[:, 1]
			
 
				+
			
 
				+        # [M,] -> [1, M] -> [N, M]
			
 
				+        x_centers = x_centers.unsqueeze(0).repeat(num_gt, 1)
			
 
				+        y_centers = y_centers.unsqueeze(0).repeat(num_gt, 1)
			
 
				+
			
 
				+        # [N,] -> [N, 1] -> [N, M]
			
 
				+        gt_bboxes_l = gt_bboxes[:, 0].unsqueeze(1).repeat(1, num_anchors) # x1
			
 
				+        gt_bboxes_t = gt_bboxes[:, 1].unsqueeze(1).repeat(1, num_anchors) # y1
			
 
				+        gt_bboxes_r = gt_bboxes[:, 2].unsqueeze(1).repeat(1, num_anchors) # x2
			
 
				+        gt_bboxes_b = gt_bboxes[:, 3].unsqueeze(1).repeat(1, num_anchors) # y2
			
 
				+
			
 
				+        b_l = x_centers - gt_bboxes_l
			
 
				+        b_r = gt_bboxes_r - x_centers
			
 
				+        b_t = y_centers - gt_bboxes_t
			
 
				+        b_b = gt_bboxes_b - y_centers
			
 
				+        bbox_deltas = torch.stack([b_l, b_t, b_r, b_b], 2)
			
 
				+
			
 
				+        is_in_boxes = bbox_deltas.min(dim=-1).values > 0.0
			
 
				+        is_in_boxes_all = is_in_boxes.sum(dim=0) > 0
			
 
				+        # in fixed center
			
 
				+        center_radius = self.center_sampling_radius
			
 
				+
			
 
				+        # [N, 2]
			
 
				+        gt_centers = (gt_bboxes[:, :2] + gt_bboxes[:, 2:]) * 0.5
			
 
				+        
			
 
				+        # [1, M]
			
 
				+        center_radius_ = center_radius * strides.unsqueeze(0)
			
 
				+
			
 
				+        gt_bboxes_l = gt_centers[:, 0].unsqueeze(1).repeat(1, num_anchors) - center_radius_ # x1
			
 
				+        gt_bboxes_t = gt_centers[:, 1].unsqueeze(1).repeat(1, num_anchors) - center_radius_ # y1
			
 
				+        gt_bboxes_r = gt_centers[:, 0].unsqueeze(1).repeat(1, num_anchors) + center_radius_ # x2
			
 
				+        gt_bboxes_b = gt_centers[:, 1].unsqueeze(1).repeat(1, num_anchors) + center_radius_ # y2
			
 
				+
			
 
				+        c_l = x_centers - gt_bboxes_l
			
 
				+        c_r = gt_bboxes_r - x_centers
			
 
				+        c_t = y_centers - gt_bboxes_t
			
 
				+        c_b = gt_bboxes_b - y_centers
			
 
				+        center_deltas = torch.stack([c_l, c_t, c_r, c_b], 2)
			
 
				+        is_in_centers = center_deltas.min(dim=-1).values > 0.0
			
 
				+        is_in_centers_all = is_in_centers.sum(dim=0) > 0
			
 
				+
			
 
				+        # in boxes and in centers
			
 
				+        is_in_boxes_anchor = is_in_boxes_all | is_in_centers_all
			
 
				+
			
 
				+        is_in_boxes_and_center = (
			
 
				+            is_in_boxes[:, is_in_boxes_anchor] & is_in_centers[:, is_in_boxes_anchor]
			
 
				+        )
			
 
				+        return is_in_boxes_anchor, is_in_boxes_and_center
			
 
				+    
			
 
				+    
			
 
				+    def dynamic_k_matching(
			
 
				+        self, 
			
 
				+        cost, 
			
 
				+        pair_wise_ious, 
			
 
				+        gt_classes, 
			
 
				+        num_gt, 
			
 
				+        fg_mask
			
 
				+        ):
			
 
				+        # Dynamic K
			
 
				+        # ---------------------------------------------------------------
			
 
				+        matching_matrix = torch.zeros_like(cost, dtype=torch.uint8)
			
 
				+
			
 
				+        ious_in_boxes_matrix = pair_wise_ious
			
 
				+        n_candidate_k = min(self.topk_candidate, ious_in_boxes_matrix.size(1))
			
 
				+        topk_ious, _ = torch.topk(ious_in_boxes_matrix, n_candidate_k, dim=1)
			
 
				+        dynamic_ks = torch.clamp(topk_ious.sum(1).int(), min=1)
			
 
				+        dynamic_ks = dynamic_ks.tolist()
			
 
				+        for gt_idx in range(num_gt):
			
 
				+            _, pos_idx = torch.topk(
			
 
				+                cost[gt_idx], k=dynamic_ks[gt_idx], largest=False
			
 
				+            )
			
 
				+            matching_matrix[gt_idx][pos_idx] = 1
			
 
				+
			
 
				+        del topk_ious, dynamic_ks, pos_idx
			
 
				+
			
 
				+        anchor_matching_gt = matching_matrix.sum(0)
			
 
				+        if (anchor_matching_gt > 1).sum() > 0:
			
 
				+            _, cost_argmin = torch.min(cost[:, anchor_matching_gt > 1], dim=0)
			
 
				+            matching_matrix[:, anchor_matching_gt > 1] *= 0
			
 
				+            matching_matrix[cost_argmin, anchor_matching_gt > 1] = 1
			
 
				+        fg_mask_inboxes = matching_matrix.sum(0) > 0
			
 
				+
			
 
				+        fg_mask[fg_mask.clone()] = fg_mask_inboxes
			
 
				+
			
 
				+        assigned_indexs = matching_matrix[:, fg_mask_inboxes].argmax(0)
			
 
				+        assigned_labels = gt_classes[assigned_indexs]
			
 
				+
			
 
				+        assigned_ious = (matching_matrix * pair_wise_ious).sum(0)[
			
 
				+            fg_mask_inboxes
			
 
				+        ]
			
 
				+        return assigned_labels, assigned_ious, assigned_indexs
			
 
				+    
			
--- a/yolo/models/yolov10/modules.py
+++ b/yolo/models/yolov10/modules.py
@@ -0,0 +1,338 @@
 
				+import numpy as np
			
 
				+import torch
			
 
				+import torch.nn as nn
			
 
				+
			
 
				+
			
 
				+# ---------------------------- 2D CNN ----------------------------
			
 
				+class SiLU(nn.Module):
			
 
				+    """export-friendly version of nn.SiLU()"""
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def forward(x):
			
 
				+        return x * torch.sigmoid(x)
			
 
				+
			
 
				+def get_conv2d(c1, c2, k, p, s, d, g, bias=False):
			
 
				+    conv = nn.Conv2d(c1, c2, k, stride=s, padding=p, dilation=d, groups=g, bias=bias)
			
 
				+
			
 
				+    return conv
			
 
				+
			
 
				+def get_activation(act_type=None):
			
 
				+    if act_type == 'relu':
			
 
				+        return nn.ReLU(inplace=True)
			
 
				+    elif act_type == 'lrelu':
			
 
				+        return nn.LeakyReLU(0.1, inplace=True)
			
 
				+    elif act_type == 'mish':
			
 
				+        return nn.Mish(inplace=True)
			
 
				+    elif act_type == 'silu':
			
 
				+        return nn.SiLU(inplace=True)
			
 
				+
			
 
				+def get_norm(norm_type, dim):
			
 
				+    if norm_type == 'BN':
			
 
				+        return nn.BatchNorm2d(dim)
			
 
				+    elif norm_type == 'GN':
			
 
				+        return nn.GroupNorm(num_groups=32, num_channels=dim)
			
 
				+
			
 
				+## Basic conv layer
			
 
				+class Conv(nn.Module):
			
 
				+    def __init__(self, 
			
 
				+                 c1,                   # in channels
			
 
				+                 c2,                   # out channels 
			
 
				+                 k=1,                  # kernel size 
			
 
				+                 p=0,                  # padding
			
 
				+                 s=1,                  # padding
			
 
				+                 d=1,                  # dilation
			
 
				+                 act_type='lrelu',     # activation
			
 
				+                 norm_type='BN',       # normalization
			
 
				+                 depthwise=False):
			
 
				+        super(Conv, self).__init__()
			
 
				+        convs = []
			
 
				+        add_bias = False if norm_type else True
			
 
				+        if depthwise:
			
 
				+            convs.append(get_conv2d(c1, c1, k=k, p=p, s=s, d=d, g=c1, bias=add_bias))
			
 
				+            # depthwise conv
			
 
				+            if norm_type:
			
 
				+                convs.append(get_norm(norm_type, c1))
			
 
				+            if act_type:
			
 
				+                convs.append(get_activation(act_type))
			
 
				+            # pointwise conv
			
 
				+            convs.append(get_conv2d(c1, c2, k=1, p=0, s=1, d=d, g=1, bias=add_bias))
			
 
				+            if norm_type:
			
 
				+                convs.append(get_norm(norm_type, c2))
			
 
				+            if act_type:
			
 
				+                convs.append(get_activation(act_type))
			
 
				+
			
 
				+        else:
			
 
				+            convs.append(get_conv2d(c1, c2, k=k, p=p, s=s, d=d, g=1, bias=add_bias))
			
 
				+            if norm_type:
			
 
				+                convs.append(get_norm(norm_type, c2))
			
 
				+            if act_type:
			
 
				+                convs.append(get_activation(act_type))
			
 
				+            
			
 
				+        self.convs = nn.Sequential(*convs)
			
 
				+
			
 
				+
			
 
				+    def forward(self, x):
			
 
				+        return self.convs(x)
			
 
				+
			
 
				+
			
 
				+# ---------------------------- YOLOv7 Modules ----------------------------
			
 
				+## ELAN-Block proposed by YOLOv7
			
 
				+class ELANBlock(nn.Module):
			
 
				+    def __init__(self, in_dim, out_dim, squeeze_ratio=0.5, branch_depth :int=2, act_type='silu', norm_type='BN', depthwise=False):
			
 
				+        super(ELANBlock, self).__init__()
			
 
				+        inter_dim = int(in_dim * squeeze_ratio)
			
 
				+        self.cv1 = Conv(in_dim, inter_dim, k=1, act_type=act_type, norm_type=norm_type)
			
 
				+        self.cv2 = Conv(in_dim, inter_dim, k=1, act_type=act_type, norm_type=norm_type)
			
 
				+        self.cv3 = nn.Sequential(*[
			
 
				+            Conv(inter_dim, inter_dim, k=3, p=1, act_type=act_type, norm_type=norm_type, depthwise=depthwise)
			
 
				+            for _ in range(round(branch_depth))
			
 
				+        ])
			
 
				+        self.cv4 = nn.Sequential(*[
			
 
				+            Conv(inter_dim, inter_dim, k=3, p=1, act_type=act_type, norm_type=norm_type, depthwise=depthwise)
			
 
				+            for _ in range(round(branch_depth))
			
 
				+        ])
			
 
				+
			
 
				+        self.out = Conv(inter_dim*4, out_dim, k=1, act_type=act_type, norm_type=norm_type)
			
 
				+
			
 
				+
			
 
				+
			
 
				+    def forward(self, x):
			
 
				+        x1 = self.cv1(x)
			
 
				+        x2 = self.cv2(x)
			
 
				+        x3 = self.cv3(x2)
			
 
				+        x4 = self.cv4(x3)
			
 
				+        out = self.out(torch.cat([x1, x2, x3, x4], dim=1))
			
 
				+
			
 
				+        return out
			
 
				+
			
 
				+## PaFPN's ELAN-Block proposed by YOLOv7
			
 
				+class ELANBlockFPN(nn.Module):
			
 
				+    def __init__(self, in_dim, out_dim, squeeze_ratio=0.5, branch_width :int=4, branch_depth :int=1, act_type='silu', norm_type='BN', depthwise=False):
			
 
				+        super(ELANBlockFPN, self).__init__()
			
 
				+        # Basic parameters
			
 
				+        inter_dim = int(in_dim * squeeze_ratio)
			
 
				+        inter_dim2 = int(inter_dim * squeeze_ratio) 
			
 
				+        # Network structure
			
 
				+        self.cv1 = Conv(in_dim, inter_dim, k=1, act_type=act_type, norm_type=norm_type)
			
 
				+        self.cv2 = Conv(in_dim, inter_dim, k=1, act_type=act_type, norm_type=norm_type)
			
 
				+        self.cv3 = nn.ModuleList()
			
 
				+        for idx in range(round(branch_width)):
			
 
				+            if idx == 0:
			
 
				+                cvs = [Conv(inter_dim, inter_dim2, k=3, p=1, act_type=act_type, norm_type=norm_type, depthwise=depthwise)]
			
 
				+            else:
			
 
				+                cvs = [Conv(inter_dim2, inter_dim2, k=3, p=1, act_type=act_type, norm_type=norm_type, depthwise=depthwise)]
			
 
				+            # deeper
			
 
				+            if round(branch_depth) > 1:
			
 
				+                for _ in range(1, round(branch_depth)):
			
 
				+                    cvs.append(Conv(inter_dim2, inter_dim2, k=3, p=1, act_type=act_type, norm_type=norm_type, depthwise=depthwise))
			
 
				+                self.cv3.append(nn.Sequential(*cvs))
			
 
				+            else:
			
 
				+                self.cv3.append(cvs[0])
			
 
				+
			
 
				+        self.out = Conv(inter_dim*2+inter_dim2*len(self.cv3), out_dim, k=1, act_type=act_type, norm_type=norm_type)
			
 
				+
			
 
				+
			
 
				+    def forward(self, x):
			
 
				+        x1 = self.cv1(x)
			
 
				+        x2 = self.cv2(x)
			
 
				+        inter_outs = [x1, x2]
			
 
				+        for m in self.cv3:
			
 
				+            y1 = inter_outs[-1]
			
 
				+            y2 = m(y1)
			
 
				+            inter_outs.append(y2)
			
 
				+        out = self.out(torch.cat(inter_outs, dim=1))
			
 
				+
			
 
				+        return out
			
 
				+
			
 
				+## DownSample Block proposed by YOLOv7
			
 
				+class DownSample(nn.Module):
			
 
				+    def __init__(self, in_dim, out_dim, act_type='silu', norm_type='BN', depthwise=False):
			
 
				+        super().__init__()
			
 
				+        inter_dim = out_dim // 2
			
 
				+        self.mp = nn.MaxPool2d((2, 2), 2)
			
 
				+        self.cv1 = Conv(in_dim, inter_dim, k=1, act_type=act_type, norm_type=norm_type)
			
 
				+        self.cv2 = nn.Sequential(
			
 
				+            Conv(in_dim, inter_dim, k=1, act_type=act_type, norm_type=norm_type),
			
 
				+            Conv(inter_dim, inter_dim, k=3, p=1, s=2, act_type=act_type, norm_type=norm_type, depthwise=depthwise)
			
 
				+        )
			
 
				+
			
 
				+    def forward(self, x):
			
 
				+        x1 = self.cv1(self.mp(x))
			
 
				+        x2 = self.cv2(x)
			
 
				+        out = torch.cat([x1, x2], dim=1)
			
 
				+
			
 
				+        return out
			
 
				+
			
 
				+
			
 
				+# ---------------------------- RepConv Modules ----------------------------
			
 
				+class RepConv(nn.Module):
			
 
				+    """
			
 
				+        The code referenced to https://github.com/WongKinYiu/yolov7/models/common.py
			
 
				+    """
			
 
				+    # Represented convolution
			
 
				+    # https://arxiv.org/abs/2101.03697
			
 
				+
			
 
				+    def __init__(self, c1, c2, k=3, s=1, p=1, g=1, act_type='silu', deploy=False):
			
 
				+        super(RepConv, self).__init__()
			
 
				+        # -------------- Basic parameters --------------
			
 
				+        self.deploy = deploy
			
 
				+        self.groups = g
			
 
				+        self.in_channels = c1
			
 
				+        self.out_channels = c2
			
 
				+
			
 
				+        # -------------- Network parameters --------------
			
 
				+        if deploy:
			
 
				+            self.rbr_reparam = nn.Conv2d(c1, c2, k, s, p, groups=g, bias=True)
			
 
				+
			
 
				+        else:
			
 
				+            self.rbr_identity = (nn.BatchNorm2d(num_features=c1) if c2 == c1 and s == 1 else None)
			
 
				+
			
 
				+            self.rbr_dense = nn.Sequential(
			
 
				+                nn.Conv2d(c1, c2, k, s, p, groups=g, bias=False),
			
 
				+                nn.BatchNorm2d(num_features=c2),
			
 
				+            )
			
 
				+
			
 
				+            self.rbr_1x1 = nn.Sequential(
			
 
				+                nn.Conv2d(c1, c2, kernel_size=1, stride=s, bias=False),
			
 
				+                nn.BatchNorm2d(num_features=c2),
			
 
				+            )
			
 
				+        self.act = get_activation(act_type)
			
 
				+
			
 
				+
			
 
				+    def forward(self, inputs):
			
 
				+        if hasattr(self, "rbr_reparam"):
			
 
				+            return self.act(self.rbr_reparam(inputs))
			
 
				+
			
 
				+        if self.rbr_identity is None:
			
 
				+            id_out = 0
			
 
				+        else:
			
 
				+            id_out = self.rbr_identity(inputs)
			
 
				+
			
 
				+        return self.act(self.rbr_dense(inputs) + self.rbr_1x1(inputs) + id_out)
			
 
				+    
			
 
				+    def get_equivalent_kernel_bias(self):
			
 
				+        kernel3x3, bias3x3 = self._fuse_bn_tensor(self.rbr_dense)
			
 
				+        kernel1x1, bias1x1 = self._fuse_bn_tensor(self.rbr_1x1)
			
 
				+        kernelid, biasid = self._fuse_bn_tensor(self.rbr_identity)
			
 
				+        return (
			
 
				+            kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1) + kernelid,
			
 
				+            bias3x3 + bias1x1 + biasid,
			
 
				+        )
			
 
				+
			
 
				+    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
			
 
				+        if kernel1x1 is None:
			
 
				+            return 0
			
 
				+        else:
			
 
				+            return nn.functional.pad(kernel1x1, [1, 1, 1, 1])
			
 
				+
			
 
				+    def _fuse_bn_tensor(self, branch):
			
 
				+        if branch is None:
			
 
				+            return 0, 0
			
 
				+        if isinstance(branch, nn.Sequential):
			
 
				+            kernel = branch[0].weight
			
 
				+            running_mean = branch[1].running_mean
			
 
				+            running_var = branch[1].running_var
			
 
				+            gamma = branch[1].weight
			
 
				+            beta = branch[1].bias
			
 
				+            eps = branch[1].eps
			
 
				+        else:
			
 
				+            assert isinstance(branch, nn.BatchNorm2d)
			
 
				+            if not hasattr(self, "id_tensor"):
			
 
				+                input_dim = self.in_channels // self.groups
			
 
				+                kernel_value = np.zeros(
			
 
				+                    (self.in_channels, input_dim, 3, 3), dtype=np.float32
			
 
				+                )
			
 
				+                for i in range(self.in_channels):
			
 
				+                    kernel_value[i, i % input_dim, 1, 1] = 1
			
 
				+                self.id_tensor = torch.from_numpy(kernel_value).to(branch.weight.device)
			
 
				+            kernel = self.id_tensor
			
 
				+            running_mean = branch.running_mean
			
 
				+            running_var = branch.running_var
			
 
				+            gamma = branch.weight
			
 
				+            beta = branch.bias
			
 
				+            eps = branch.eps
			
 
				+        std = (running_var + eps).sqrt()
			
 
				+        t = (gamma / std).reshape(-1, 1, 1, 1)
			
 
				+        return kernel * t, beta - running_mean * gamma / std
			
 
				+
			
 
				+    def repvgg_convert(self):
			
 
				+        kernel, bias = self.get_equivalent_kernel_bias()
			
 
				+        return (
			
 
				+            kernel.detach().cpu().numpy(),
			
 
				+            bias.detach().cpu().numpy(),
			
 
				+        )
			
 
				+
			
 
				+    def fuse_conv_bn(self, conv, bn):
			
 
				+
			
 
				+        std = (bn.running_var + bn.eps).sqrt()
			
 
				+        bias = bn.bias - bn.running_mean * bn.weight / std
			
 
				+
			
 
				+        t = (bn.weight / std).reshape(-1, 1, 1, 1)
			
 
				+        weights = conv.weight * t
			
 
				+
			
 
				+        bn = nn.Identity()
			
 
				+        conv = nn.Conv2d(in_channels = conv.in_channels,
			
 
				+                              out_channels = conv.out_channels,
			
 
				+                              kernel_size = conv.kernel_size,
			
 
				+                              stride=conv.stride,
			
 
				+                              padding = conv.padding,
			
 
				+                              dilation = conv.dilation,
			
 
				+                              groups = conv.groups,
			
 
				+                              bias = True,
			
 
				+                              padding_mode = conv.padding_mode)
			
 
				+
			
 
				+        conv.weight = torch.nn.Parameter(weights)
			
 
				+        conv.bias = torch.nn.Parameter(bias)
			
 
				+        return conv
			
 
				+
			
 
				+    def fuse_repvgg_block(self):    
			
 
				+        if self.deploy:
			
 
				+            return
			
 
				+                
			
 
				+        self.rbr_dense = self.fuse_conv_bn(self.rbr_dense[0], self.rbr_dense[1])
			
 
				+        
			
 
				+        self.rbr_1x1 = self.fuse_conv_bn(self.rbr_1x1[0], self.rbr_1x1[1])
			
 
				+        rbr_1x1_bias = self.rbr_1x1.bias
			
 
				+        weight_1x1_expanded = torch.nn.functional.pad(self.rbr_1x1.weight, [1, 1, 1, 1])
			
 
				+        
			
 
				+        # Fuse self.rbr_identity
			
 
				+        if (isinstance(self.rbr_identity, nn.BatchNorm2d) or isinstance(self.rbr_identity, nn.modules.batchnorm.SyncBatchNorm)):
			
 
				+            identity_conv_1x1 = nn.Conv2d(
			
 
				+                    in_channels=self.in_channels,
			
 
				+                    out_channels=self.out_channels,
			
 
				+                    kernel_size=1,
			
 
				+                    stride=1,
			
 
				+                    padding=0,
			
 
				+                    groups=self.groups, 
			
 
				+                    bias=False)
			
 
				+            identity_conv_1x1.weight.data = identity_conv_1x1.weight.data.to(self.rbr_1x1.weight.data.device)
			
 
				+            identity_conv_1x1.weight.data = identity_conv_1x1.weight.data.squeeze().squeeze()
			
 
				+
			
 
				+            identity_conv_1x1.weight.data.fill_(0.0)
			
 
				+            identity_conv_1x1.weight.data.fill_diagonal_(1.0)
			
 
				+            identity_conv_1x1.weight.data = identity_conv_1x1.weight.data.unsqueeze(2).unsqueeze(3)
			
 
				+
			
 
				+            identity_conv_1x1 = self.fuse_conv_bn(identity_conv_1x1, self.rbr_identity)
			
 
				+            bias_identity_expanded = identity_conv_1x1.bias
			
 
				+            weight_identity_expanded = torch.nn.functional.pad(identity_conv_1x1.weight, [1, 1, 1, 1])            
			
 
				+        else:
			
 
				+            bias_identity_expanded = torch.nn.Parameter( torch.zeros_like(rbr_1x1_bias) )
			
 
				+            weight_identity_expanded = torch.nn.Parameter( torch.zeros_like(weight_1x1_expanded) )            
			
 
				+        
			
 
				+        self.rbr_dense.weight = torch.nn.Parameter(self.rbr_dense.weight + weight_1x1_expanded + weight_identity_expanded)
			
 
				+        self.rbr_dense.bias = torch.nn.Parameter(self.rbr_dense.bias + rbr_1x1_bias + bias_identity_expanded)
			
 
				+                
			
 
				+        self.rbr_reparam = self.rbr_dense
			
 
				+        self.deploy = True
			
 
				+
			
 
				+        if self.rbr_identity is not None:
			
 
				+            del self.rbr_identity
			
 
				+            self.rbr_identity = None
			
 
				+
			
 
				+        if self.rbr_1x1 is not None:
			
 
				+            del self.rbr_1x1
			
 
				+            self.rbr_1x1 = None
			
 
				+
			
 
				+        if self.rbr_dense is not None:
			
 
				+            del self.rbr_dense
			
 
				+            self.rbr_dense = None
			
--- a/yolo/models/yolov10/yolov10.py
+++ b/yolo/models/yolov10/yolov10.py
@@ -0,0 +1,302 @@
 
				+import torch
			
 
				+import torch.nn as nn
			
 
				+
			
 
				+from utils.misc import multiclass_nms
			
 
				+
			
 
				+from .yolov10_backbone import build_backbone
			
 
				+from .yolov10_neck import build_neck
			
 
				+from .yolov10_pafpn import build_fpn
			
 
				+from .yolov10_head import build_head
			
 
				+
			
 
				+
			
 
				+# YOLOv7
			
 
				+class YOLOv7(nn.Module):
			
 
				+    def __init__(self,
			
 
				+                 cfg,
			
 
				+                 device,
			
 
				+                 num_classes=20,
			
 
				+                 conf_thresh=0.01,
			
 
				+                 topk=100,
			
 
				+                 nms_thresh=0.5,
			
 
				+                 trainable=False,
			
 
				+                 deploy = False,
			
 
				+                 no_multi_labels = False,
			
 
				+                 nms_class_agnostic = False):
			
 
				+        super(YOLOv7, self).__init__()
			
 
				+        # ------------------- Basic parameters -------------------
			
 
				+        self.cfg = cfg                                 # 模型配置文件
			
 
				+        self.device = device                           # cuda或者是cpu
			
 
				+        self.num_classes = num_classes                 # 类别的数量
			
 
				+        self.trainable = trainable                     # 训练的标记
			
 
				+        self.conf_thresh = conf_thresh                 # 得分阈值
			
 
				+        self.nms_thresh = nms_thresh                   # NMS阈值
			
 
				+        self.topk_candidates = topk                    # topk
			
 
				+        self.stride = [8, 16, 32]                      # 网络的输出步长
			
 
				+        self.num_levels = 3
			
 
				+        self.deploy = deploy
			
 
				+        self.no_multi_labels = no_multi_labels
			
 
				+        self.nms_class_agnostic = nms_class_agnostic
			
 
				+        # ------------------- Network Structure -------------------
			
 
				+        ## 主干网络
			
 
				+        self.backbone, feats_dim = build_backbone(cfg, trainable&cfg['pretrained'])
			
 
				+
			
 
				+        ## 颈部网络: SPP模块
			
 
				+        self.neck = build_neck(cfg, in_dim=feats_dim[-1], out_dim=feats_dim[-1]//2)
			
 
				+        feats_dim[-1] = self.neck.out_dim
			
 
				+
			
 
				+        ## 颈部网络: 特征金字塔
			
 
				+        self.fpn = build_fpn(cfg=cfg, in_dims=feats_dim, out_dim=round(256*cfg['channel_width']))
			
 
				+        self.head_dim = self.fpn.out_dim
			
 
				+
			
 
				+        ## 检测头
			
 
				+        self.non_shared_heads = nn.ModuleList(
			
 
				+            [build_head(cfg, head_dim, head_dim, num_classes) 
			
 
				+            for head_dim in self.head_dim
			
 
				+            ])
			
 
				+
			
 
				+        ## 预测层
			
 
				+        self.obj_preds = nn.ModuleList(
			
 
				+                            [nn.Conv2d(head.reg_out_dim, 1, kernel_size=1) 
			
 
				+                                for head in self.non_shared_heads
			
 
				+                              ]) 
			
 
				+        self.cls_preds = nn.ModuleList(
			
 
				+                            [nn.Conv2d(head.cls_out_dim, self.num_classes, kernel_size=1) 
			
 
				+                                for head in self.non_shared_heads
			
 
				+                              ]) 
			
 
				+        self.reg_preds = nn.ModuleList(
			
 
				+                            [nn.Conv2d(head.reg_out_dim, 4, kernel_size=1) 
			
 
				+                                for head in self.non_shared_heads
			
 
				+                              ])                 
			
 
				+
			
 
				+
			
 
				+    # ---------------------- Basic Functions ----------------------
			
 
				+    ## generate anchor points
			
 
				+    def generate_anchors(self, level, fmp_size):
			
 
				+        """
			
 
				+            fmp_size: (List) [H, W]
			
 
				+        """
			
 
				+        # generate grid cells
			
 
				+        fmp_h, fmp_w = fmp_size
			
 
				+        anchor_y, anchor_x = torch.meshgrid([torch.arange(fmp_h), torch.arange(fmp_w)])
			
 
				+        # [H, W, 2] -> [HW, 2]
			
 
				+        anchor_xy = torch.stack([anchor_x, anchor_y], dim=-1).float().view(-1, 2)
			
 
				+        anchor_xy += 0.5  # add center offset
			
 
				+        anchor_xy *= self.stride[level]
			
 
				+        anchors = anchor_xy.to(self.device)
			
 
				+
			
 
				+        return anchors
			
 
				+        
			
 
				+    ## post-process
			
 
				+    def post_process(self, obj_preds, cls_preds, box_preds):
			
 
				+        """
			
 
				+        Input:
			
 
				+            cls_preds: List[np.array] -> [[M, C], ...]
			
 
				+            box_preds: List[np.array] -> [[M, 4], ...]
			
 
				+            obj_preds: List[np.array] -> [[M, 1], ...] or None
			
 
				+        Output:
			
 
				+            bboxes: np.array -> [N, 4]
			
 
				+            scores: np.array -> [N,]
			
 
				+            labels: np.array -> [N,]
			
 
				+        """
			
 
				+        assert len(cls_preds) == self.num_levels
			
 
				+        all_scores = []
			
 
				+        all_labels = []
			
 
				+        all_bboxes = []
			
 
				+        
			
 
				+        for obj_pred_i, cls_pred_i, box_pred_i in zip(obj_preds, cls_preds, box_preds):
			
 
				+            if self.no_multi_labels:
			
 
				+                # [M,]
			
 
				+                scores, labels = torch.max(torch.sqrt(obj_pred_i.sigmoid() * cls_pred_i.sigmoid()), dim=1)
			
 
				+
			
 
				+                # Keep top k top scoring indices only.
			
 
				+                num_topk = min(self.topk_candidates, box_pred_i.size(0))
			
 
				+
			
 
				+                # topk candidates
			
 
				+                predicted_prob, topk_idxs = scores.sort(descending=True)
			
 
				+                topk_scores = predicted_prob[:num_topk]
			
 
				+                topk_idxs = topk_idxs[:num_topk]
			
 
				+
			
 
				+                # filter out the proposals with low confidence score
			
 
				+                keep_idxs = topk_scores > self.conf_thresh
			
 
				+                scores = topk_scores[keep_idxs]
			
 
				+                topk_idxs = topk_idxs[keep_idxs]
			
 
				+
			
 
				+                labels = labels[topk_idxs]
			
 
				+                bboxes = box_pred_i[topk_idxs]
			
 
				+
			
 
				+            else:
			
 
				+                # [M, C] -> [MC,]
			
 
				+                scores_i = (torch.sqrt(obj_pred_i.sigmoid() * cls_pred_i.sigmoid())).flatten()
			
 
				+
			
 
				+                # Keep top k top scoring indices only.
			
 
				+                num_topk = min(self.topk_candidates, box_pred_i.size(0))
			
 
				+
			
 
				+                # torch.sort is actually faster than .topk (at least on GPUs)
			
 
				+                predicted_prob, topk_idxs = scores_i.sort(descending=True)
			
 
				+                topk_scores = predicted_prob[:num_topk]
			
 
				+                topk_idxs = topk_idxs[:num_topk]
			
 
				+
			
 
				+                # filter out the proposals with low confidence score
			
 
				+                keep_idxs = topk_scores > self.conf_thresh
			
 
				+                scores = topk_scores[keep_idxs]
			
 
				+                topk_idxs = topk_idxs[keep_idxs]
			
 
				+
			
 
				+                anchor_idxs = torch.div(topk_idxs, self.num_classes, rounding_mode='floor')
			
 
				+                labels = topk_idxs % self.num_classes
			
 
				+
			
 
				+                bboxes = box_pred_i[anchor_idxs]
			
 
				+
			
 
				+            all_scores.append(scores)
			
 
				+            all_labels.append(labels)
			
 
				+            all_bboxes.append(bboxes)
			
 
				+
			
 
				+        scores = torch.cat(all_scores)
			
 
				+        labels = torch.cat(all_labels)
			
 
				+        bboxes = torch.cat(all_bboxes)
			
 
				+
			
 
				+        # to cpu & numpy
			
 
				+        scores = scores.cpu().numpy()
			
 
				+        labels = labels.cpu().numpy()
			
 
				+        bboxes = bboxes.cpu().numpy()
			
 
				+
			
 
				+        # nms
			
 
				+        scores, labels, bboxes = multiclass_nms(
			
 
				+            scores, labels, bboxes, self.nms_thresh, self.num_classes, self.nms_class_agnostic)
			
 
				+
			
 
				+        return bboxes, scores, labels
			
 
				+    
			
 
				+
			
 
				+    # ---------------------- Main Process for Inference ----------------------
			
 
				+    @torch.no_grad()
			
 
				+    def inference_single_image(self, x):
			
 
				+        # 主干网络
			
 
				+        pyramid_feats = self.backbone(x)
			
 
				+
			
 
				+        # 颈部网络
			
 
				+        pyramid_feats[-1] = self.neck(pyramid_feats[-1])
			
 
				+
			
 
				+        # 特征金字塔
			
 
				+        pyramid_feats = self.fpn(pyramid_feats)
			
 
				+
			
 
				+        # 检测头
			
 
				+        all_obj_preds = []
			
 
				+        all_cls_preds = []
			
 
				+        all_box_preds = []
			
 
				+        all_anchors = []
			
 
				+        for level, (feat, head) in enumerate(zip(pyramid_feats, self.non_shared_heads)):
			
 
				+            cls_feat, reg_feat = head(feat)
			
 
				+
			
 
				+            # [1, C, H, W]
			
 
				+            obj_pred = self.obj_preds[level](reg_feat)
			
 
				+            cls_pred = self.cls_preds[level](cls_feat)
			
 
				+            reg_pred = self.reg_preds[level](reg_feat)
			
 
				+
			
 
				+            # anchors: [M, 2]
			
 
				+            fmp_size = cls_pred.shape[-2:]
			
 
				+            anchors = self.generate_anchors(level, fmp_size)
			
 
				+
			
 
				+            # [1, C, H, W] -> [H, W, C] -> [M, C]
			
 
				+            obj_pred = obj_pred[0].permute(1, 2, 0).contiguous().view(-1, 1)
			
 
				+            cls_pred = cls_pred[0].permute(1, 2, 0).contiguous().view(-1, self.num_classes)
			
 
				+            reg_pred = reg_pred[0].permute(1, 2, 0).contiguous().view(-1, 4)
			
 
				+
			
 
				+            # decode bbox
			
 
				+            ctr_pred = reg_pred[..., :2] * self.stride[level] + anchors[..., :2]
			
 
				+            wh_pred = torch.exp(reg_pred[..., 2:]) * self.stride[level]
			
 
				+            pred_x1y1 = ctr_pred - wh_pred * 0.5
			
 
				+            pred_x2y2 = ctr_pred + wh_pred * 0.5
			
 
				+            box_pred = torch.cat([pred_x1y1, pred_x2y2], dim=-1)
			
 
				+
			
 
				+            all_obj_preds.append(obj_pred)
			
 
				+            all_cls_preds.append(cls_pred)
			
 
				+            all_box_preds.append(box_pred)
			
 
				+            all_anchors.append(anchors)
			
 
				+
			
 
				+        if self.deploy:
			
 
				+            obj_preds = torch.cat(all_obj_preds, dim=0)
			
 
				+            cls_preds = torch.cat(all_cls_preds, dim=0)
			
 
				+            box_preds = torch.cat(all_box_preds, dim=0)
			
 
				+            scores = torch.sqrt(obj_preds.sigmoid() * cls_preds.sigmoid())
			
 
				+            bboxes = box_preds
			
 
				+            # [n_anchors_all, 4 + C]
			
 
				+            outputs = torch.cat([bboxes, scores], dim=-1)
			
 
				+
			
 
				+        else:
			
 
				+            # post process
			
 
				+            bboxes, scores, labels = self.post_process(
			
 
				+                all_obj_preds, all_cls_preds, all_box_preds)
			
 
				+            outputs = {
			
 
				+                "scores": scores,
			
 
				+                "labels": labels,
			
 
				+                "bboxes": bboxes
			
 
				+            }
			
 
				+
			
 
				+        return outputs
			
 
				+
			
 
				+    # ---------------------- Main Process for Training ----------------------
			
 
				+    def forward(self, x):
			
 
				+        if not self.trainable:
			
 
				+            return self.inference_single_image(x)
			
 
				+        else:
			
 
				+            # 主干网络
			
 
				+            pyramid_feats = self.backbone(x)
			
 
				+
			
 
				+            # 颈部网络
			
 
				+            pyramid_feats[-1] = self.neck(pyramid_feats[-1])
			
 
				+
			
 
				+            # 特征金字塔
			
 
				+            pyramid_feats = self.fpn(pyramid_feats)
			
 
				+
			
 
				+            # 检测头
			
 
				+            all_anchors = []
			
 
				+            all_strides = []
			
 
				+            all_obj_preds = []
			
 
				+            all_cls_preds = []
			
 
				+            all_box_preds = []
			
 
				+            all_reg_preds = []
			
 
				+            for level, (feat, head) in enumerate(zip(pyramid_feats, self.non_shared_heads)):
			
 
				+                cls_feat, reg_feat = head(feat)
			
 
				+
			
 
				+                # [B, C, H, W]
			
 
				+                obj_pred = self.obj_preds[level](reg_feat)
			
 
				+                cls_pred = self.cls_preds[level](cls_feat)
			
 
				+                reg_pred = self.reg_preds[level](reg_feat)
			
 
				+
			
 
				+                B, _, H, W = cls_pred.size()
			
 
				+                fmp_size = [H, W]
			
 
				+                # generate anchor boxes: [M, 4]
			
 
				+                anchors = self.generate_anchors(level, fmp_size)
			
 
				+                
			
 
				+                # stride tensor: [M, 1]
			
 
				+                stride_tensor = torch.ones_like(anchors[..., :1]) * self.stride[level]
			
 
				+
			
 
				+                # [B, C, H, W] -> [B, H, W, C] -> [B, M, C]
			
 
				+                obj_pred = obj_pred.permute(0, 2, 3, 1).contiguous().view(B, -1, 1)
			
 
				+                cls_pred = cls_pred.permute(0, 2, 3, 1).contiguous().view(B, -1, self.num_classes)
			
 
				+                reg_pred = reg_pred.permute(0, 2, 3, 1).contiguous().view(B, -1, 4)
			
 
				+
			
 
				+                # decode bbox
			
 
				+                ctr_pred = reg_pred[..., :2] * self.stride[level] + anchors[..., :2]
			
 
				+                wh_pred = torch.exp(reg_pred[..., 2:]) * self.stride[level]
			
 
				+                pred_x1y1 = ctr_pred - wh_pred * 0.5
			
 
				+                pred_x2y2 = ctr_pred + wh_pred * 0.5
			
 
				+                box_pred = torch.cat([pred_x1y1, pred_x2y2], dim=-1)
			
 
				+
			
 
				+                all_obj_preds.append(obj_pred)
			
 
				+                all_cls_preds.append(cls_pred)
			
 
				+                all_box_preds.append(box_pred)
			
 
				+                all_reg_preds.append(reg_pred)
			
 
				+                all_anchors.append(anchors)
			
 
				+                all_strides.append(stride_tensor)
			
 
				+            
			
 
				+            # output dict
			
 
				+            outputs = {"pred_obj": all_obj_preds,        # List(Tensor) [B, M, 1]
			
 
				+                       "pred_cls": all_cls_preds,        # List(Tensor) [B, M, C]
			
 
				+                       "pred_box": all_box_preds,        # List(Tensor) [B, M, 4]
			
 
				+                       "pred_reg": all_reg_preds,        # List(Tensor) [B, M, 4]
			
 
				+                       "anchors": all_anchors,           # List(Tensor) [M, 2]
			
 
				+                       "strides": self.stride,           # List(Int) [8, 16, 32]
			
 
				+                       "stride_tensors": all_strides     # List(Tensor) [M, 1]
			
 
				+                       }
			
 
				+
			
 
				+            return outputs