| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248 |
# Real-time Transformer-based Object Detector
# ------------------- Det task --------------------


def _resnet_backbone(name):
    """Build the backbone section used by the ResNet-based variants.

    Args:
        name: value stored under 'backbone' (e.g. 'resnet18').

    Returns:
        A new dict on every call, so variants never share mutable state.
    """
    return {
        'backbone': name,
        'backbone_norm': 'FrozeBN',
        'res5_dilation': False,
        'pretrained': True,
        'pretrained_weight': 'imagenet1k_v1',
        'freeze_at': 0,
        'freeze_stem_only': False,
    }


def _build_rtdetr_cfg(backbone_cfg, de_num_layers):
    """Assemble the full config dict of one RT-DETR variant.

    Every setting except the backbone section and 'de_num_layers' is
    identical across the variants, so it is written once here.

    Args:
        backbone_cfg: dict with the backbone-specific keys of the variant;
            its items are inserted right after the model-scale keys.
        de_num_layers: value stored under 'de_num_layers'.

    Returns:
        A new dict (with new nested lists/dicts) on every call.
    """
    cfg = {
        # ---------------- Model config ----------------
        ## Model scale
        'width': 1.0,
        'depth': 1.0,
    }
    ## Image Encoder - Backbone
    cfg.update(backbone_cfg)
    cfg.update({
        'out_stride': [8, 16, 32],
        'max_stride': 32,
        ## Image Encoder - FPN
        'fpn': 'hybrid_encoder',
        'fpn_act': 'silu',
        'fpn_norm': 'BN',
        'fpn_depthwise': False,
        # 'hidden_dim' and 'pe_temperature' are defined once, here. A dict
        # key holds a single value, so do not repeat them in the decoder
        # section below.
        'hidden_dim': 256,
        'en_num_heads': 8,
        'en_num_layers': 1,
        'en_mlp_ratio': 4.0,
        'en_dropout': 0.0,
        'pe_temperature': 10000.,
        'en_act': 'gelu',
        # Transformer Decoder
        'transformer': 'rtdetr_transformer',
        'de_num_heads': 8,
        'de_num_layers': de_num_layers,
        'de_mlp_ratio': 4.0,
        'de_dropout': 0.0,
        'de_act': 'relu',
        'de_num_points': 4,
        'num_queries': 300,
        'learnt_init_query': False,
        'dn_num_denoising': 100,
        'dn_label_noise_ratio': 0.5,
        'dn_box_noise_scale': 1,
        # Head
        'det_head': 'dino_head',
        # ---------------- Assignment config ----------------
        'matcher_hpy': {'cost_class': 2.0,
                        'cost_bbox': 5.0,
                        'cost_giou': 2.0},
        # ---------------- Loss config ----------------
        'use_vfl': True,
        'loss_coeff': {'class': 1,
                       'bbox': 5,
                       'giou': 2},
        # ---------------- Train config ----------------
        ## input
        'multi_scale': [0.5, 1.25],   # 320 -> 800
        'trans_type': 'rtdetr_base',
        ## trainer
        'trainer_type': 'rtdetr',
    })
    return cfg


rtdetr_cfg = {
    'rtdetr_r18': _build_rtdetr_cfg(_resnet_backbone('resnet18'), de_num_layers=3),
    'rtdetr_r50': _build_rtdetr_cfg(_resnet_backbone('resnet50'), de_num_layers=6),
    'rtdetr_r101': _build_rtdetr_cfg(_resnet_backbone('resnet101'), de_num_layers=6),
    # The RT-DETR variant below is not complete yet.
    'rtdetr_l': _build_rtdetr_cfg(
        {
            'backbone': 'rtcnet_l',
            'pretrained': True,
            'freeze_at': 0,
            'freeze_stem_only': False,
        },
        de_num_layers=6,
    ),
}
|