# rtdetr_config.py
# Real-time Transformer-based Object Detector
# ------------------- Det task --------------------
  3. rtdetr_cfg = {
  4. 'rtdetr_r18':{
  5. # ---------------- Model config ----------------
  6. ## Model scale
  7. 'width': 1.0,
  8. 'depth': 1.0,
  9. ## Image Encoder - Backbone
  10. 'backbone': 'resnet18',
  11. 'backbone_norm': 'FrozeBN',
  12. 'pretrained': True,
  13. 'pretrained_weight': 'imagenet1k_v1',
  14. 'freeze_at': 0,
  15. 'freeze_stem_only': False,
  16. 'out_stride': [8, 16, 32],
  17. 'max_stride': 32,
  18. ## Image Encoder - FPN
  19. 'fpn': 'hybrid_encoder',
  20. 'fpn_act': 'silu',
  21. 'fpn_norm': 'BN',
  22. 'fpn_depthwise': False,
  23. 'hidden_dim': 256,
  24. 'en_num_heads': 8,
  25. 'en_num_layers': 1,
  26. 'en_mlp_ratio': 4.0,
  27. 'en_dropout': 0.0,
  28. 'pe_temperature': 10000.,
  29. 'en_act': 'gelu',
  30. # Transformer Decoder
  31. 'transformer': 'rtdetr_transformer',
  32. 'hidden_dim': 256,
  33. 'de_num_heads': 8,
  34. 'de_num_layers': 3,
  35. 'de_mlp_ratio': 4.0,
  36. 'de_dropout': 0.0,
  37. 'de_act': 'relu',
  38. 'de_num_points': 4,
  39. 'num_queries': 300,
  40. 'learnt_init_query': False,
  41. 'pe_temperature': 10000.,
  42. 'dn_num_denoising': 100,
  43. 'dn_label_noise_ratio': 0.5,
  44. 'dn_box_noise_scale': 1,
  45. # Head
  46. 'det_head': 'dino_head',
  47. # ---------------- Assignment config ----------------
  48. 'matcher_hpy': {'cost_class': 2.0,
  49. 'cost_bbox': 5.0,
  50. 'cost_giou': 2.0,},
  51. # ---------------- Loss config ----------------
  52. 'use_vfl': True,
  53. 'loss_coeff': {'class': 1,
  54. 'bbox': 5,
  55. 'giou': 2,},
  56. # ---------------- Train config ----------------
  57. ## input
  58. 'multi_scale': [0.5, 1.25], # 320 -> 800
  59. 'trans_type': 'rtdetr_base',
  60. # ---------------- Train config ----------------
  61. 'trainer_type': 'rtdetr',
  62. },
  63. 'rtdetr_r50':{
  64. # ---------------- Model config ----------------
  65. ## Model scale
  66. 'width': 1.0,
  67. 'depth': 1.0,
  68. ## Image Encoder - Backbone
  69. 'backbone': 'resnet50',
  70. 'backbone_norm': 'FrozeBN',
  71. 'pretrained': True,
  72. 'pretrained_weight': 'imagenet1k_v1',
  73. 'freeze_at': 0,
  74. 'freeze_stem_only': False,
  75. 'out_stride': [8, 16, 32],
  76. 'max_stride': 32,
  77. ## Image Encoder - FPN
  78. 'fpn': 'hybrid_encoder',
  79. 'fpn_act': 'silu',
  80. 'fpn_norm': 'BN',
  81. 'fpn_depthwise': False,
  82. 'hidden_dim': 256,
  83. 'en_num_heads': 8,
  84. 'en_num_layers': 1,
  85. 'en_mlp_ratio': 4.0,
  86. 'en_dropout': 0.0,
  87. 'pe_temperature': 10000.,
  88. 'en_act': 'gelu',
  89. # Transformer Decoder
  90. 'transformer': 'rtdetr_transformer',
  91. 'hidden_dim': 256,
  92. 'de_num_heads': 8,
  93. 'de_num_layers': 6,
  94. 'de_mlp_ratio': 4.0,
  95. 'de_dropout': 0.0,
  96. 'de_act': 'relu',
  97. 'de_num_points': 4,
  98. 'num_queries': 300,
  99. 'learnt_init_query': False,
  100. 'pe_temperature': 10000.,
  101. 'dn_num_denoising': 100,
  102. 'dn_label_noise_ratio': 0.5,
  103. 'dn_box_noise_scale': 1,
  104. # Head
  105. 'det_head': 'dino_head',
  106. # ---------------- Assignment config ----------------
  107. 'matcher_hpy': {'cost_class': 2.0,
  108. 'cost_bbox': 5.0,
  109. 'cost_giou': 2.0,},
  110. # ---------------- Loss config ----------------
  111. 'use_vfl': True,
  112. 'loss_coeff': {'class': 1,
  113. 'bbox': 5,
  114. 'giou': 2,},
  115. # ---------------- Train config ----------------
  116. ## input
  117. 'multi_scale': [0.5, 1.25], # 320 -> 800
  118. 'trans_type': 'rtdetr_base',
  119. # ---------------- Train config ----------------
  120. 'trainer_type': 'rtdetr',
  121. },
  122. 'rtdetr_r101':{
  123. # ---------------- Model config ----------------
  124. ## Model scale
  125. 'width': 1.0,
  126. 'depth': 1.0,
  127. ## Image Encoder - Backbone
  128. 'backbone': 'resnet101',
  129. 'backbone_norm': 'FrozeBN',
  130. 'pretrained': True,
  131. 'pretrained_weight': 'imagenet1k_v1',
  132. 'freeze_at': 0,
  133. 'freeze_stem_only': False,
  134. 'out_stride': [8, 16, 32],
  135. 'max_stride': 32,
  136. ## Image Encoder - FPN
  137. 'fpn': 'hybrid_encoder',
  138. 'fpn_act': 'silu',
  139. 'fpn_norm': 'BN',
  140. 'fpn_depthwise': False,
  141. 'hidden_dim': 256,
  142. 'en_num_heads': 8,
  143. 'en_num_layers': 1,
  144. 'en_mlp_ratio': 4.0,
  145. 'en_dropout': 0.0,
  146. 'pe_temperature': 10000.,
  147. 'en_act': 'gelu',
  148. # Transformer Decoder
  149. 'transformer': 'rtdetr_transformer',
  150. 'hidden_dim': 256,
  151. 'de_num_heads': 8,
  152. 'de_num_layers': 6,
  153. 'de_mlp_ratio': 4.0,
  154. 'de_dropout': 0.0,
  155. 'de_act': 'relu',
  156. 'de_num_points': 4,
  157. 'num_queries': 300,
  158. 'learnt_init_query': False,
  159. 'pe_temperature': 10000.,
  160. 'dn_num_denoising': 100,
  161. 'dn_label_noise_ratio': 0.5,
  162. 'dn_box_noise_scale': 1,
  163. # Head
  164. 'det_head': 'dino_head',
  165. # ---------------- Assignment config ----------------
  166. 'matcher_hpy': {'cost_class': 2.0,
  167. 'cost_bbox': 5.0,
  168. 'cost_giou': 2.0,},
  169. # ---------------- Loss config ----------------
  170. 'use_vfl': True,
  171. 'loss_coeff': {'class': 1,
  172. 'bbox': 5,
  173. 'giou': 2,},
  174. # ---------------- Train config ----------------
  175. ## input
  176. 'multi_scale': [0.5, 1.25], # 320 -> 800
  177. 'trans_type': 'rtdetr_base',
  178. # ---------------- Train config ----------------
  179. 'trainer_type': 'rtdetr',
  180. },
  181. # Below RT-DETR is not complete
  182. 'rtdetr_l':{
  183. # ---------------- Model config ----------------
  184. ## Model scale
  185. 'width': 1.0,
  186. 'depth': 1.0,
  187. ## Image Encoder - Backbone
  188. 'backbone': 'rtcnet_l',
  189. 'pretrained': True,
  190. 'freeze_at': 0,
  191. 'freeze_stem_only': False,
  192. 'out_stride': [8, 16, 32],
  193. 'max_stride': 32,
  194. ## Image Encoder - FPN
  195. 'fpn': 'hybrid_encoder',
  196. 'fpn_act': 'silu',
  197. 'fpn_norm': 'BN',
  198. 'fpn_depthwise': False,
  199. 'hidden_dim': 256,
  200. 'en_num_heads': 8,
  201. 'en_num_layers': 1,
  202. 'en_mlp_ratio': 4.0,
  203. 'en_dropout': 0.0,
  204. 'pe_temperature': 10000.,
  205. 'en_act': 'gelu',
  206. # Transformer Decoder
  207. 'transformer': 'rtdetr_transformer',
  208. 'hidden_dim': 256,
  209. 'de_num_heads': 8,
  210. 'de_num_layers': 6,
  211. 'de_mlp_ratio': 4.0,
  212. 'de_dropout': 0.0,
  213. 'de_act': 'relu',
  214. 'de_num_points': 4,
  215. 'num_queries': 300,
  216. 'learnt_init_query': False,
  217. 'pe_temperature': 10000.,
  218. 'dn_num_denoising': 100,
  219. 'dn_label_noise_ratio': 0.5,
  220. 'dn_box_noise_scale': 1,
  221. # Head
  222. 'det_head': 'dino_head',
  223. # ---------------- Assignment config ----------------
  224. 'matcher_hpy': {'cost_class': 2.0,
  225. 'cost_bbox': 5.0,
  226. 'cost_giou': 2.0,},
  227. # ---------------- Loss config ----------------
  228. 'use_vfl': True,
  229. 'loss_coeff': {'class': 1,
  230. 'bbox': 5,
  231. 'giou': 2,},
  232. # ---------------- Train config ----------------
  233. ## input
  234. 'multi_scale': [0.5, 1.25], # 320 -> 800
  235. 'trans_type': 'rtdetr_base',
  236. # ---------------- Train config ----------------
  237. 'trainer_type': 'rtdetr',
  238. },
  239. }