basic.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283
  1. import math
  2. import torch
  3. import torch.nn as nn
  4. # ----------------- MLP modules -----------------
  5. class MLP(nn.Module):
  6. def __init__(self, in_dim, hidden_dim, out_dim, num_layers):
  7. super().__init__()
  8. self.num_layers = num_layers
  9. h = [hidden_dim] * (num_layers - 1)
  10. self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([in_dim] + h, h + [out_dim]))
  11. def forward(self, x):
  12. for i, layer in enumerate(self.layers):
  13. x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
  14. return x
  15. class FFN(nn.Module):
  16. def __init__(self, d_model=256, mlp_ratio=4.0, dropout=0., act_type='relu'):
  17. super().__init__()
  18. self.fpn_dim = round(d_model * mlp_ratio)
  19. self.linear1 = nn.Linear(d_model, self.fpn_dim)
  20. self.activation = get_activation(act_type)
  21. self.dropout2 = nn.Dropout(dropout)
  22. self.linear2 = nn.Linear(self.fpn_dim, d_model)
  23. self.dropout3 = nn.Dropout(dropout)
  24. self.norm = nn.LayerNorm(d_model)
  25. def forward(self, src):
  26. src2 = self.linear2(self.dropout2(self.activation(self.linear1(src))))
  27. src = src + self.dropout3(src2)
  28. src = self.norm(src)
  29. return src
  30. # ----------------- CNN modules -----------------
  31. def get_conv2d(c1, c2, k, p, s, g, bias=False):
  32. conv = nn.Conv2d(c1, c2, k, stride=s, padding=p, groups=g, bias=bias)
  33. return conv
  34. def get_activation(act_type=None):
  35. if act_type == 'relu':
  36. return nn.ReLU(inplace=True)
  37. elif act_type == 'lrelu':
  38. return nn.LeakyReLU(0.1, inplace=True)
  39. elif act_type == 'mish':
  40. return nn.Mish(inplace=True)
  41. elif act_type == 'silu':
  42. return nn.SiLU(inplace=True)
  43. elif act_type is None:
  44. return nn.Identity()
  45. else:
  46. raise NotImplementedError
  47. def get_norm(norm_type, dim):
  48. if norm_type == 'BN':
  49. return nn.BatchNorm2d(dim)
  50. elif norm_type == 'GN':
  51. return nn.GroupNorm(num_groups=32, num_channels=dim)
  52. elif norm_type is None:
  53. return nn.Identity()
  54. else:
  55. raise NotImplementedError
  56. def conv3x3(in_planes: int, out_planes: int, stride: int = 1, groups: int = 1, dilation: int = 1) -> nn.Conv2d:
  57. """3x3 convolution with padding"""
  58. return nn.Conv2d(
  59. in_planes,
  60. out_planes,
  61. kernel_size=3,
  62. stride=stride,
  63. padding=dilation,
  64. groups=groups,
  65. bias=False,
  66. dilation=dilation,
  67. )
  68. def conv1x1(in_planes: int, out_planes: int, stride: int = 1) -> nn.Conv2d:
  69. """1x1 convolution"""
  70. return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
  71. class FrozenBatchNorm2d(torch.nn.Module):
  72. def __init__(self, n):
  73. super(FrozenBatchNorm2d, self).__init__()
  74. self.register_buffer("weight", torch.ones(n))
  75. self.register_buffer("bias", torch.zeros(n))
  76. self.register_buffer("running_mean", torch.zeros(n))
  77. self.register_buffer("running_var", torch.ones(n))
  78. def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
  79. missing_keys, unexpected_keys, error_msgs):
  80. num_batches_tracked_key = prefix + 'num_batches_tracked'
  81. if num_batches_tracked_key in state_dict:
  82. del state_dict[num_batches_tracked_key]
  83. super(FrozenBatchNorm2d, self)._load_from_state_dict(
  84. state_dict, prefix, local_metadata, strict,
  85. missing_keys, unexpected_keys, error_msgs)
  86. def forward(self, x):
  87. # move reshapes to the beginning
  88. # to make it fuser-friendly
  89. w = self.weight.reshape(1, -1, 1, 1)
  90. b = self.bias.reshape(1, -1, 1, 1)
  91. rv = self.running_var.reshape(1, -1, 1, 1)
  92. rm = self.running_mean.reshape(1, -1, 1, 1)
  93. eps = 1e-5
  94. scale = w * (rv + eps).rsqrt()
  95. bias = b - rm * scale
  96. return x * scale + bias
  97. class BasicConv(nn.Module):
  98. def __init__(self,
  99. in_dim, # in channels
  100. out_dim, # out channels
  101. kernel_size=1, # kernel size
  102. padding=0, # padding
  103. stride=1, # padding
  104. act_type :str = 'lrelu', # activation
  105. norm_type :str = 'BN', # normalization
  106. ):
  107. super(BasicConv, self).__init__()
  108. add_bias = False if norm_type else True
  109. self.conv = get_conv2d(in_dim, out_dim, k=kernel_size, p=padding, s=stride, g=1, bias=add_bias)
  110. self.norm = get_norm(norm_type, out_dim)
  111. self.act = get_activation(act_type)
  112. def forward(self, x):
  113. return self.act(self.norm(self.conv(x)))
  114. class DepthwiseConv(nn.Module):
  115. def __init__(self,
  116. in_dim, # in channels
  117. out_dim, # out channels
  118. kernel_size=1, # kernel size
  119. padding=0, # padding
  120. stride=1, # padding
  121. act_type :str = None, # activation
  122. norm_type :str = 'BN', # normalization
  123. ):
  124. super(DepthwiseConv, self).__init__()
  125. assert in_dim == out_dim
  126. add_bias = False if norm_type else True
  127. self.conv = get_conv2d(in_dim, out_dim, k=kernel_size, p=padding, s=stride, g=out_dim, bias=add_bias)
  128. self.norm = get_norm(norm_type, out_dim)
  129. self.act = get_activation(act_type)
  130. def forward(self, x):
  131. return self.act(self.norm(self.conv(x)))
  132. class PointwiseConv(nn.Module):
  133. def __init__(self,
  134. in_dim, # in channels
  135. out_dim, # out channels
  136. act_type :str = 'lrelu', # activation
  137. norm_type :str = 'BN', # normalization
  138. ):
  139. super(DepthwiseConv, self).__init__()
  140. assert in_dim == out_dim
  141. add_bias = False if norm_type else True
  142. self.conv = get_conv2d(in_dim, out_dim, k=1, p=0, s=1, g=1, bias=add_bias)
  143. self.norm = get_norm(norm_type, out_dim)
  144. self.act = get_activation(act_type)
  145. def forward(self, x):
  146. return self.act(self.norm(self.conv(x)))
  147. ## Yolov8's BottleNeck
  148. class Bottleneck(nn.Module):
  149. def __init__(self,
  150. in_dim,
  151. out_dim,
  152. expand_ratio = 0.5,
  153. kernel_sizes = [3, 3],
  154. shortcut = True,
  155. act_type = 'silu',
  156. norm_type = 'BN',
  157. depthwise = False,):
  158. super(Bottleneck, self).__init__()
  159. inter_dim = int(out_dim * expand_ratio)
  160. if depthwise:
  161. self.cv1 = nn.Sequential(
  162. DepthwiseConv(in_dim, in_dim, kernel_size=kernel_sizes[0], padding=kernel_sizes[0]//2, act_type=act_type, norm_type=norm_type),
  163. PointwiseConv(in_dim, inter_dim, act_type=act_type, norm_type=norm_type),
  164. )
  165. self.cv2 = nn.Sequential(
  166. DepthwiseConv(inter_dim, inter_dim, kernel_size=kernel_sizes[1], padding=kernel_sizes[1]//2, act_type=act_type, norm_type=norm_type),
  167. PointwiseConv(inter_dim, out_dim, act_type=act_type, norm_type=norm_type),
  168. )
  169. else:
  170. self.cv1 = BasicConv(in_dim, inter_dim, kernel_size=kernel_sizes[0], padding=kernel_sizes[0]//2, act_type=act_type, norm_type=norm_type)
  171. self.cv2 = BasicConv(inter_dim, out_dim, kernel_size=kernel_sizes[1], padding=kernel_sizes[1]//2, act_type=act_type, norm_type=norm_type)
  172. self.shortcut = shortcut and in_dim == out_dim
  173. def forward(self, x):
  174. h = self.cv2(self.cv1(x))
  175. return x + h if self.shortcut else h
  176. # Yolov8's StageBlock
  177. class RTCBlock(nn.Module):
  178. def __init__(self,
  179. in_dim,
  180. out_dim,
  181. num_blocks = 1,
  182. shortcut = False,
  183. act_type = 'silu',
  184. norm_type = 'BN',
  185. depthwise = False,):
  186. super(RTCBlock, self).__init__()
  187. self.inter_dim = out_dim // 2
  188. self.input_proj = BasicConv(in_dim, out_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
  189. self.m = nn.Sequential(*(
  190. Bottleneck(self.inter_dim, self.inter_dim, 1.0, [3, 3], shortcut, act_type, norm_type, depthwise)
  191. for _ in range(num_blocks)))
  192. self.output_proj = BasicConv((2 + num_blocks) * self.inter_dim, out_dim, kernel_size=1, act_type=act_type, norm_type=norm_type)
  193. def forward(self, x):
  194. # Input proj
  195. x1, x2 = torch.chunk(self.input_proj(x), 2, dim=1)
  196. out = list([x1, x2])
  197. # Bottlenecl
  198. out.extend(m(out[-1]) for m in self.m)
  199. # Output proj
  200. out = self.output_proj(torch.cat(out, dim=1))
  201. return out
  202. # ----------------- Transformer modules -----------------
  203. ## Transformer layer
  204. class TransformerLayer(nn.Module):
  205. def __init__(self,
  206. d_model :int = 256,
  207. num_heads :int = 8,
  208. mlp_ratio :float = 4.0,
  209. dropout :float = 0.1,
  210. act_type :str = "relu",
  211. ):
  212. super().__init__()
  213. # ----------- Basic parameters -----------
  214. self.d_model = d_model
  215. self.num_heads = num_heads
  216. self.mlp_ratio = mlp_ratio
  217. self.dropout = dropout
  218. self.act_type = act_type
  219. # ----------- Basic parameters -----------
  220. # Multi-head Self-Attn
  221. self.self_attn = nn.MultiheadAttention(d_model, num_heads, dropout=dropout, batch_first=True)
  222. self.dropout = nn.Dropout(dropout)
  223. self.norm = nn.LayerNorm(d_model)
  224. # Feedforwaed Network
  225. self.ffn = FFN(d_model, mlp_ratio, dropout, act_type)
  226. def with_pos_embed(self, tensor, pos):
  227. return tensor if pos is None else tensor + pos
  228. def forward(self, src, pos):
  229. """
  230. Input:
  231. src: [torch.Tensor] -> [B, N, C]
  232. pos: [torch.Tensor] -> [B, N, C]
  233. Output:
  234. src: [torch.Tensor] -> [B, N, C]
  235. """
  236. q = k = self.with_pos_embed(src, pos)
  237. # -------------- MHSA --------------
  238. src2 = self.self_attn(q, k, value=src)
  239. src = src + self.dropout(src2)
  240. src = self.norm(src)
  241. # -------------- FFN --------------
  242. src = self.ffn(src)
  243. return src