2 years ago · 6f4f753562
--- a/config/data_config/transform_config.py
+++ b/config/data_config/transform_config.py
@@ -451,7 +451,7 @@ rtcdet_v2_nano_trans_config = {
 
				     'hsv_v': 0.4,
			
 
				     # Mosaic & Mixup
			
 
				     'mosaic_prob': 1.0,
			
 
				-    'mixup_prob': 0.5,
			
 
				+    'mixup_prob': 0.1,
			
 
				     'mosaic_type': 'yolov5_mosaic',
			
 
				     'mixup_type': 'yolov5_mixup',
			
 
				     'mixup_scale': [0.5, 1.5]   # "mixup_scale" is not used for YOLOv5MixUp
			
--- a/dataset/scripts/data_to_h5py.py
+++ b/dataset/scripts/data_to_h5py.py
@@ -0,0 +1,61 @@
 
				+import cv2
			
 
				+import h5py
			
 
				+import os
			
 
				+import argparse
			
 
				+import numpy as np
			
 
				+import sys
			
 
				+
			
 
				+sys.path.append('..')
			
 
				+from voc import VOCDetection
			
 
				+from coco import COCODataset
			
 
				+
			
 
				+# ---------------------- Opt ----------------------
			
 
				+parser = argparse.ArgumentParser(description='Cache-Dataset')
			
 
				+parser.add_argument('-d', '--dataset', default='voc',
			
 
				+                    help='coco, voc, widerface, crowdhuman')
			
 
				+parser.add_argument('--root', default='/Users/liuhaoran/Desktop/python_work/object-detection/dataset/',
			
 
				+                    help='data root')
			
 
				+parser.add_argument('-size', '--img_size', default=640, type=int,
			
 
				+                    help='input image size.')
			
 
				+parser.add_argument('--mosaic', default=None, type=float,
			
 
				+                    help='mosaic augmentation.')
			
 
				+parser.add_argument('--mixup', default=None, type=float,
			
 
				+                    help='mixup augmentation.')
			
 
				+parser.add_argument('--keep_ratio', action="store_true", default=False,
			
 
				+                    help='keep aspect ratio.')
			
 
				+
			
 
				+args = parser.parse_args()
			
 
				+
			
 
				+
			
 
				+# ---------------------- Build Dataset ----------------------
			
 
				+if args.dataset == 'voc':
			
 
				+    root = os.path.join(args.root, 'VOCdevkit')
			
 
				+    dataset = VOCDetection(args.img_size, root)
			
 
				+elif args.dataset == 'coco':
			
 
				+    root = os.path.join(args.root, 'COCO')
			
 
				+    dataset = COCODataset(args.img_size, args.root)
			
 
				+print('Data length: ', len(dataset))
			
 
				+
			
 
				+
			
 
				+# ---------------------- Main Process ----------------------
			
 
				+for i in range(len(dataset)):
			
 
				+    # load an image
			
 
				+    image, image_id = dataset.pull_image(i)
			
 
				+    orig_h, orig_w, _ = image.shape
			
 
				+
			
 
				+    # resize image
			
 
				+    if args.keep_ratio:
			
 
				+        r = args.img_size / max(orig_h, orig_w)
			
 
				+        if r != 1: 
			
 
				+            interp = cv2.INTER_LINEAR
			
 
				+            new_size = (int(orig_w * r), int(orig_h * r))
			
 
				+            image = cv2.resize(image, new_size, interpolation=interp)
			
 
				+        pad_image = np.ones([args.img_size, args.img_size, image.shape[2]], dtype=image.dtype) * 114
			
 
				+        pad_image[:new_size[1], :new_size[0]] = image
			
 
				+        image = pad_image
			
 
				+    else:
			
 
				+        image = cv2.resize(image, (int(args.img_size), int(args.img_size)))
			
 
				+
			
 
				+    cv2.imshow('image', image)
			
 
				+    # cv2.imwrite(str(i)+'.jpg', img)
			
 
				+    cv2.waitKey(0)
			
--- a/dataset/voc.py
+++ b/dataset/voc.py
@@ -97,7 +97,9 @@ class VOCDetection(data.Dataset):
 
				                  image_sets=[('2007', 'trainval'), ('2012', 'trainval')],
			
 
				                  trans_config=None,
			
 
				                  transform=None,
			
 
				-                 is_train=False):
			
 
				+                 is_train=False,
			
 
				+                 load_cache=False
			
 
				+                 ):
			
 
				         self.root = data_dir
			
 
				         self.img_size = img_size
			
 
				         self.image_set = image_sets
			
@@ -106,6 +108,7 @@ class VOCDetection(data.Dataset):
 
				         self._imgpath = osp.join('%s', 'JPEGImages', '%s.jpg')
			
 
				         self.ids = list()
			
 
				         self.is_train = is_train
			
 
				+        self.load_cache = load_cache
			
 
				         for (year, name) in image_sets:
			
 
				             rootpath = osp.join(self.root, 'VOC' + year)
			
 
				             for line in open(osp.join(rootpath, 'ImageSets', 'Main', name + '.txt')):
			
@@ -121,6 +124,10 @@ class VOCDetection(data.Dataset):
 
				         print('use Mixup Augmentation: {}'.format(self.mixup_prob))
			
 
				         print('==============================')
			
 
				 
			
 
				+        # load cache data
			
 
				+        if load_cache:
			
 
				+            self._load_cache()
			
 
				+
			
 
				 
			
 
				     def __getitem__(self, index):
			
 
				         image, target, deltas = self.pull_item(index)
			
@@ -131,24 +138,43 @@ class VOCDetection(data.Dataset):
 
				         return len(self.ids)
			
 
				 
			
 
				 
			
 
				-    def load_image_target(self, index):
			
 
				-        # load an image
			
 
				-        img_id = self.ids[index]
			
 
				-        image = cv2.imread(self._imgpath % img_id)
			
 
				-        height, width, channels = image.shape
			
 
				+    def _load_cache(self):
			
 
				+        # load image cache
			
 
				+        self.image_list = None  # TODO: H5PY file
			
 
				 
			
 
				-        # laod an annotation
			
 
				-        anno = ET.parse(self._annopath % img_id).getroot()
			
 
				-        if self.target_transform is not None:
			
 
				+        # load target cache
			
 
				+        self.target_list = []
			
 
				+        for img_id in self.ids:
			
 
				+            anno = ET.parse(self._annopath % img_id).getroot()
			
 
				             anno = self.target_transform(anno)
			
 
				+            anno = np.array(anno).reshape(-1, 5)
			
 
				+            self.target_list.append({"boxes": anno[:, :4], "labels": anno[:, 4]})
			
 
				+        
			
 
				 
			
 
				-        # guard against no boxes via resizing
			
 
				-        anno = np.array(anno).reshape(-1, 5)
			
 
				-        target = {
			
 
				-            "boxes": anno[:, :4],
			
 
				-            "labels": anno[:, 4],
			
 
				-            "orig_size": [height, width]
			
 
				-        }
			
 
				+    def load_image_target(self, index):
			
 
				+        if self.load_cache:
			
 
				+            image = self.image_list[index]
			
 
				+            target = self.target_list[index]
			
 
				+            height, width, channels = image.shape
			
 
				+            target["orig_size"] = [height, width]
			
 
				+        else:
			
 
				+            # load an image
			
 
				+            img_id = self.ids[index]
			
 
				+            image = cv2.imread(self._imgpath % img_id)
			
 
				+            height, width, channels = image.shape
			
 
				+
			
 
				+            # load an annotation
			
 
				+            anno = ET.parse(self._annopath % img_id).getroot()
			
 
				+            if self.target_transform is not None:
			
 
				+                anno = self.target_transform(anno)
			
 
				+
			
 
				+            # guard against no boxes via resizing
			
 
				+            anno = np.array(anno).reshape(-1, 5)
			
 
				+            target = {
			
 
				+                "boxes": anno[:, :4],
			
 
				+                "labels": anno[:, 4],
			
 
				+                "orig_size": [height, width]
			
 
				+            }
			
 
				         
			
 
				         return image, target
			
 
				 
			
@@ -259,7 +285,8 @@ if __name__ == "__main__":
 
				                         help='mixup augmentation.')
			
 
				     parser.add_argument('--is_train', action="store_true", default=False,
			
 
				                         help='mixup augmentation.')
			
 
				-    
			
 
				+    parser.add_argument('--load_cache', action="store_true", default=False,
			
 
				+                        help='load cached data.')
			
 
				     
			
 
				     args = parser.parse_args()
			
 
				 
			
--- a/engine.py
+++ b/engine.py
@@ -507,7 +507,7 @@ class YoloxTrainer(object):
 
				             images = images.to(self.device, non_blocking=True).float() / 255.
			
 
				 
			
 
				             # Multi scale
			
 
				-            if self.args.multi_scale:
			
 
				+            if self.args.multi_scale and ni % 10 == 0:
			
 
				                 images, targets, img_size = self.rescale_image_targets(
			
 
				                     images, targets, self.model_cfg['stride'], self.args.min_box_size, self.model_cfg['multi_scale'])
			
 
				             else:
			
@@ -590,6 +590,14 @@ class YoloxTrainer(object):
 
				             print(' - Close < perspective of rotation > ...')
			
 
				             self.trans_cfg['perspective'] = 0.0
			
 
				 
			
 
				+        # close random affine
			
 
				+        if 'translate' in self.trans_cfg.keys() and self.trans_cfg['translate'] > 0.0:
			
 
				+            print(' - Close < translate of affine > ...')
			
 
				+            self.trans_cfg['translate'] = 0.0
			
 
				+        if 'scale' in self.trans_cfg.keys():
			
 
				+            print(' - Close < scale of affine >...')
			
 
				+            self.trans_cfg['scale'] = [1.0, 1.0]
			
 
				+
			
 
				         # build a new transform for second stage
			
 
				         print(' - Rebuild transforms ...')
			
 
				         self.train_transform, self.trans_cfg = build_transform(
			
--- a/models/detectors/rtcdet_v2/README.md
+++ b/models/detectors/rtcdet_v2/README.md
@@ -2,19 +2,19 @@
 
				 
			
 
				 |   Model    | Scale | Batch | AP<sup>test<br>0.5:0.95 | AP<sup>test<br>0.5 | AP<sup>val<br>0.5:0.95 | AP<sup>val<br>0.5 | FLOPs<br><sup>(G) | Params<br><sup>(M) | Weight |
			
 
				 |------------|-------|-------|-------------------------|--------------------|------------------------|-------------------|-------------------|--------------------|--------|
			
 
				-| RTCDetv2-N |  640  | 4xb16 |                         |                    |                        |                   |                   |                    |  |
			
 
				-| RTCDetv2-T |  640  | 4xb16 |                         |                    |                        |                   |                   |                    |  |
			
 
				-| RTCDetv2-S |  640  | 4xb16 |                         |                    |                        |                   |                   |                    |  |
			
 
				-| RTCDetv2-M |  640  | 4xb16 |                         |                    |                        |                   |                   |                    |  |
			
 
				-| RTCDetv2-L |  640  | 4xb16 |                         |                    |                        |                   |                   |                    |  |
			
 
				+| RTCDetv2-N |  640  | 8xb16 |                         |                    |                        |                   |                   |                    |  |
			
 
				+| RTCDetv2-T |  640  | 8xb16 |                         |                    |                        |                   |                   |                    |  |
			
 
				+| RTCDetv2-S |  640  | 8xb16 |                         |                    |                        |                   |                   |                    |  |
			
 
				+| RTCDetv2-M |  640  | 8xb16 |                         |                    |                        |                   |                   |                    |  |
			
 
				+| RTCDetv2-L |  640  | 8xb16 |                         |                    |                        |                   |                   |                    |  |
			
 
				 | RTCDetv2-X |  640  |       |                         |                    |                        |                   |                   |                    |  |
			
 
				 
			
 
				 |   Model    | Scale | Batch | AP<sup>test<br>0.5:0.95 | AP<sup>test<br>0.5 | AP<sup>val<br>0.5:0.95 | AP<sup>val<br>0.5 | FLOPs<br><sup>(G) | Params<br><sup>(M) | Weight |
			
 
				 |------------|-------|-------|-------------------------|--------------------|------------------------|-------------------|-------------------|--------------------|--------|
			
 
				-| RTCDetv2-P |  320  | 4xb16 |                         |                    |                        |                   |                   |                    |  |
			
 
				-| RTCDetv2-P |  416  | 4xb16 |                         |                    |                        |                   |                   |                    |  |
			
 
				-| RTCDetv2-P |  512  | 4xb16 |                         |                    |                        |                   |                   |                    |  |
			
 
				-| RTCDetv2-P |  640  | 4xb16 |                         |                    |                        |                   |                   |                    |  |
			
 
				+| RTCDetv2-P |  320  | 8xb16 |                         |                    |                        |                   |                   |                    |  |
			
 
				+| RTCDetv2-P |  416  | 8xb16 |                         |                    |                        |                   |                   |                    |  |
			
 
				+| RTCDetv2-P |  512  | 8xb16 |                         |                    |                        |                   |                   |                    |  |
			
 
				+| RTCDetv2-P |  640  | 8xb16 |                         |                    |                        |                   |                   |                    |  |
			
 
				 
			
 
				 - For training, we train the RTCDetv2 series for 300 epochs on COCO.
			
 
				 - For data augmentation, we use large scale jitter (LSJ), Mosaic augmentation and Mixup augmentation, following the setting of [YOLOX](https://github.com/Megvii-BaseDetection/YOLOX), but we remove the rotation transformation which is used in YOLOX's strong augmentation.
			
--- a/train_ddp.sh
+++ b/train_ddp.sh
@@ -11,9 +11,9 @@ python -m torch.distributed.run --nproc_per_node=8 train.py \
 
				                                                     --wp_epoch 3 \
			
 
				                                                     --max_epoch 300 \
			
 
				                                                     --eval_epoch 10 \
			
 
				-                                                    --no_aug_epoch 20 \
			
 
				+                                                    --no_aug_epoch 15 \
			
 
				                                                     --ema \
			
 
				                                                     --fp16 \
			
 
				                                                     --sybn \
			
 
				                                                     --multi_scale \
			
 
				-                                                    # --resume weights/coco/yolovx_s/yolovx_s_best.pth \
			
 
				+                                                     --resume weights/coco/yolox_l/yolox_l_best.pth \