@@ -1,5 +1,6 @@
 from __future__ import division
 
+import os
 import argparse
 from copy import deepcopy
 
@@ -115,16 +116,25 @@ def train():
     print("----------------------------------------------------------")
 
     # Build DDP
+    local_rank = local_process_rank = -1
     if args.distributed:
         distributed_utils.init_distributed_mode(args)
         print("git:\n {}\n".format(distributed_utils.get_sha()))
+        try:
+            # Multiple machines & multiple GPUs (world size > 8)
+            local_rank = torch.distributed.get_rank()
+            local_process_rank = int(os.getenv('LOCAL_PROCESS_RANK', '0'))
+        except Exception:
+            # Single machine & multiple GPUs (world size <= 8)
+            local_rank = local_process_rank = torch.distributed.get_rank()
     world_size = distributed_utils.get_world_size()
-    print('World size: {}'.format(world_size))
+    print("LOCAL RANK: ", local_rank)
+    print("LOCAL_PROCESS_RANK: ", local_process_rank)
+    print('WORLD SIZE: {}'.format(world_size))
 
     # Build CUDA
-    if args.cuda:
+    if args.cuda and torch.cuda.is_available():
         print('use cuda')
-        # cudnn.benchmark = True
         device = torch.device("cuda")
     else:
         device = torch.device("cpu")
@@ -136,15 +146,6 @@ def train():
 
     # Build Model
     model, criterion = build_model(args, model_cfg, device, data_cfg['num_classes'], True)
-
-    # Keep training
-    if distributed_utils.is_main_process and args.resume is not None:
-        print('keep training: ', args.resume)
-        checkpoint = torch.load(args.resume, map_location='cpu')
-        # checkpoint state dict
-        checkpoint_state_dict = checkpoint.pop("model")
-        model.load_state_dict(checkpoint_state_dict)
-
     model = model.to(device).train()
     model_without_ddp = model
     if args.sybn and args.distributed: