Browse Source

modify the train script

yjh0410 2 years ago
parent
commit
677d92909d
6 changed files with 133 additions and 104 deletions
  1. 4 4
      engine.py
  2. 1 1
      models/detectors/__init__.py
  3. 13 12
      train.py
  4. 57 32
      train_multi_gpus.sh
  5. 56 53
      train_single_gpu.sh
  6. 2 2
      utils/solver/optimizer.py

+ 4 - 4
engine.py

@@ -80,7 +80,7 @@ class Yolov8Trainer(object):
         # ---------------------------- Build LR Scheduler ----------------------------
         self.lr_scheduler, self.lf = build_lr_scheduler(self.lr_schedule_dict, self.optimizer, self.args.max_epoch)
         self.lr_scheduler.last_epoch = self.start_epoch - 1  # do not move
-        if self.args.resume:
+        if self.args.resume and self.args.resume != 'None':
             self.lr_scheduler.step()
 
         # ---------------------------- Build Model-EMA ----------------------------
@@ -442,7 +442,7 @@ class YoloxTrainer(object):
         # ---------------------------- Build LR Scheduler ----------------------------
         self.lr_scheduler, self.lf = build_lr_scheduler(self.lr_schedule_dict, self.optimizer, self.args.max_epoch - self.no_aug_epoch)
         self.lr_scheduler.last_epoch = self.start_epoch - 1  # do not move
-        if self.args.resume:
+        if self.args.resume and self.args.resume != 'None':
             self.lr_scheduler.step()
 
         # ---------------------------- Build Model-EMA ----------------------------
@@ -805,7 +805,7 @@ class RTCTrainer(object):
         # ---------------------------- Build LR Scheduler ----------------------------
         self.lr_scheduler, self.lf = build_lr_scheduler(self.lr_schedule_dict, self.optimizer, args.max_epoch - args.no_aug_epoch)
         self.lr_scheduler.last_epoch = self.start_epoch - 1  # do not move
-        if self.args.resume:
+        if self.args.resume and self.args.resume != 'None':
             self.lr_scheduler.step()
 
         # ---------------------------- Build Model-EMA ----------------------------
@@ -1168,7 +1168,7 @@ class RTRTrainer(object):
         # ---------------------------- Build LR Scheduler ----------------------------
         self.lr_scheduler, self.lf = build_lr_scheduler(self.lr_schedule_dict, self.optimizer, args.max_epoch - args.no_aug_epoch)
         self.lr_scheduler.last_epoch = self.start_epoch - 1  # do not move
-        if self.args.resume:
+        if self.args.resume and self.args.resume != 'None':
             self.lr_scheduler.step()
 
         # ---------------------------- Build Model-EMA ----------------------------

+ 1 - 1
models/detectors/__init__.py

@@ -77,7 +77,7 @@ def build_model(args,
             model.load_state_dict(checkpoint_state_dict, strict=False)
 
         # keep training
-        if args.resume is not None:
+        if args.resume and args.resume != "None":
             print('keep training: ', args.resume)
             checkpoint = torch.load(args.resume, map_location='cpu')
             # checkpoint state dict

+ 13 - 12
train.py

@@ -1,5 +1,6 @@
 from __future__ import division
 
+import os
 import argparse
 from copy import deepcopy
 
@@ -115,16 +116,25 @@ def train():
     print("----------------------------------------------------------")
 
     # Build DDP
+    local_rank = local_process_rank = -1
     if args.distributed:
         distributed_utils.init_distributed_mode(args)
         print("git:\n  {}\n".format(distributed_utils.get_sha()))
+        try:
+            # Multiple Machines & Multiple GPUs (world size > 8)
+            local_rank = torch.distributed.get_rank()
+            local_process_rank = int(os.getenv('LOCAL_PROCESS_RANK', '0'))
+        except:
+            # Single Machine & Multiple GPUs (world size <= 8)
+            local_rank = local_process_rank = torch.distributed.get_rank()
     world_size = distributed_utils.get_world_size()
-    print('World size: {}'.format(world_size))
+    print("LOCAL RANK: ", local_rank)
+    print("LOCAL_PROCESS_RANL: ", local_process_rank)
+    print('WORLD SIZE: {}'.format(world_size))
 
     # Build CUDA
-    if args.cuda:
+    if args.cuda and torch.cuda.is_available():
         print('use cuda')
-        # cudnn.benchmark = True
         device = torch.device("cuda")
     else:
         device = torch.device("cpu")
@@ -136,15 +146,6 @@ def train():
 
     # Build Model
     model, criterion = build_model(args, model_cfg, device, data_cfg['num_classes'], True)
-
-    # Keep training
-    if distributed_utils.is_main_process and args.resume is not None:
-        print('keep training: ', args.resume)
-        checkpoint = torch.load(args.resume, map_location='cpu')
-        # checkpoint state dict
-        checkpoint_state_dict = checkpoint.pop("model")
-        model.load_state_dict(checkpoint_state_dict)
-
     model = model.to(device).train()
     model_without_ddp = model
     if args.sybn and args.distributed:

+ 57 - 32
train_multi_gpus.sh

@@ -1,39 +1,64 @@
-# -------------------------- Train YOLOX & YOLOv7 --------------------------
-# python -m torch.distributed.run --nproc_per_node=8 train.py \
-#                                                     --cuda \
-#                                                     -dist \
-#                                                     -d coco \
-#                                                     --root /data/datasets/ \
-#                                                     -m rtcdet_n \
-#                                                     -bs 64 \
-#                                                     -size 640 \
-#                                                     --wp_epoch 3 \
-#                                                     --max_epoch 300 \
-#                                                     --eval_epoch 10 \
-#                                                     --no_aug_epoch 15 \
-#                                                     --ema \
-#                                                     --fp16 \
-#                                                     --sybn \
-#                                                     --multi_scale \
-#                                                     # --load_cache \
-#                                                     # --resume weights/coco/yolox_l/yolox_l_best.pth \
+# Dataset setting
+DATASET="coco"
+DATA_ROOT="/data/datasets/"
+# DATA_ROOT="/Users/liuhaoran/Desktop/python_work/object-detection/dataset/"
 
-# -------------------------- Train YOLOv1~v5 --------------------------
+# MODEL setting
+MODEL="yolov8_n"
+IMAGE_SIZE=640
+RESUME="None"
+if [[ $MODEL == *"yolov8"* ]]; then
+    # Epoch setting
+    BATCH_SIZE=128
+    MAX_EPOCH=500
+    WP_EPOCH=3
+    EVAL_EPOCH=10
+    NO_AUG_EPOCH=20
+elif [[ $MODEL == *"yolox"* ]]; then
+    # Epoch setting
+    BATCH_SIZE=64
+    MAX_EPOCH=300
+    WP_EPOCH=3
+    EVAL_EPOCH=10
+    NO_AUG_EPOCH=15
+elif [[ $MODEL == *"yolov7"* ]]; then
+    # Epoch setting
+    BATCH_SIZE=128
+    MAX_EPOCH=300
+    WP_EPOCH=3
+    EVAL_EPOCH=10
+    NO_AUG_EPOCH=20
+elif [[ $MODEL == *"yolov5"* || $MODEL == *"yolov4"* || $MODEL == *"yolov3"* ]]; then
+    # Epoch setting
+    BATCH_SIZE=128
+    MAX_EPOCH=300
+    WP_EPOCH=3
+    EVAL_EPOCH=10
+    NO_AUG_EPOCH=15
+else
+    # Epoch setting
+    BATCH_SIZE=128
+    MAX_EPOCH=150
+    WP_EPOCH=3
+    EVAL_EPOCH=10
+    NO_AUG_EPOCH=0
+fi
+
+# -------------------------- Train Pipeline --------------------------
 python -m torch.distributed.run --nproc_per_node=8 train.py \
                                                     --cuda \
                                                     -dist \
-                                                    -d coco \
-                                                    --root /data/datasets/ \
-                                                    -m yolov8_n\
-                                                    -bs 128 \
-                                                    -size 640 \
-                                                    --wp_epoch 3 \
-                                                    --max_epoch 500 \
-                                                    --eval_epoch 10 \
-                                                    --no_aug_epoch 20 \
+                                                    --dataset ${DATASET} \
+                                                    --root ${DATA_ROOT} \
+                                                    --model ${MODEL} \
+                                                    --batch_size ${BATCH_SIZE} \
+                                                    --img_size ${IMAGE_SIZE} \
+                                                    --wp_epoch ${WP_EPOCH} \
+                                                    --max_epoch ${MAX_EPOCH} \
+                                                    --eval_epoch ${EVAL_EPOCH} \
+                                                    --no_aug_epoch ${NO_AUG_EPOCH} \
+                                                    --resume ${RESUME} \
                                                     --ema \
                                                     --fp16 \
-                                                    --sybn \
                                                     --multi_scale \
-                                                    # --load_cache
-                                                    # --resume weights/coco/yolov5_l/yolov5_l_best.pth \
+                                                    --sybn \

+ 56 - 53
train_single_gpu.sh

@@ -1,59 +1,62 @@
-# -------------------------- Train RTCDet series --------------------------
-# python train.py \
-#         --cuda \
-#         -d coco \
-#         --root /data/datasets/ \
-#         -m rtrdet_l \
-#         -bs 16 \
-#         -size 640 \
-#         --wp_epoch 1 \
-#         --max_epoch 300 \
-#         --eval_epoch 10 \
-#         --no_aug_epoch 20 \
-#         --grad_accumulate 1 \
-#         --ema \
-#         --fp16 \
-#         --multi_scale \
-#         --eval_first \
-#         # --load_cache \
-#         # --resume weights/coco/yolox_m/yolox_m_best.pth \
-#         # --eval_first
+# Dataset setting
+DATASET="coco"
+DATA_ROOT="/data/datasets/"
+# DATA_ROOT="/Users/liuhaoran/Desktop/python_work/object-detection/dataset/"
 
-# -------------------------- Train YOLOX & YOLOv7 series --------------------------
-# python train.py \
-#         --cuda \
-#         -d coco \
-#         --root /data/datasets/ \
-#         -m yolox_s \
-#         -bs 8 \
-#         -size 640 \
-#         --wp_epoch 3 \
-#         --max_epoch 300 \
-#         --eval_epoch 10 \
-#         --no_aug_epoch 15 \
-#         --grad_accumulate 8 \
-#         --ema \
-#         --fp16 \
-#         --multi_scale \
-#         # --load_cache \
-#         # --resume weights/coco/yolox_m/yolox_m_best.pth \
-#         # --eval_first
+# MODEL setting
+MODEL="yolov8_n"
+IMAGE_SIZE=640
+RESUME="None"
+if [[ $MODEL == *"yolov8"* ]]; then
+    # Epoch setting
+    MAX_EPOCH=500
+    BATCH_SIZE=16
+    WP_EPOCH=3
+    EVAL_EPOCH=10
+    NO_AUG_EPOCH=20
+elif [[ $MODEL == *"yolox"* ]]; then
+    # Epoch setting
+    MAX_EPOCH=300
+    BATCH_SIZE=16
+    WP_EPOCH=3
+    EVAL_EPOCH=10
+    NO_AUG_EPOCH=15
+elif [[ $MODEL == *"yolov7"* ]]; then
+    # Epoch setting
+    MAX_EPOCH=300
+    BATCH_SIZE=16
+    WP_EPOCH=3
+    EVAL_EPOCH=10
+    NO_AUG_EPOCH=20
+elif [[ $MODEL == *"yolov5"* || $MODEL == *"yolov4"* || $MODEL == *"yolov3"* ]]; then
+    # Epoch setting
+    MAX_EPOCH=300
+    BATCH_SIZE=16
+    WP_EPOCH=3
+    EVAL_EPOCH=10
+    NO_AUG_EPOCH=15
+else
+    # Epoch setting
+    MAX_EPOCH=150
+    BATCH_SIZE=16
+    WP_EPOCH=3
+    EVAL_EPOCH=10
+    NO_AUG_EPOCH=0
+fi
 
-# -------------------------- Train YOLOv1~v5 series --------------------------
+# -------------------------- Train Pipeline --------------------------
 python train.py \
         --cuda \
-        -d coco \
-        --root /data/datasets/ \
-        -m yolov8_n \
-        -bs 16 \
-        -size 640 \
-        --wp_epoch 3 \
-        --max_epoch 500 \
-        --eval_epoch 10 \
-        --no_aug_epoch 10 \
+        --dataset ${DATASET} \
+        --root ${DATA_ROOT} \
+        --model ${MODEL} \
+        --batch_size ${BATCH_SIZE} \
+        --img_size ${IMAGE_SIZE} \
+        --wp_epoch ${WP_EPOCH} \
+        --max_epoch ${MAX_EPOCH} \
+        --eval_epoch ${EVAL_EPOCH} \
+        --no_aug_epoch ${NO_AUG_EPOCH} \
+        --resume ${RESUME} \
         --ema \
         --fp16 \
-        --multi_scale \
-        # --load_cache \
-        # --resume weights/coco/yolov5_l/yolov5_l_best.pth \
-        # --eval_first
+        --multi_scale

+ 2 - 2
utils/solver/optimizer.py

@@ -32,7 +32,7 @@ def build_yolo_optimizer(cfg, model, resume=None):
     optimizer.add_param_group({'params': g[1], 'weight_decay': 0.0})                  # add g1 (BatchNorm2d weights)
 
     start_epoch = 0
-    if resume is not None:
+    if resume and resume != "None":
         print('keep training: ', resume)
         checkpoint = torch.load(resume)
         # checkpoint state dict
@@ -66,7 +66,7 @@ def build_detr_optimizer(cfg, model, resume=None):
         raise NotImplementedError('Optimizer {} not implemented.'.format(cfg['optimizer']))
 
     start_epoch = 0
-    if resume is not None:
+    if resume and resume != 'None':
         print('keep training: ', resume)
         checkpoint = torch.load(resume)
         # checkpoint state dict