from __future__ import division

import os
import argparse
from copy import deepcopy

import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

from utils import distributed_utils
from utils.com_flops_params import FLOPs_and_Params
from utils.misc import ModelEMA, CollateFunc, build_dataset, build_dataloader
from utils.solver.optimizer import build_optimizer
from utils.solver.lr_scheduler import build_lr_scheduler

from engine import train_one_epoch, val_one_epoch

from config import build_model_config, build_trans_config
from models import build_model


def parse_args():
    """Build the training command-line parser and parse ``sys.argv``.

    Options are registered in the same order as they appear in ``--help``:
    basic runtime settings, batch size, epoch schedule, model selection,
    dataset location, training tricks and distributed (DDP) settings.

    Returns:
        argparse.Namespace: the parsed command-line options.
    """
    parser = argparse.ArgumentParser(description='YOLO-Tutorial')
    add = parser.add_argument

    def add_switch(*flags, **extra):
        # Boolean switch: False unless the flag is given on the command line.
        add(*flags, action='store_true', default=False, **extra)

    # basic
    add_switch('--cuda', help='use cuda.')
    add('-size', '--img_size', type=int, default=640,
        help='input image size')
    add('--num_workers', type=int, default=4,
        help='Number of workers used in dataloading')
    add_switch('--tfboard', help='use tensorboard')
    add('--save_folder', type=str, default='weights/',
        help='path to save weight')
    add_switch('--eval_first', help='evaluate model before training.')
    add_switch('--fp16', dest="fp16", help="Adopting mix precision training.")
    add_switch('--vis_tgt', help="visualize training data.")

    # Batchsize
    add('-bs', '--batch_size', type=int, default=16,
        help='batch size on all the GPUs.')

    # Epoch
    add('--max_epoch', type=int, default=150,
        help='max epoch.')
    add('--wp_epoch', type=int, default=1,
        help='warmup epoch.')
    add('--eval_epoch', type=int, default=10,
        help='after eval epoch, the model is evaluated on val dataset.')
    add('--step_epoch', type=int, nargs='+', default=[90, 120],
        help='lr epoch to decay')

    # model
    supported_models = ['yolov1', 'yolov2', 'yolov3', 'yolov4', 'yolov5', 'yolov7', 'yolox']
    add('-m', '--model', type=str, default='yolov1',
        choices=supported_models, help='build yolo')
    add('-ct', '--conf_thresh', type=float, default=0.005,
        help='confidence threshold')
    add('-nt', '--nms_thresh', type=float, default=0.6,
        help='NMS threshold')
    add('--topk', type=int, default=1000,
        help='topk candidates for evaluation')
    add('-p', '--pretrained', type=str, default=None,
        help='load pretrained weight')
    add('-r', '--resume', type=str, default=None,
        help='keep training')

    # dataset
    add('--root', default='/mnt/share/ssd2/dataset',
        help='data root')
    add('-d', '--dataset', default='coco',
        help='coco, voc, widerface, crowdhuman')

    # train trick
    add_switch('-ms', '--multi_scale', help='Multi scale')
    add_switch('--ema', help='Model EMA')
    add('--min_box_size', type=float, default=8.0,
        help='min size of target bounding box.')
    add('--mosaic', type=float, default=None,
        help='mosaic augmentation.')
    add('--mixup', type=float, default=None,
        help='mixup augmentation.')

    # DDP train
    add_switch('-dist', '--distributed', help='distributed training')
    add('--dist_url', default='env://',
        help='url used to set up distributed training')
    add('--world_size', type=int, default=1,
        help='number of distributed processes')
    add_switch('--sybn', help='use sybn.')

    return parser.parse_args()


def train():
    """Run the training pipeline configured from the command line.

    In order: parse the arguments, initialise distributed mode when
    ``--distributed`` is set, build the dataset, dataloader, model,
    optimizer, LR scheduler and optional EMA, then loop over epochs calling
    ``train_one_epoch`` and, on evaluation epochs, ``val_one_epoch``.

    Returns:
        None. Progress is reported with ``print``; ``path_to_save`` is
        created on disk and handed to ``val_one_epoch``.
    """
    args = parse_args()
    print("Setting Arguments.. : ", args)
    print("----------------------------------------------------------")

    # dist
    # Initialise distributed mode BEFORE asking for the world size: the
    # per-GPU batch is derived from it, and --batch_size is the total over
    # all GPUs. (Presumably get_world_size() reports a single process until
    # the process group exists - confirm in utils.distributed_utils.)
    if args.distributed:
        distributed_utils.init_distributed_mode(args)
        print("git:\n  {}\n".format(distributed_utils.get_sha()))
    world_size = distributed_utils.get_world_size()
    print('World size: {}'.format(world_size))
    per_gpu_batch = args.batch_size // world_size

    # path to save model
    path_to_save = os.path.join(args.save_folder, args.dataset, args.model)
    os.makedirs(path_to_save, exist_ok=True)

    # cuda
    if args.cuda:
        print('use cuda')
        # cudnn.benchmark = True
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    # config
    model_cfg = build_model_config(args)
    trans_cfg = build_trans_config(model_cfg['trans_type'])

    # dataset and evaluator
    dataset, dataset_info, evaluator = build_dataset(args, trans_cfg, device, is_train=True)
    num_classes = dataset_info[0]

    # dataloader
    dataloader = build_dataloader(args, dataset, per_gpu_batch, CollateFunc())

    # build model
    model, criterion = build_model(
        args=args,
        model_cfg=model_cfg,
        device=device,
        num_classes=num_classes,
        trainable=True,
        )
    model = model.to(device).train()

    # SyncBatchNorm
    # Convert the BatchNorm layers before wrapping the model in DDP, which
    # is the order the PyTorch documentation prescribes.
    if args.sybn and args.distributed:
        print('use SyncBatchNorm ...')
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

    # DDP
    # model_without_ddp always refers to the bare module; it is what the
    # optimizer, the FLOPs profiling and the evaluation use.
    model_without_ddp = model
    if args.distributed:
        # args.gpu is not a CLI option; presumably set by
        # init_distributed_mode - confirm in utils.distributed_utils.
        model = DDP(model, device_ids=[args.gpu])
        model_without_ddp = model.module

    # compute FLOPs and Params
    # is_main_process must be CALLED; the bare function object is always
    # truthy and would make every rank run the profiling.
    if distributed_utils.is_main_process():
        model_copy = deepcopy(model_without_ddp)
        model_copy.trainable = False
        model_copy.eval()
        FLOPs_and_Params(model=model_copy,
                         img_size=args.img_size,
                         device=device)
        del model_copy
    if args.distributed:
        # wait for all processes to synchronize
        dist.barrier()

    # amp
    scaler = torch.cuda.amp.GradScaler(enabled=args.fp16)

    # batch size
    # Gradients are accumulated so that total_bs * accumulate is close to 64.
    total_bs = args.batch_size
    accumulate = max(1, round(64 / total_bs))
    print('Grad_Accumulate: ', accumulate)

    # optimizer
    # Scale the weight decay by the ratio of the effective batch size to 64.
    model_cfg['weight_decay'] *= total_bs * accumulate / 64
    optimizer, start_epoch = build_optimizer(model_cfg, model_without_ddp, model_cfg['lr0'], args.resume)

    # Scheduler
    scheduler, lf = build_lr_scheduler(model_cfg, optimizer, args.max_epoch)
    scheduler.last_epoch = start_epoch - 1  # do not move
    if args.resume:
        scheduler.step()

    # EMA
    # Only built on rank -1 / 0; other ranks train without an EMA copy.
    if args.ema and distributed_utils.get_rank() in [-1, 0]:
        print('Build ModelEMA ...')
        ema = ModelEMA(model, decay=model_cfg['ema_decay'], tau=model_cfg['ema_tau'], updates=start_epoch * len(dataloader))
    else:
        ema = None

    # start training loop
    best_map = -1.0
    last_opt_step = -1
    total_epochs = args.max_epoch
    heavy_eval = False
    optimizer.zero_grad()

    # eval before training
    if args.eval_first and distributed_utils.is_main_process():
        # to check whether the evaluator can work
        model_eval = ema.ema if ema else model_without_ddp
        val_one_epoch(
            args=args, model=model_eval, evaluator=evaluator, optimizer=optimizer,
            epoch=0, best_map=best_map, path_to_save=path_to_save)

    # start to train
    for epoch in range(start_epoch, total_epochs):
        if args.distributed:
            dataloader.batch_sampler.sampler.set_epoch(epoch)

        # check second stage
        if epoch >= (total_epochs - model_cfg['no_aug_epoch'] - 1):
            # close mosaic augmentation
            if dataloader.dataset.mosaic_prob > 0.:
                print('close Mosaic Augmentation ...')
                dataloader.dataset.mosaic_prob = 0.
                heavy_eval = True
            # close mixup augmentation
            if dataloader.dataset.mixup_prob > 0.:
                print('close Mixup Augmentation ...')
                dataloader.dataset.mixup_prob = 0.
                heavy_eval = True

        # train one epoch
        last_opt_step = train_one_epoch(
            epoch=epoch,
            total_epochs=total_epochs,
            args=args,
            device=device,
            ema=ema,
            model=model,
            criterion=criterion,
            cfg=model_cfg,
            dataloader=dataloader,
            optimizer=optimizer,
            scheduler=scheduler,
            lf=lf,
            scaler=scaler,
            last_opt_step=last_opt_step)

        # eval
        # Once heavy_eval is set (augmentation closed) evaluate every epoch;
        # otherwise every eval_epoch epochs and on the final epoch.
        if heavy_eval or (epoch % args.eval_epoch) == 0 or (epoch == total_epochs - 1):
            best_map = val_one_epoch(
                args=args,
                model=ema.ema if ema else model_without_ddp,
                evaluator=evaluator,
                optimizer=optimizer,
                epoch=epoch,
                best_map=best_map,
                path_to_save=path_to_save)

    # Empty cache after train loop
    if args.cuda:
        torch.cuda.empty_cache()

# Script entry point: run training only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    train()