# Copyright (c) OpenMMLab. All rights reserved. import argparse import copy import multiprocessing as mp import os import os.path as osp import platform import time import warnings import cv2 import mmcv import torch from mmcv import Config, DictAction from mmcv.runner import get_dist_info, init_dist from mmcv.utils import get_git_hash from mmgen import __version__ from mmgen.apis import set_random_seed, train_model from mmgen.datasets import build_dataset from mmgen.models import build_model from mmgen.utils import collect_env, get_root_logger cv2.setNumThreads(0) def parse_args(): parser = argparse.ArgumentParser(description='Train a GAN model') parser.add_argument('config', help='train config file path') parser.add_argument('--work-dir', help='the dir to save logs and models') parser.add_argument( '--resume-from', help='the checkpoint file to resume from') parser.add_argument( '--no-validate', action='store_true', help='whether not to evaluate the checkpoint during training') group_gpus = parser.add_mutually_exclusive_group() group_gpus.add_argument( '--gpus', type=int, help='(Deprecated, please use --gpu-id) number of gpus to use ' '(only applicable to non-distributed training)') group_gpus.add_argument( '--gpu-ids', type=int, nargs='+', help='(Deprecated, please use --gpu-id) ids of gpus to use ' '(only applicable to non-distributed training)') group_gpus.add_argument( '--gpu-id', type=int, default=0, help='id of gpu to use ' '(only applicable to non-distributed training)') parser.add_argument('--seed', type=int, default=2021, help='random seed') parser.add_argument( '--diff_seed', action='store_true', help='Whether or not set different seeds for different ranks') parser.add_argument( '--deterministic', action='store_true', help='whether to set deterministic options for CUDNN backend.') parser.add_argument( '--cfg-options', nargs='+', action=DictAction, help='override some settings in the used config, the key-value pair ' 'in xxx=yyy format will be merged into config file.') parser.add_argument( '--launcher', choices=['none', 'pytorch', 'slurm', 'mpi'], default='none', help='job launcher') parser.add_argument('--local_rank', type=int, default=0) args = parser.parse_args() if 'LOCAL_RANK' not in os.environ: os.environ['LOCAL_RANK'] = str(args.local_rank) return args def setup_multi_processes(cfg): # set multi-process start method as `fork` to speed up the training if platform.system() != 'Windows': mp_start_method = cfg.get('mp_start_method', 'fork') mp.set_start_method(mp_start_method) # disable opencv multithreading to avoid system being overloaded opencv_num_threads = cfg.get('opencv_num_threads', 0) cv2.setNumThreads(opencv_num_threads) # setup OMP threads # This code is referred from https://github.com/pytorch/pytorch/blob/master/torch/distributed/run.py # noqa if ('OMP_NUM_THREADS' not in os.environ and cfg.data.workers_per_gpu > 1): omp_num_threads = 1 warnings.warn( f'Setting OMP_NUM_THREADS environment variable for each process ' f'to be {omp_num_threads} in default, to avoid your system being ' f'overloaded, please further tune the variable for optimal ' f'performance in your application as needed.') os.environ['OMP_NUM_THREADS'] = str(omp_num_threads) # setup MKL threads if 'MKL_NUM_THREADS' not in os.environ and cfg.data.workers_per_gpu > 1: mkl_num_threads = 1 warnings.warn( f'Setting MKL_NUM_THREADS environment variable for each process ' f'to be {mkl_num_threads} in default, to avoid your system being ' f'overloaded, please further tune the variable for optimal ' f'performance in your application as needed.') os.environ['MKL_NUM_THREADS'] = str(mkl_num_threads) def main(): args = parse_args() cfg = Config.fromfile(args.config) if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) setup_multi_processes(cfg) # import modules from string list. if cfg.get('custom_imports', None): from mmcv.utils import import_modules_from_strings import_modules_from_strings(**cfg['custom_imports']) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True # work_dir is determined in this priority: CLI > segment in file > filename if args.work_dir is not None: # update configs according to CLI args if args.work_dir is not None cfg.work_dir = args.work_dir elif cfg.get('work_dir', None) is None: # use config filename as default work_dir if cfg.work_dir is None cfg.work_dir = osp.join('./work_dirs', osp.splitext(osp.basename(args.config))[0]) if args.resume_from is not None: cfg.resume_from = args.resume_from if args.gpus is not None: cfg.gpu_ids = range(1) warnings.warn('`--gpus` is deprecated because we only support ' 'single GPU mode in non-distributed training. ' 'Use `gpus=1` now.') if args.gpu_ids is not None: cfg.gpu_ids = args.gpu_ids[0:1] warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. ' 'Because we only support single GPU mode in ' 'non-distributed training. Use the first GPU ' 'in `gpu_ids` now.') if args.gpus is None and args.gpu_ids is None: cfg.gpu_ids = [args.gpu_id] # init distributed env first, since logger depends on the dist info. if args.launcher == 'none': distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) # re-set gpu_ids with distributed training mode _, world_size = get_dist_info() cfg.gpu_ids = range(world_size) # create work_dir mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) # dump config cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config))) # init the logger before other steps timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) log_file = osp.join(cfg.work_dir, f'{timestamp}.log') logger = get_root_logger(log_file=log_file, log_level=cfg.log_level) # init the meta dict to record some important information such as # environment info and seed, which will be logged meta = dict() # log env info env_info_dict = collect_env() env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) dash_line = '-' * 60 + '\n' logger.info('Environment info:\n' + dash_line + env_info + '\n' + dash_line) meta['env_info'] = env_info meta['config'] = cfg.pretty_text # log some basic info logger.info(f'Distributed training: {distributed}') logger.info(f'Config:\n{cfg.pretty_text}') # set random seeds if args.seed is not None: logger.info(f'Set random seed to {args.seed}, ' f'deterministic: {args.deterministic}, ' f'use_rank_shift: {args.diff_seed}') set_random_seed( args.seed, deterministic=args.deterministic, use_rank_shift=args.diff_seed) cfg.seed = args.seed meta['seed'] = args.seed meta['exp_name'] = osp.basename(args.config) model = build_model( cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg) datasets = [build_dataset(cfg.data.train)] if len(cfg.workflow) == 2: val_dataset = copy.deepcopy(cfg.data.val) val_dataset.pipeline = cfg.data.val.pipeline datasets.append(build_dataset(val_dataset)) if cfg.checkpoint_config is not None: # save mmgen version, config file content and class names in # checkpoints as meta data cfg.checkpoint_config.meta = dict(mmgen_version=__version__ + get_git_hash()[:7]) train_model( model, datasets, cfg, distributed=distributed, validate=(not args.no_validate), timestamp=timestamp, meta=meta) if __name__ == '__main__': main()