#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.

import argparse
import random
import sys
import warnings

from loguru import logger

import torch
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.multiprocessing as mp

from unicorn.core import Trainer
from unicorn.exp import get_exp
from unicorn.utils import configure_nccl, configure_omp, get_num_devices
import unicorn.utils.dist as comm


def launch(local_rank, main_func, num_gpus_per_machine, backend, args):
    assert torch.cuda.is_available(), "cuda is not available. Please check your installation."
    dist.init_process_group(backend=backend)
    global_rank = dist.get_rank()
    logger.info("Rank {} initialization finished.".format(global_rank))

    # Setup the local process group (which contains ranks within the same machine)
    assert comm._LOCAL_PROCESS_GROUP is None
    num_machines = dist.get_world_size() // num_gpus_per_machine
    machine_rank = dist.get_rank() // num_gpus_per_machine
    for i in range(num_machines):
        ranks_on_i = list(
            range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine)
        )
        pg = dist.new_group(ranks_on_i)
        if i == machine_rank:
            comm._LOCAL_PROCESS_GROUP = pg

    # synchronize is needed here to prevent a possible timeout after calling init_process_group
    # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
    comm.synchronize()

    assert num_gpus_per_machine <= torch.cuda.device_count()
    torch.cuda.set_device(local_rank)

    main_func(*args)


"""2021.10.03 Support distributed training on arnold"""


def make_parser():
    parser = argparse.ArgumentParser("YOLOX train parser")
    parser.add_argument("-expn", "--experiment-name", type=str, default=None)
    parser.add_argument("-n", "--name", type=str, default=None, help="model name")
    # distributed
    parser.add_argument(
        "--dist-backend", default="nccl", type=str, help="distributed backend"
    )
    parser.add_argument("-b", "--batch-size", type=int, default=64, help="batch size")
    parser.add_argument(
        "-d", "--devices", default=None, type=int, help="number of devices for training"
    )
    parser.add_argument(
        "-f",
        "--exp_file",
        default=None,
        type=str,
        help="please input your experiment description file",
    )
    parser.add_argument(
        "--resume", default=False, action="store_true", help="resume training"
    )
    parser.add_argument("-c", "--ckpt", default=None, type=str, help="checkpoint file")
    parser.add_argument(
        "-e",
        "--start_epoch",
        default=None,
        type=int,
        help="resume training start epoch",
    )
    parser.add_argument(
        "--fp16",
        dest="fp16",
        default=False,
        action="store_true",
        help="Adopt mixed precision training.",
    )
    parser.add_argument(
        "--cache",
        dest="cache",
        default=False,
        action="store_true",
        help="Cache images to RAM for fast training.",
    )
    parser.add_argument(
        "-o",
        "--occupy",
        dest="occupy",
        default=False,
        action="store_true",
        help="occupy GPU memory first for training.",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    parser.add_argument("--local_rank", type=int)
    return parser


@logger.catch
def main(exp, args):
    if exp.seed is not None:
        random.seed(exp.seed)
        torch.manual_seed(exp.seed)
        cudnn.deterministic = True
        warnings.warn(
            "You have chosen to seed training. This will turn on the CUDNN deterministic setting, "
            "which can slow down your training considerably! You may see unexpected behavior "
            "when restarting from checkpoints."
        )

    # set environment variables for distributed training
    configure_nccl()
    configure_omp()
    cudnn.benchmark = True

    trainer = Trainer(exp, args)
    trainer.train()


if __name__ == "__main__":
    args = make_parser().parse_args()
    exp = get_exp(args.exp_file, args.name)
    exp.merge(args.opts)

    if not args.experiment_name:
        args.experiment_name = exp.exp_name

    num_gpu = get_num_devices() if args.devices is None else args.devices
    assert num_gpu <= get_num_devices()

    local_rank = args.local_rank
    launch(
        local_rank,
        main,
        num_gpu,
        backend=args.dist_backend,
        args=(exp, args),
    )
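

# ---------------------------------------------------------------------------
# Usage sketch (an assumption, not part of the original file): the script reads
# `--local_rank` from the command line and `dist.init_process_group()` relies on
# the default env:// rendezvous (MASTER_ADDR, MASTER_PORT, RANK, WORLD_SIZE),
# so it is intended to be started by the torch.distributed launcher with one
# process per GPU, e.g.:
#
#   python -m torch.distributed.launch --nproc_per_node=8 \
#       <path/to/this/script> -f <exp_file> -d 8 -b 64 --fp16 -o
#
# The script path, experiment file, and flag values above are placeholders.
# ---------------------------------------------------------------------------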