import torch
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import DistSamplerSeedHook, Runner

from mmdet3d.core import build_optimizer
from mmdet3d.datasets import build_dataloader, build_dataset
from mmdet.apis.train import parse_losses
from mmdet.core import (DistEvalHook, DistOptimizerHook, EvalHook,
                        Fp16OptimizerHook)
from mmdet.utils import get_root_logger


def batch_processor(model, data, train_mode):
    """Process a data batch.

    This method is required as an argument of Runner, which defines how to
    process a data batch and obtain proper outputs. The first 3 arguments of
    batch_processor are fixed.

    Args:
        model (nn.Module): A PyTorch model.
        data (dict): The data batch in a dict.
        train_mode (bool): Training mode or not. It may be useless for some
            models.

    Returns:
        dict: A dict containing losses and log vars.
    """
    losses = model(**data)
    loss, log_vars = parse_losses(losses)

    if 'img_meta' in data:
        num_samples = len(data['img_meta'].data)
    else:
        num_samples = len(data['img'].data)

    outputs = dict(loss=loss, log_vars=log_vars, num_samples=num_samples)

    return outputs


def train_detector(model,
                   dataset,
                   cfg,
                   distributed=False,
                   validate=False,
                   timestamp=None,
                   meta=None):
    logger = get_root_logger(cfg.log_level)

    # start training
    if distributed:
        _dist_train(
            model,
            dataset,
            cfg,
            validate=validate,
            logger=logger,
            timestamp=timestamp,
            meta=meta)
    else:
        _non_dist_train(
            model,
            dataset,
            cfg,
            validate=validate,
            logger=logger,
            timestamp=timestamp,
            meta=meta)


def _dist_train(model,
                dataset,
                cfg,
                validate=False,
                logger=None,
                timestamp=None,
                meta=None):
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            dist=True,
            seed=cfg.seed) for ds in dataset
    ]
    # put model on gpus
    find_unused_parameters = cfg.get('find_unused_parameters', False)
    # Sets the `find_unused_parameters` parameter in
    # torch.nn.parallel.DistributedDataParallel
    model = MMDistributedDataParallel(
        model.cuda(),
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False,
        find_unused_parameters=find_unused_parameters)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(
        model,
        batch_processor,
        optimizer,
        cfg.work_dir,
        logger=logger,
        meta=meta)
    # an ugly workaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg)
    else:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(DistSamplerSeedHook())
    # register eval hooks
    if validate:
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=1,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=True,
            shuffle=False)
        eval_cfg = cfg.get('evaluation', {})
        runner.register_hook(DistEvalHook(val_dataloader, **eval_cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)


def _non_dist_train(model,
                    dataset,
                    cfg,
                    validate=False,
                    logger=None,
                    timestamp=None,
                    meta=None):
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            cfg.gpus,
            dist=False,
            seed=cfg.seed) for ds in dataset
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(
        model,
        batch_processor,
        optimizer,
        cfg.work_dir,
        logger=logger,
        meta=meta)
    # an ugly workaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    # register eval hooks
    if validate:
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=1,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=False,
            shuffle=False)
        eval_cfg = cfg.get('evaluation', {})
        runner.register_hook(EvalHook(val_dataloader, **eval_cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
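

# Hypothetical usage sketch (not part of the original module): shows one way
# `train_detector` above could be driven from a training script. The config
# path 'configs/example_config.py', the single-GPU settings, and the
# `build_detector` import are assumptions for illustration; in practice this
# entry point is normally invoked by tools/train.py with a full mmcv Config.
if __name__ == '__main__':
    from mmcv import Config
    from mmdet3d.models import build_detector

    cfg = Config.fromfile('configs/example_config.py')  # assumed config path
    cfg.gpus = 1  # non-distributed, single-GPU run
    cfg.seed = None  # no fixed seed for the dataloader workers

    # build the detector and the training dataset from the config
    model = build_detector(
        cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)
    datasets = [build_dataset(cfg.data.train)]

    # hand everything to the training entry point defined above
    train_detector(
        model,
        datasets,
        cfg,
        distributed=False,
        validate=True,
        timestamp=None,
        meta=None)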