import torch
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import DistSamplerSeedHook, Runner

from mmdet3d.utils import get_root_logger
from mmdet.apis.train import parse_losses
from mmdet.core import (DistEvalHook, DistOptimizerHook, EvalHook,
                        Fp16OptimizerHook, build_optimizer)
from mmdet.datasets import build_dataloader, build_dataset


def batch_processor(model, data, train_mode):
    """Process a data batch.

    This method is required as an argument of Runner, which defines how to
    process a data batch and obtain proper outputs. The first 3 arguments of
    batch_processor are fixed.

    Args:
        model (nn.Module): A PyTorch model.
        data (dict): The data batch in a dict.
        train_mode (bool): Training mode or not. It may be unused by some
            models.

    Returns:
        dict: A dict containing losses and log vars.
    """
    losses = model(**data)
    loss, log_vars = parse_losses(losses)

    # Infer the batch size from the meta info if present, otherwise from
    # the image tensor itself.
    if 'img_meta' in data:
        num_samples = len(data['img_meta'].data)
    else:
        num_samples = len(data['img'].data)

    outputs = dict(loss=loss, log_vars=log_vars, num_samples=num_samples)

    return outputs


def train_detector(model,
                   dataset,
                   cfg,
                   distributed=False,
                   validate=False,
                   timestamp=None,
                   meta=None):
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            len(cfg.gpu_ids),
            dist=distributed,
            seed=cfg.seed) for ds in dataset
    ]

    # put the model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(
            model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(
        model,
        batch_processor,
        optimizer,
        cfg.work_dir,
        logger=logger,
        meta=meta)
    # an ugly workaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting: wrap the optimizer config in an Fp16OptimizerHook when
    # mixed-precision training is enabled in the config
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))
    if distributed:
        runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=1,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False)
        eval_cfg = cfg.get('evaluation', {})
        eval_hook = DistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    # resume from a checkpoint or load pretrained weights if configured
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
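

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module): a minimal
# call into train_detector, assuming an mmdet3d-style config file. The path
# 'configs/example_config.py' is a placeholder; any config defining the
# fields used above (model, data, optimizer, lr_config, checkpoint_config,
# log_config, work_dir, workflow, total_epochs, gpu_ids, seed, resume_from,
# load_from) should work.
#
#     from mmcv import Config
#     from mmdet3d.models import build_detector
#
#     cfg = Config.fromfile('configs/example_config.py')
#     model = build_detector(
#         cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)
#     datasets = [build_dataset(cfg.data.train)]
#     train_detector(model, datasets, cfg, distributed=False, validate=True)
# ---------------------------------------------------------------------------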