Commit 701d40a5 authored by Kai Chen

Merge branch 'dev' into single-stage

parents cae36ab6 df2aab9b

# configs/fast_mask_rcnn_r50_fpn_1x.py
# model settings
model = dict(
    type='FastRCNN',
    pretrained='modelzoo://resnet50',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        style='pytorch'),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        num_outs=5),
    bbox_roi_extractor=dict(
        type='SingleRoIExtractor',
        roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2),
        out_channels=256,
        featmap_strides=[4, 8, 16, 32]),
    bbox_head=dict(
        type='SharedFCRoIHead',
        num_fcs=2,
        in_channels=256,
        fc_out_channels=1024,
        roi_feat_size=7,
        num_classes=81,
        target_means=[0., 0., 0., 0.],
        target_stds=[0.1, 0.1, 0.2, 0.2],
        reg_class_agnostic=False),
    mask_roi_extractor=dict(
        type='SingleRoIExtractor',
        roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2),
        out_channels=256,
        featmap_strides=[4, 8, 16, 32]),
    mask_head=dict(
        type='FCNMaskHead',
        num_convs=4,
        in_channels=256,
        conv_out_channels=256,
        num_classes=81))
# model training and testing settings
train_cfg = dict(
    rcnn=dict(
        mask_size=28,
        pos_iou_thr=0.5,
        neg_iou_thr=0.5,
        crowd_thr=1.1,
        roi_batch_size=512,
        add_gt_as_proposals=True,
        pos_fraction=0.25,
        pos_balance_sampling=False,
        neg_pos_ub=512,
        neg_balance_thr=0,
        min_pos_iou=0.5,
        pos_weight=-1,
        debug=False))
test_cfg = dict(
    rcnn=dict(
        score_thr=0.05, max_per_img=100, nms_thr=0.5, mask_thr_binary=0.5))
# dataset settings
dataset_type = 'CocoDataset'
data_root = 'data/coco/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
data = dict(
    imgs_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type=dataset_type,
        ann_file=data_root + 'annotations/instances_train2017.json',
        img_prefix=data_root + 'train2017/',
        img_scale=(1333, 800),
        img_norm_cfg=img_norm_cfg,
        size_divisor=32,
        proposal_file=data_root + 'proposals/rpn_r50_fpn_1x_train2017.pkl',
        flip_ratio=0.5,
        with_mask=True,
        with_crowd=True,
        with_label=True),
    val=dict(
        type=dataset_type,
        ann_file=data_root + 'annotations/instances_val2017.json',
        img_prefix=data_root + 'val2017/',
        img_scale=(1333, 800),
        img_norm_cfg=img_norm_cfg,
        proposal_file=data_root + 'proposals/rpn_r50_fpn_1x_val2017.pkl',
        size_divisor=32,
        flip_ratio=0,
        with_mask=True,
        with_crowd=True,
        with_label=True),
    test=dict(
        type=dataset_type,
        ann_file=data_root + 'annotations/instances_val2017.json',
        img_prefix=data_root + 'val2017/',
        img_scale=(1333, 800),
        img_norm_cfg=img_norm_cfg,
        proposal_file=data_root + 'proposals/rpn_r50_fpn_1x_val2017.pkl',
        size_divisor=32,
        flip_ratio=0,
        with_mask=False,
        with_label=False,
        test_mode=True))
# optimizer
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=1.0 / 3,
    step=[8, 11])
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        # dict(type='TensorboardLoggerHook')
    ])
# yapf:enable
# runtime settings
total_epochs = 12
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/fast_mask_rcnn_r50_fpn_1x'
load_from = None
resume_from = None
workflow = [('train', 1)]

# configs/fast_rcnn_r50_fpn_1x.py
# model settings
model = dict(
    type='FastRCNN',
    pretrained='modelzoo://resnet50',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        style='pytorch'),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        num_outs=5),
    bbox_roi_extractor=dict(
        type='SingleRoIExtractor',
        roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2),
        out_channels=256,
        featmap_strides=[4, 8, 16, 32]),
    bbox_head=dict(
        type='SharedFCRoIHead',
        num_fcs=2,
        in_channels=256,
        fc_out_channels=1024,
        roi_feat_size=7,
        num_classes=81,
        target_means=[0., 0., 0., 0.],
        target_stds=[0.1, 0.1, 0.2, 0.2],
        reg_class_agnostic=False))
# model training and testing settings
train_cfg = dict(
    rcnn=dict(
        pos_iou_thr=0.5,
        neg_iou_thr=0.5,
        crowd_thr=1.1,
        roi_batch_size=512,
        add_gt_as_proposals=True,
        pos_fraction=0.25,
        pos_balance_sampling=False,
        neg_pos_ub=512,
        neg_balance_thr=0,
        min_pos_iou=0.5,
        pos_weight=-1,
        debug=False))
test_cfg = dict(rcnn=dict(score_thr=0.05, max_per_img=100, nms_thr=0.5))
# dataset settings
dataset_type = 'CocoDataset'
data_root = 'data/coco/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
data = dict(
    imgs_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type=dataset_type,
        ann_file=data_root + 'annotations/instances_train2017.json',
        img_prefix=data_root + 'train2017/',
        img_scale=(1333, 800),
        img_norm_cfg=img_norm_cfg,
        size_divisor=32,
        proposal_file=data_root + 'proposals/rpn_r50_fpn_1x_train2017.pkl',
        flip_ratio=0.5,
        with_mask=False,
        with_crowd=True,
        with_label=True),
    val=dict(
        type=dataset_type,
        ann_file=data_root + 'annotations/instances_val2017.json',
        img_prefix=data_root + 'val2017/',
        img_scale=(1333, 800),
        img_norm_cfg=img_norm_cfg,
        proposal_file=data_root + 'proposals/rpn_r50_fpn_1x_val2017.pkl',
        size_divisor=32,
        flip_ratio=0,
        with_mask=False,
        with_crowd=True,
        with_label=True),
    test=dict(
        type=dataset_type,
        ann_file=data_root + 'annotations/instances_val2017.json',
        img_prefix=data_root + 'val2017/',
        img_scale=(1333, 800),
        img_norm_cfg=img_norm_cfg,
        proposal_file=data_root + 'proposals/rpn_r50_fpn_1x_val2017.pkl',
        size_divisor=32,
        flip_ratio=0,
        with_mask=False,
        with_label=False,
        test_mode=True))
# optimizer
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=1.0 / 3,
    step=[8, 11])
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        # dict(type='TensorboardLoggerHook')
    ])
# yapf:enable
# runtime settings
total_epochs = 12
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/fast_rcnn_r50_fpn_1x'
load_from = None
resume_from = None
workflow = [('train', 1)]
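
Both config files above are plain Python modules consumed through mmcv.Config, as the tools/train.py and tools/test.py diffs further below do. A minimal loading sketch; the configs/ path is an assumption:

import mmcv

cfg = mmcv.Config.fromfile('configs/fast_rcnn_r50_fpn_1x.py')  # assumed path
print(cfg.model.type)                     # 'FastRCNN'
print(cfg.train_cfg.rcnn.roi_batch_size)  # 512
print(cfg.data.train.proposal_file)       # Fast R-CNN needs precomputed proposals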

configs/faster_rcnn_r50_fpn_1x.py
@@ -65,7 +65,7 @@ train_cfg = dict(
         pos_balance_sampling=False,
         neg_pos_ub=512,
         neg_balance_thr=0,
-        min_pos_iou=1.1,
+        min_pos_iou=0.5,
         pos_weight=-1,
         debug=False))
 test_cfg = dict(
@@ -139,7 +139,6 @@ log_config = dict(
 # yapf:enable
 # runtime settings
 total_epochs = 12
-device_ids = range(8)
 dist_params = dict(backend='nccl')
 log_level = 'INFO'
 work_dir = './work_dirs/faster_rcnn_r50_fpn_1x'

configs/mask_rcnn_r50_fpn_1x.py
@@ -77,7 +77,7 @@ train_cfg = dict(
         pos_balance_sampling=False,
         neg_pos_ub=512,
         neg_balance_thr=0,
-        min_pos_iou=1.1,
+        min_pos_iou=0.5,
         pos_weight=-1,
         debug=False))
 test_cfg = dict(
@@ -152,7 +152,6 @@ log_config = dict(
 # yapf:enable
 # runtime settings
 total_epochs = 12
-device_ids = range(8)
 dist_params = dict(backend='nccl')
 log_level = 'INFO'
 work_dir = './work_dirs/mask_rcnn_r50_fpn_1x'

# mmdet/api/__init__.py
from .env import init_dist, get_root_logger, set_random_seed
from .train import train_detector
from .inference import inference_detector

__all__ = [
    'init_dist', 'get_root_logger', 'set_random_seed', 'train_detector',
    'inference_detector'
]

# mmdet/api/env.py
import logging
import os
import random

import numpy as np
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from mmcv.runner import get_dist_info


def init_dist(launcher, backend='nccl', **kwargs):
    if mp.get_start_method(allow_none=True) is None:
        mp.set_start_method('spawn')
    if launcher == 'pytorch':
        _init_dist_pytorch(backend, **kwargs)
    elif launcher == 'mpi':
        _init_dist_mpi(backend, **kwargs)
    elif launcher == 'slurm':
        _init_dist_slurm(backend, **kwargs)
    else:
        raise ValueError('Invalid launcher type: {}'.format(launcher))


def _init_dist_pytorch(backend, **kwargs):
    # TODO: use local_rank instead of rank % num_gpus
    rank = int(os.environ['RANK'])
    num_gpus = torch.cuda.device_count()
    torch.cuda.set_device(rank % num_gpus)
    dist.init_process_group(backend=backend, **kwargs)


def _init_dist_mpi(backend, **kwargs):
    raise NotImplementedError


def _init_dist_slurm(backend, **kwargs):
    raise NotImplementedError


def set_random_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def get_root_logger(log_level=logging.INFO):
    logger = logging.getLogger()
    if not logger.hasHandlers():
        logging.basicConfig(
            format='%(asctime)s - %(levelname)s - %(message)s',
            level=log_level)
    rank, _ = get_dist_info()
    if rank != 0:
        logger.setLevel('ERROR')
    return logger
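
A minimal usage sketch for the helpers above, assuming a single-process (non-distributed) run so get_dist_info() reports rank 0:

import logging

from mmdet.api import get_root_logger, set_random_seed

set_random_seed(0)                      # seeds python, numpy and torch (CPU + all GPUs)
logger = get_root_logger(logging.INFO)  # root logger; non-zero ranks are silenced
logger.info('Seeded everything for a reproducible run.')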

# mmdet/api/inference.py
import mmcv
import numpy as np
import torch

from mmdet.datasets import to_tensor
from mmdet.datasets.transforms import ImageTransform
from mmdet.core import get_classes


def _prepare_data(img, img_transform, cfg, device):
    ori_shape = img.shape
    img, img_shape, pad_shape, scale_factor = img_transform(
        img, scale=cfg.data.test.img_scale)
    img = to_tensor(img).to(device).unsqueeze(0)
    img_meta = [
        dict(
            ori_shape=ori_shape,
            img_shape=img_shape,
            pad_shape=pad_shape,
            scale_factor=scale_factor,
            flip=False)
    ]
    return dict(img=[img], img_meta=[img_meta])


def inference_detector(model, imgs, cfg, device='cuda:0'):
    imgs = imgs if isinstance(imgs, list) else [imgs]
    img_transform = ImageTransform(
        size_divisor=cfg.data.test.size_divisor, **cfg.img_norm_cfg)
    model = model.to(device)
    model.eval()
    for img in imgs:
        img = mmcv.imread(img)
        data = _prepare_data(img, img_transform, cfg, device)
        with torch.no_grad():
            result = model(return_loss=False, rescale=True, **data)
        yield result


def show_result(img, result, dataset='coco', score_thr=0.3):
    class_names = get_classes(dataset)
    labels = [
        np.full(bbox.shape[0], i, dtype=np.int32)
        for i, bbox in enumerate(result)
    ]
    labels = np.concatenate(labels)
    bboxes = np.vstack(result)
    mmcv.imshow_det_bboxes(
        img.copy(),
        bboxes,
        labels,
        class_names=class_names,
        score_thr=score_thr)
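
Since inference_detector is a generator, results must be consumed by iterating. A hedged usage sketch (checkpoint path and image name are placeholders; a full-pipeline config such as Faster R-CNN is assumed, because the Fast R-CNN configs above require precomputed proposals):

import mmcv
from mmcv.runner import load_checkpoint

from mmdet.models import build_detector
from mmdet.api import inference_detector, show_result

cfg = mmcv.Config.fromfile('configs/faster_rcnn_r50_fpn_1x.py')  # assumed path
cfg.model.pretrained = None

model = build_detector(cfg.model, test_cfg=cfg.test_cfg)
load_checkpoint(model, 'work_dirs/faster_rcnn_r50_fpn_1x/latest.pth')  # placeholder

img = mmcv.imread('demo.jpg')  # placeholder image
for result in inference_detector(model, [img], cfg):
    show_result(img, result, dataset='coco', score_thr=0.3)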

# mmdet/api/train.py
from __future__ import division

from collections import OrderedDict

import torch
from mmcv.runner import Runner, DistSamplerSeedHook
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel

from mmdet.core import (DistOptimizerHook, CocoDistEvalRecallHook,
                        CocoDistEvalmAPHook)
from mmdet.datasets import build_dataloader
from mmdet.models import RPN
from .env import get_root_logger


def parse_losses(losses):
    log_vars = OrderedDict()
    for loss_name, loss_value in losses.items():
        if isinstance(loss_value, torch.Tensor):
            log_vars[loss_name] = loss_value.mean()
        elif isinstance(loss_value, list):
            log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value)
        else:
            raise TypeError(
                '{} is not a tensor or list of tensors'.format(loss_name))

    loss = sum(_value for _key, _value in log_vars.items() if 'loss' in _key)

    log_vars['loss'] = loss
    for name in log_vars:
        log_vars[name] = log_vars[name].item()

    return loss, log_vars


def batch_processor(model, data, train_mode):
    losses = model(**data)
    loss, log_vars = parse_losses(losses)

    outputs = dict(
        loss=loss, log_vars=log_vars, num_samples=len(data['img'].data))

    return outputs


def train_detector(model,
                   dataset,
                   cfg,
                   distributed=False,
                   validate=False,
                   logger=None):
    if logger is None:
        logger = get_root_logger(cfg.log_level)

    # start training
    if distributed:
        _dist_train(model, dataset, cfg, validate=validate)
    else:
        _non_dist_train(model, dataset, cfg, validate=validate)


def _dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(
            dataset,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            dist=True)
    ]
    # put model on gpus
    model = MMDistributedDataParallel(model.cuda())
    # build runner
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)
    # register hooks
    optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(DistSamplerSeedHook())
    # register eval hooks
    if validate:
        if isinstance(model.module, RPN):
            runner.register_hook(CocoDistEvalRecallHook(cfg.data.val))
        elif cfg.data.val.type == 'CocoDataset':
            runner.register_hook(CocoDistEvalmAPHook(cfg.data.val))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)


def _non_dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(
            dataset,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            cfg.gpus,
            dist=False)
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
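
train_detector mirrors what the reworked tools/train.py (see the diff near the end of this commit) does after parsing arguments. A single-GPU, non-distributed sketch under the same assumed config path:

import mmcv
from mmcv.runner import obj_from_dict

from mmdet import datasets
from mmdet.models import build_detector
from mmdet.api import train_detector

cfg = mmcv.Config.fromfile('configs/fast_rcnn_r50_fpn_1x.py')  # assumed path
cfg.gpus = 1  # _non_dist_train reads cfg.gpus

model = build_detector(cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)
train_dataset = obj_from_dict(cfg.data.train, datasets)
train_detector(model, train_dataset, cfg, distributed=False, validate=False)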

mmdet/core/evaluation/coco_utils.py
@@ -16,8 +16,8 @@ def coco_eval(result_file, result_types, coco, max_dets=(100, 300, 1000)):
         coco = COCO(coco)
     assert isinstance(coco, COCO)

-    if res_type == 'proposal_fast':
-        ar = fast_eval_recall(result_file, coco, max_dets)
+    if result_types == ['proposal_fast']:
+        ar = fast_eval_recall(result_file, coco, np.array(max_dets))
         for i, num in enumerate(max_dets):
             print('AR@{}\t= {:.4f}'.format(num, ar[i]))
         return

mmdet/core/utils/__init__.py
-from .dist_utils import init_dist, allreduce_grads, DistOptimizerHook
+from .dist_utils import allreduce_grads, DistOptimizerHook
 from .misc import tensor2imgs, unmap, multi_apply

 __all__ = [
-    'init_dist', 'allreduce_grads', 'DistOptimizerHook', 'tensor2imgs',
-    'unmap', 'multi_apply'
+    'allreduce_grads', 'DistOptimizerHook', 'tensor2imgs', 'unmap',
+    'multi_apply'
 ]

mmdet/core/utils/dist_utils.py
-import os
 from collections import OrderedDict

-import torch
-import torch.multiprocessing as mp
 import torch.distributed as dist
 from torch._utils import (_flatten_dense_tensors, _unflatten_dense_tensors,
                           _take_tensors)
 from mmcv.runner import OptimizerHook

-def init_dist(launcher, backend='nccl', **kwargs):
-    if mp.get_start_method(allow_none=True) is None:
-        mp.set_start_method('spawn')
-    if launcher == 'pytorch':
-        _init_dist_pytorch(backend, **kwargs)
-    elif launcher == 'mpi':
-        _init_dist_mpi(backend, **kwargs)
-    elif launcher == 'slurm':
-        _init_dist_slurm(backend, **kwargs)
-    else:
-        raise ValueError('Invalid launcher type: {}'.format(launcher))
-
-def _init_dist_pytorch(backend, **kwargs):
-    # TODO: use local_rank instead of rank % num_gpus
-    rank = int(os.environ['RANK'])
-    num_gpus = torch.cuda.device_count()
-    torch.cuda.set_device(rank % num_gpus)
-    dist.init_process_group(backend=backend, **kwargs)
-
-def _init_dist_mpi(backend, **kwargs):
-    raise NotImplementedError
-
-def _init_dist_slurm(backend, **kwargs):
-    raise NotImplementedError
-
 def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
     if bucket_size_mb > 0:
         bucket_size_bytes = bucket_size_mb * 1024 * 1024

mmdet/datasets/loader/build_loader.py
@@ -15,7 +15,7 @@ resource.setrlimit(resource.RLIMIT_NOFILE, (4096, rlimit[1]))
 def build_dataloader(dataset,
                      imgs_per_gpu,
                      workers_per_gpu,
-                     num_gpus,
+                     num_gpus=1,
                      dist=True,
                      **kwargs):
     if dist:

mmdet/models/detectors/two_stage.py
@@ -140,7 +140,6 @@ class TwoStageDetector(BaseDetector, RPNTestMixin, BBoxTestMixin,
     def simple_test(self, img, img_meta, proposals=None, rescale=False):
         """Test without augmentation."""
-        assert proposals is None, "Fast RCNN hasn't been implemented."
         assert self.with_bbox, "Bbox head must be implemented."

         x = self.extract_feat(img)

tools/test.py
@@ -39,7 +39,13 @@ def parse_args():
     parser = argparse.ArgumentParser(description='MMDet test detector')
     parser.add_argument('config', help='test config file path')
     parser.add_argument('checkpoint', help='checkpoint file')
-    parser.add_argument('--gpus', default=1, type=int)
+    parser.add_argument(
+        '--gpus', default=1, type=int, help='GPU number used for testing')
+    parser.add_argument(
+        '--proc_per_gpu',
+        default=1,
+        type=int,
+        help='Number of processes per GPU')
     parser.add_argument('--out', help='output result file')
     parser.add_argument(
         '--eval',
@@ -55,6 +61,9 @@ def parse_args():
 def main():
     args = parse_args()

+    if args.out is not None and not args.out.endswith(('.pkl', '.pickle')):
+        raise ValueError('The output file must be a pkl file.')
+
     cfg = mmcv.Config.fromfile(args.config)
     cfg.model.pretrained = None
     cfg.data.test.test_mode = True
@@ -78,15 +87,27 @@ def main():
     model_args = cfg.model.copy()
     model_args.update(train_cfg=None, test_cfg=cfg.test_cfg)
     model_type = getattr(detectors, model_args.pop('type'))
-    outputs = parallel_test(model_type, model_args, args.checkpoint,
-                            dataset, _data_func, range(args.gpus))
+    outputs = parallel_test(
+        model_type,
+        model_args,
+        args.checkpoint,
+        dataset,
+        _data_func,
+        range(args.gpus),
+        workers_per_gpu=args.proc_per_gpu)

     if args.out:
+        print('writing results to {}'.format(args.out))
         mmcv.dump(outputs, args.out)
-    if args.eval:
-        json_file = args.out + '.json'
-        results2json(dataset, outputs, json_file)
-        coco_eval(json_file, args.eval, dataset.coco)
+        eval_types = args.eval
+        if eval_types:
+            print('Starting evaluate {}'.format(' and '.join(eval_types)))
+            if eval_types == ['proposal_fast']:
+                result_file = args.out
+            else:
+                result_file = args.out + '.json'
+                results2json(dataset, outputs, result_file)
+            coco_eval(result_file, eval_types, dataset.coco)


 if __name__ == '__main__':
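
The new evaluation branch in main() can also be replayed offline on a dumped result file. A hedged sketch, assuming results2json and coco_eval are importable from mmdet.core as tools/test.py uses them:

import mmcv
from mmcv.runner import obj_from_dict

from mmdet import datasets
from mmdet.core import coco_eval, results2json  # assumed import path

cfg = mmcv.Config.fromfile('configs/fast_rcnn_r50_fpn_1x.py')  # assumed path
cfg.data.test.test_mode = True
dataset = obj_from_dict(cfg.data.test, datasets)

outputs = mmcv.load('results.pkl')           # detections dumped via --out (placeholder)
result_file = 'results.pkl.json'
results2json(dataset, outputs, result_file)  # convert to COCO json, as in main()
coco_eval(result_file, ['bbox'], dataset.coco)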

tools/train.py
 from __future__ import division
 import argparse
-import logging
-import random
-from collections import OrderedDict

-import numpy as np
-import torch
 from mmcv import Config
-from mmcv.runner import Runner, obj_from_dict, DistSamplerSeedHook
-from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
+from mmcv.runner import obj_from_dict

 from mmdet import datasets, __version__
-from mmdet.core import (init_dist, DistOptimizerHook, CocoDistEvalRecallHook,
-                        CocoDistEvalmAPHook)
-from mmdet.datasets import build_dataloader
-from mmdet.models import build_detector, RPN
-
-
-def parse_losses(losses):
-    log_vars = OrderedDict()
-    for loss_name, loss_value in losses.items():
-        if isinstance(loss_value, torch.Tensor):
-            log_vars[loss_name] = loss_value.mean()
-        elif isinstance(loss_value, list):
-            log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value)
-        else:
-            raise TypeError(
-                '{} is not a tensor or list of tensors'.format(loss_name))
-    loss = sum(_value for _key, _value in log_vars.items() if 'loss' in _key)
-    log_vars['loss'] = loss
-    for name in log_vars:
-        log_vars[name] = log_vars[name].item()
-    return loss, log_vars
-
-
-def batch_processor(model, data, train_mode):
-    losses = model(**data)
-    loss, log_vars = parse_losses(losses)
-    outputs = dict(
-        loss=loss, log_vars=log_vars, num_samples=len(data['img'].data))
-    return outputs
-
-
-def get_logger(log_level):
-    logging.basicConfig(
-        format='%(asctime)s - %(levelname)s - %(message)s', level=log_level)
-    logger = logging.getLogger()
-    return logger
-
-
-def set_random_seed(seed):
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    torch.cuda.manual_seed_all(seed)
+from mmdet.api import (train_detector, init_dist, get_root_logger,
+                       set_random_seed)
+from mmdet.models import build_detector


 def parse_args():
@@ -69,10 +17,14 @@ def parse_args():
     parser.add_argument(
         '--validate',
         action='store_true',
-        help='whether to add a validate phase')
+        help='whether to evaluate the checkpoint during training')
     parser.add_argument(
-        '--gpus', type=int, default=1, help='number of gpus to use')
-    parser.add_argument('--seed', type=int, help='random seed')
+        '--gpus',
+        type=int,
+        default=1,
+        help='number of gpus to use '
+        '(only applicable to non-distributed training)')
+    parser.add_argument('--seed', type=int, default=None, help='random seed')
     parser.add_argument(
         '--launcher',
         choices=['none', 'pytorch', 'slurm', 'mpi'],
@@ -88,69 +40,41 @@ def main():
     args = parse_args()

     cfg = Config.fromfile(args.config)
+    # update configs according to CLI args
     if args.work_dir is not None:
         cfg.work_dir = args.work_dir
     cfg.gpus = args.gpus
-    # save mmdet version in checkpoint as meta data
-    cfg.checkpoint_config.meta = dict(
-        mmdet_version=__version__, config=cfg.text)
-
-    logger = get_logger(cfg.log_level)
-
-    # set random seed if specified
-    if args.seed is not None:
-        logger.info('Set random seed to {}'.format(args.seed))
-        set_random_seed(args.seed)
-
-    # init distributed environment if necessary
+    if cfg.checkpoint_config is not None:
+        # save mmdet version in checkpoints as meta data
+        cfg.checkpoint_config.meta = dict(
+            mmdet_version=__version__, config=cfg.text)
+
+    # init distributed env first, since logger depends on the dist info.
     if args.launcher == 'none':
-        dist = False
-        logger.info('Non-distributed training.')
+        distributed = False
     else:
-        dist = True
+        distributed = True
         init_dist(args.launcher, **cfg.dist_params)
-        if torch.distributed.get_rank() != 0:
-            logger.setLevel('ERROR')
-        logger.info('Distributed training.')
-
-    # prepare data loaders
-    train_dataset = obj_from_dict(cfg.data.train, datasets)
-    data_loaders = [
-        build_dataloader(train_dataset, cfg.data.imgs_per_gpu,
-                         cfg.data.workers_per_gpu, cfg.gpus, dist)
-    ]
-
-    # build model
+
+    # init logger before other steps
+    logger = get_root_logger(cfg.log_level)
+    logger.info('Distributed training: {}'.format(distributed))
+
+    # set random seeds
+    if args.seed is not None:
+        logger.info('Set random seed to {}'.format(args.seed))
+        set_random_seed(args.seed)
+
     model = build_detector(
         cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)
-    if dist:
-        model = MMDistributedDataParallel(model.cuda())
-    else:
-        model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
-
-    # build runner
-    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
-                    cfg.log_level)
-    # register hooks
-    optimizer_config = DistOptimizerHook(
-        **cfg.optimizer_config) if dist else cfg.optimizer_config
-    runner.register_training_hooks(cfg.lr_config, optimizer_config,
-                                   cfg.checkpoint_config, cfg.log_config)
-    if dist:
-        runner.register_hook(DistSamplerSeedHook())
-    # register eval hooks
-    if args.validate:
-        if isinstance(model.module, RPN):
-            runner.register_hook(CocoDistEvalRecallHook(cfg.data.val))
-        elif cfg.data.val.type == 'CocoDataset':
-            runner.register_hook(CocoDistEvalmAPHook(cfg.data.val))
-
-    if cfg.resume_from:
-        runner.resume(cfg.resume_from)
-    elif cfg.load_from:
-        runner.load_checkpoint(cfg.load_from)
-    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
+
+    train_dataset = obj_from_dict(cfg.data.train, datasets)
+    train_detector(
+        model,
+        train_dataset,
+        cfg,
+        distributed=distributed,
+        validate=args.validate,
+        logger=logger)


 if __name__ == '__main__':