OpenDAS / mmdetection3d

Commit 5a1575a0, authored Jun 19, 2020 by zhangwenwei

Merge branch 'add-ori-mvx' into 'master'

Add ori mvx

See merge request open-mmlab/mmdet.3d!79

Parents: 0ed9c576, 3298db8a

Showing 20 changed files with 139 additions and 896 deletions (+139 -896)
.gitlab-ci.yml                                                              +1   -1
configs/mvxnet/dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class.py    +22  -32
configs/mvxnet/faster_rcnn_regnet-3gf_fpn_2x8_1x_nus.py                     +0   -206
configs/mvxnet/retinanet_r50_fpn_caffe_2x8_1x_nus.py                        +0   -141
docs/api.rst                                                                +0   -5
mmdet3d/apis/__init__.py                                                    +0   -3
mmdet3d/apis/train.py                                                       +0   -122
mmdet3d/datasets/kitti_dataset.py                                           +51  -13
mmdet3d/datasets/pipelines/__init__.py                                      +3   -3
mmdet3d/datasets/pipelines/dbsampler.py                                     +0   -254
mmdet3d/models/detectors/base.py                                            +7   -56
mmdet3d/models/detectors/mvx_two_stage.py                                   +6   -6
mmdet3d/models/detectors/parta2.py                                          +7   -40
mmdet3d/models/detectors/single_stage.py                                    +15  -0
mmdet3d/models/detectors/two_stage.py                                       +15  -0
mmdet3d/models/fusion_layers/point_fusion.py                                +1   -1
requirements/runtime.txt                                                    +1   -1
tests/test_config.py                                                        +0   -2
tests/test_forward.py                                                       +9   -8
tools/train.py                                                              +1   -2
.gitlab-ci.yml

@@ -27,7 +27,7 @@ linting:
   stage: test
   script:
     - echo "Start building..."
-    - pip install "git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI"
+    - pip install "git+https://github.com/open-mmlab/cocoapi.git#subdirectory=pycocotools"
    - pip install git+https://github.com/open-mmlab/mmcv.git
    - pip install git+https://github.com/open-mmlab/mmdetection.git
    - python -c "import mmdet; print(mmdet.__version__)"
configs/mvxnet/dv_mvx-v2_second_secfpn_fpn-fusion_adamw_2x8_80e_kitti-3d-3class.py
→ configs/mvxnet/dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class.py  (renamed)

@@ -4,8 +4,6 @@ point_cloud_range = [0, -40, -3, 70.4, 40, 1]
 model = dict(
     type='DynamicMVXFasterRCNN',
-    pretrained=('./pretrain_detectron/'
-                'ImageNetPretrained/MSRA/resnet50_msra.pth'),
     img_backbone=dict(
         type='ResNet',
         depth=50,
@@ -136,22 +134,10 @@ class_names = ['Pedestrian', 'Cyclist', 'Car']
 img_norm_cfg = dict(
     mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
 input_modality = dict(use_lidar=True, use_camera=True)
-db_sampler = dict(
-    type='MMDataBaseSampler',
-    data_root=data_root,
-    info_path=data_root + 'kitti_mm_dbinfos_train.pkl',
-    rate=1.0,
-    object_rot_range=[0.0, 0.0],
-    blending_type=['box', 'gaussian', 'poisson'],
-    depth_consistent=True,
-    check_2D_collision=True,
-    collision_thr=[0, 0.3, 0.5, 0.7],
-    prepare=dict(
-        filter_by_difficulty=[-1],
-        filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)),
-    sample_groups=dict(Car=12, Pedestrian=6, Cyclist=6),
-    classes=class_names)
 train_pipeline = [
+    dict(type='LoadPointsFromFile', load_dim=4, use_dim=4),
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
     dict(
         type='Resize',
         img_scale=[(640, 192), (2560, 768)],
@@ -171,10 +157,11 @@ train_pipeline = [
     dict(type='DefaultFormatBundle3D', class_names=class_names),
     dict(
         type='Collect3D',
-        keys=['points', 'img', 'gt_bboxes_3d', 'gt_labels_3d'])
+        keys=['points', 'img', 'gt_bboxes_3d', 'gt_labels_3d']),
 ]
 test_pipeline = [
-    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='LoadPointsFromFile', load_dim=4, use_dim=4),
+    dict(type='LoadImageFromFile'),
     dict(
         type='MultiScaleFlipAug3D',
         img_scale=(1280, 384),
@@ -196,7 +183,7 @@ test_pipeline = [
             type='DefaultFormatBundle3D',
             class_names=class_names,
             with_label=False),
-        dict(type='Collect3D', keys=['points'])
+        dict(type='Collect3D', keys=['points', 'img'])
     ])
 ]
@@ -204,15 +191,18 @@ data = dict(
     samples_per_gpu=2,
     workers_per_gpu=2,
     train=dict(
-        type=dataset_type,
-        data_root=data_root,
-        ann_file=data_root + 'kitti_infos_train.pkl',
-        split='training',
-        pts_prefix='velodyne_reduced',
-        pipeline=train_pipeline,
-        modality=input_modality,
-        classes=class_names,
-        test_mode=False),
+        type='RepeatDataset',
+        times=2,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file=data_root + 'kitti_infos_train.pkl',
+            split='training',
+            pts_prefix='velodyne_reduced',
+            pipeline=train_pipeline,
+            modality=input_modality,
+            classes=class_names,
+            test_mode=False)),
     val=dict(
         type=dataset_type,
         data_root=data_root,
@@ -255,10 +245,10 @@ log_config = dict(
 # yapf:enable
 evaluation = dict(interval=1)
 # runtime settings
-total_epochs = 80
+total_epochs = 40
 dist_params = dict(backend='nccl')
 log_level = 'INFO'
-work_dir = './work_dirs/sec_secfpn_80e'
-load_from = './pretrain_mmdet/mvx_faster_rcnn_r50_fpn_detectron2-caffe_freezeBN_l1-loss_roialign-v2_1x_coco-3-class_44.7_20200205-b1c1533f.pth'  # noqa
+work_dir = None
+load_from = './pretrain_mmdet/mvx_faster_rcnn_detectron2-caffe_20e_coco-pretrain_gt-sample_kitti-3-class_moderate-79.3_20200207-a4a6a3c7.pth'  # noqa
 resume_from = None
 workflow = [('train', 1)]
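The train entry above now wraps the KITTI dataset in a RepeatDataset with times=2, which is why total_epochs can drop from 80 to 40: each scheduled epoch walks the data twice, so the total number of training iterations stays the same. A minimal standalone sketch of the wrapper's index arithmetic (illustrative only, not the mmdet implementation):

class RepeatDataset:
    """Sketch of a repeat wrapper: one 'epoch' iterates the wrapped
    dataset `times` times."""

    def __init__(self, dataset, times):
        self.dataset = dataset
        self.times = times

    def __len__(self):
        return self.times * len(self.dataset)

    def __getitem__(self, idx):
        # indices past the original length wrap around to the start
        return self.dataset[idx % len(self.dataset)]


# e.g. a 3712-sample KITTI train split repeated twice appears as 7424 samples
repeated = RepeatDataset(list(range(3712)), times=2)
assert len(repeated) == 7424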
configs/mvxnet/faster_rcnn_regnet-3gf_fpn_2x8_1x_nus.py  (deleted, 100644 → 0)

# model settings
norm_cfg = dict(type='BN', requires_grad=False)
model = dict(
    type='FasterRCNN',
    pretrained='open-mmlab://regnetx_3.2gf',
    backbone=dict(
        type='RegNet',
        arch='regnetx_3.2gf',
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        base_channels=32,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='pytorch'),
    neck=dict(
        type='FPN',
        in_channels=[96, 192, 432, 1008],
        out_channels=256,
        num_outs=5),
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[8],
            ratios=[0.5, 1.0, 2.0],
            strides=[4, 8, 16, 32, 64]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
    roi_head=dict(
        type='StandardRoIHead',
        bbox_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', out_size=7, sample_num=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        bbox_head=dict(
            type='Shared2FCBBoxHead',
            in_channels=256,
            fc_out_channels=1024,
            roi_feat_size=7,
            num_classes=10,
            bbox_coder=dict(
                type='DeltaXYWHBBoxCoder',
                target_means=[0., 0., 0., 0.],
                target_stds=[0.1, 0.1, 0.2, 0.2]),
            reg_class_agnostic=False,
            loss_cls=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
            loss_bbox=dict(type='L1Loss', loss_weight=1.0))))
# model training and testing settings
train_cfg = dict(
    rpn=dict(
        assigner=dict(
            type='MaxIoUAssigner',
            pos_iou_thr=0.7,
            neg_iou_thr=0.3,
            min_pos_iou=0.3,
            ignore_iof_thr=-1),
        sampler=dict(
            type='RandomSampler',
            num=256,
            pos_fraction=0.5,
            neg_pos_ub=-1,
            add_gt_as_proposals=False),
        allowed_border=-1,
        pos_weight=-1,
        debug=False),
    rpn_proposal=dict(
        nms_across_levels=False,
        nms_pre=2000,
        # following the setting of detectron,
        # which improves ~0.2 bbox mAP.
        nms_post=1000,
        max_num=1000,
        nms_thr=0.7,
        min_bbox_size=0),
    rcnn=dict(
        assigner=dict(
            type='MaxIoUAssigner',
            pos_iou_thr=0.5,
            neg_iou_thr=0.5,
            min_pos_iou=0.5,
            ignore_iof_thr=-1),
        sampler=dict(
            type='RandomSampler',
            num=512,
            pos_fraction=0.25,
            neg_pos_ub=-1,
            add_gt_as_proposals=True),
        pos_weight=-1,
        debug=False))
test_cfg = dict(
    rpn=dict(
        nms_across_levels=False,
        nms_pre=1000,
        nms_post=1000,
        max_num=1000,
        nms_thr=0.7,
        min_bbox_size=0),
    rcnn=dict(
        score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100)
    # soft-nms is also supported for rcnn testing
    # e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05)
)
# dataset settings
dataset_type = 'CocoDataset'
data_root = 'data/nuscenes/'
classes = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle',
           'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier')
img_norm_cfg = dict(
    # The mean and std is used in PyCls when training RegNets
    mean=[103.53, 116.28, 123.675],
    std=[57.375, 57.12, 58.395],
    to_rgb=False)
file_client_args = dict(
    backend='petrel',
    path_mapping=dict({
        './data/nuscenes/': 's3://nuscenes/nuscenes/',
        'data/nuscenes/': 's3://nuscenes/nuscenes/'
    }))
train_pipeline = [
    dict(type='LoadImageFromFile', file_client_args=file_client_args),
    dict(
        type='LoadAnnotations',
        with_bbox=True,
        with_mask=False,
        file_client_args=file_client_args),
    dict(
        type='Resize',
        img_scale=(1280, 720),
        ratio_range=(0.75, 1.25),
        keep_ratio=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
]
test_pipeline = [
    dict(type='LoadImageFromFile', file_client_args=file_client_args),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1280, 720),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type=dataset_type,
        classes=classes,
        ann_file=data_root + 'nuscenes_infos_train.coco.json',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        classes=classes,
        ann_file=data_root + 'nuscenes_infos_val.coco.json',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        classes=classes,
        ann_file=data_root + 'nuscenes_infos_val.coco.json',
        pipeline=test_pipeline))
# optimizer
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00005)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=1000,
    warmup_ratio=1.0 / 1000,
    step=[8, 11])
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook')
    ])
# yapf:enable
evaluation = dict(interval=1)
# runtime settings
total_epochs = 12
dist_params = dict(backend='nccl', port=29501)
log_level = 'INFO'
work_dir = './work_dirs/faster_rcnn_r50_fpn_1x'
load_from = './pretrain_mmdet/mask_rcnn_regnetx-3GF_fpn_mstrain_3x_coco_box-AP-43.1_mask-AP-38.7-e003695a.pth'  # noqa
resume_from = None
workflow = [('train', 1)]
configs/mvxnet/retinanet_r50_fpn_caffe_2x8_1x_nus.py  (deleted, 100644 → 0)

# model settings
norm_cfg = dict(type='BN', requires_grad=False)
model = dict(
    type='RetinaNet',
    pretrained=('open-mmlab://resnet50_caffe_bgr'),
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=norm_cfg,
        norm_eval=True,
        style='caffe'),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        start_level=1,
        add_extra_convs=True,
        num_outs=5),
    bbox_head=dict(
        type='RetinaHead',
        num_classes=10,
        in_channels=256,
        stacked_convs=4,
        feat_channels=256,
        anchor_generator=dict(
            type='AnchorGenerator',
            octave_base_scale=4,
            scales_per_octave=3,
            ratios=[0.5, 1.0, 2.0],
            strides=[8, 16, 32, 64, 128]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=1.0)))
# training and testing settings
train_cfg = dict(
    assigner=dict(
        type='MaxIoUAssigner',
        pos_iou_thr=0.5,
        neg_iou_thr=0.4,
        min_pos_iou=0,
        ignore_iof_thr=-1),
    allowed_border=-1,
    pos_weight=-1,
    debug=False)
test_cfg = dict(
    nms_pre=1000,
    min_bbox_size=0,
    score_thr=0.05,
    nms=dict(type='nms', iou_thr=0.5),
    max_per_img=100)
# dataset settings
dataset_type = 'NuScenes2DDataset'
data_root = 'data/nuscenes/'
# Values to be used for image normalization (BGR order)
# Default mean pixel value are from ImageNet: [103.53, 116.28, 123.675]
# When using pre-trained models in Detectron1 or any MSRA models,
# std has been absorbed into its conv1 weights, so the std needs to be set 1.
img_norm_cfg = dict(
    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(
        type='Resize',
        img_scale=(1600, 900),
        ratio_range=(0.8, 1.2),
        keep_ratio=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1600, 900),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type=dataset_type,
        ann_file=data_root + 'nuscenes_infos_train.coco.json',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        ann_file=data_root + 'nuscenes_infos_val.coco.json',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        ann_file=data_root + 'nuscenes_infos_val.coco.json',
        pipeline=test_pipeline))
# optimizer
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=1000,
    warmup_ratio=1.0 / 1000,
    step=[8, 11])
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook')
    ])
# yapf:enable
# runtime settings
total_epochs = 12
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/retinanet_r50_fpn_1x'
load_from = None
resume_from = None
workflow = [('train', 1)]
docs/api.rst

 API Documentation
 =================

-mmdet3d.apis
---------------
-.. automodule:: mmdet3d.apis
-    :members:
-
 mmdet3d.core
 --------------
mmdet3d/apis/__init__.py  (deleted, 100644 → 0)

from .train import batch_processor, train_detector

__all__ = ['batch_processor', 'train_detector']
mmdet3d/apis/train.py  (deleted, 100644 → 0)

import torch
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import DistSamplerSeedHook, Runner, build_optimizer

from mmdet3d.utils import get_root_logger
from mmdet.apis.train import parse_losses
from mmdet.core import (DistEvalHook, DistOptimizerHook, EvalHook,
                        Fp16OptimizerHook)
from mmdet.datasets import build_dataloader, build_dataset


def batch_processor(model, data, train_mode):
    """Process a data batch.

    This method is required as an argument of Runner, which defines how to
    process a data batch and obtain proper outputs. The first 3 arguments of
    batch_processor are fixed.

    Args:
        model (nn.Module): A PyTorch model.
        data (dict): The data batch in a dict.
        train_mode (bool): Training mode or not. It may be useless for some
            models.

    Returns:
        dict: A dict containing losses and log vars.
    """
    losses = model(**data)
    loss, log_vars = parse_losses(losses)

    if 'img_metas' in data:
        num_samples = len(data['img_metas'].data)
    else:
        num_samples = len(data['img'].data)
    outputs = dict(loss=loss, log_vars=log_vars, num_samples=num_samples)

    return outputs


def train_detector(model,
                   dataset,
                   cfg,
                   distributed=False,
                   validate=False,
                   timestamp=None,
                   meta=None):
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            len(cfg.gpu_ids),
            dist=distributed,
            seed=cfg.seed) for ds in dataset
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(
            model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(
        model,
        batch_processor,
        optimizer,
        cfg.work_dir,
        logger=logger,
        meta=meta)
    # an ugly walkaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))
    if distributed:
        runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=1,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False)
        eval_cfg = cfg.get('evaluation', {})
        eval_hook = DistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
mmdet3d/datasets/kitti_dataset.py

@@ -52,7 +52,7 @@ class KittiDataset(Custom3DDataset):
     def get_data_info(self, index):
         info = self.data_infos[index]
         sample_idx = info['image']['image_idx']
-        img_filename = os.path.join(self.root_split,
+        img_filename = os.path.join(self.data_root,
                                     info['image']['image_path'])

         # TODO: consider use torch.Tensor only
@@ -65,7 +65,8 @@ class KittiDataset(Custom3DDataset):
         input_dict = dict(
             sample_idx=sample_idx,
             pts_filename=pts_filename,
-            img_filename=img_filename,
+            img_prefix=None,
+            img_info=dict(filename=img_filename),
             lidar2img=lidar2img)

         if not self.test_mode:
@@ -113,8 +114,8 @@ class KittiDataset(Custom3DDataset):
         anns_results = dict(
             gt_bboxes_3d=gt_bboxes_3d,
             gt_labels_3d=gt_labels_3d,
-            gt_bboxes=gt_bboxes,
-            gt_labels=gt_labels)
+            bboxes=gt_bboxes,
+            labels=gt_labels)
         return anns_results

     def drop_arrays_by_name(self, gt_names, used_classes):
@@ -151,6 +152,24 @@ class KittiDataset(Custom3DDataset):
             result_files = self.bbox2result_kitti2d(outputs, self.CLASSES,
                                                     pklfile_prefix,
                                                     submission_prefix)
+        elif 'pts_bbox' in outputs[0] or 'img_bbox' in outputs[0]:
+            result_files = dict()
+            for name in outputs[0]:
+                results_ = [out[name] for out in outputs]
+                pklfile_prefix_ = pklfile_prefix + name
+                if submission_prefix is not None:
+                    submission_prefix_ = submission_prefix + name
+                else:
+                    submission_prefix_ = None
+                if 'img' in name:
+                    result_files = self.bbox2result_kitti2d(
+                        results_, self.CLASSES, pklfile_prefix_,
+                        submission_prefix_)
+                else:
+                    result_files_ = self.bbox2result_kitti(
+                        results_, self.CLASSES, pklfile_prefix_,
+                        submission_prefix_)
+                result_files[name] = result_files_
         else:
             result_files = self.bbox2result_kitti(outputs, self.CLASSES,
                                                   pklfile_prefix,
@@ -162,8 +181,7 @@ class KittiDataset(Custom3DDataset):
                  metric=None,
                  logger=None,
                  pklfile_prefix=None,
-                 submission_prefix=None,
-                 result_names=['pts_bbox']):
+                 submission_prefix=None):
         """Evaluation in KITTI protocol.

         Args:
@@ -178,18 +196,38 @@ class KittiDataset(Custom3DDataset):
                 If not specified, the submission data will not be generated.

         Returns:
-            dict[str: float]
+            dict[str: float]: results of each evaluation metric
         """
         result_files, tmp_dir = self.format_results(results, pklfile_prefix)
         from mmdet3d.core.evaluation import kitti_eval
         gt_annos = [info['annos'] for info in self.data_infos]
-        if metric == 'img_bbox':
-            ap_result_str, ap_dict = kitti_eval(
-                gt_annos, result_files, self.CLASSES, eval_types=['bbox'])
-        else:
-            ap_result_str, ap_dict = kitti_eval(gt_annos, result_files,
-                                                self.CLASSES)
-        print_log('\n' + ap_result_str, logger=logger)
+        if isinstance(result_files, dict):
+            ap_dict = dict()
+            for name, result_files_ in result_files.items():
+                eval_types = ['bbox', 'bev', '3d']
+                if 'img' in name:
+                    eval_types = ['bbox']
+                ap_result_str, ap_dict_ = kitti_eval(
+                    gt_annos, result_files_, self.CLASSES,
+                    eval_types=eval_types)
+                for ap_type, ap in ap_dict_.items():
+                    ap_dict[f'{name}/{ap_type}'] = float(
+                        '{:.4f}'.format(ap))
+                print_log(
+                    f'Results of {name}:\n' + ap_result_str, logger=logger)
+        else:
+            if metric == 'img_bbox':
+                ap_result_str, ap_dict = kitti_eval(
+                    gt_annos, result_files, self.CLASSES, eval_types=['bbox'])
+            else:
+                ap_result_str, ap_dict = kitti_eval(gt_annos, result_files,
+                                                    self.CLASSES)
+            print_log('\n' + ap_result_str, logger=logger)

         if tmp_dir is not None:
             tmp_dir.cleanup()
         return ap_dict
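In the new evaluate branch above, multi-modality results arrive as a dict of per-head result files ('pts_bbox', 'img_bbox'), and the per-head AP dicts are flattened into a single dict with the head name as a key prefix. A standalone illustration of that flattening loop (the metric names and values are made up):

ap_dicts = {
    'pts_bbox': {'KITTI/Car_3D_moderate': 79.31},   # hypothetical metrics
    'img_bbox': {'KITTI/Car_bbox_moderate': 88.02},
}
ap_dict = dict()
for name, ap_dict_ in ap_dicts.items():
    for ap_type, ap in ap_dict_.items():
        ap_dict[f'{name}/{ap_type}'] = float('{:.4f}'.format(ap))
assert 'pts_bbox/KITTI/Car_3D_moderate' in ap_dict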
mmdet3d/datasets/pipelines/__init__.py

 from mmdet.datasets.pipelines import Compose
-from .dbsampler import DataBaseSampler, MMDataBaseSampler
+from .dbsampler import DataBaseSampler
 from .formating import DefaultFormatBundle, DefaultFormatBundle3D
 from .indoor_augment import (IndoorFlipData, IndoorGlobalRotScaleTrans,
                              IndoorPointsColorJitter)
@@ -19,6 +19,6 @@ __all__ = [
     'Compose', 'LoadMultiViewImageFromFiles', 'LoadPointsFromFile',
     'DefaultFormatBundle', 'DefaultFormatBundle3D', 'DataBaseSampler',
     'IndoorGlobalRotScaleTrans', 'IndoorPointsColorJitter', 'IndoorFlipData',
-    'MMDataBaseSampler', 'NormalizePointsColor', 'LoadAnnotations3D',
-    'IndoorPointSample', 'PointSegClassMapping', 'MultiScaleFlipAug3D'
+    'NormalizePointsColor', 'LoadAnnotations3D', 'IndoorPointSample',
+    'PointSegClassMapping', 'MultiScaleFlipAug3D'
 ]
mmdet3d/datasets/pipelines/dbsampler.py

@@ -2,8 +2,6 @@ import copy
 import os
 import pickle

-import cv2
-import mmcv
 import numpy as np

 from mmdet3d.core.bbox import box_np_ops
@@ -263,255 +261,3 @@ class DataBaseSampler(object):
                     boxes[i, -1] - sp_boxes[i - num_gt, -1])
                 valid_samples.append(sampled[i - num_gt])
         return valid_samples

(the entire MMDataBaseSampler class below is removed by this commit)

@OBJECTSAMPLERS.register_module()
class MMDataBaseSampler(DataBaseSampler):

    def __init__(self,
                 info_path,
                 data_root,
                 rate,
                 prepare,
                 object_rot_range,
                 sample_groups,
                 classes=None,
                 check_2D_collision=False,
                 collision_thr=0,
                 collision_in_classes=False,
                 depth_consistent=False,
                 blending_type=None):
        super(MMDataBaseSampler, self).__init__(
            info_path=info_path,
            data_root=data_root,
            rate=rate,
            prepare=prepare,
            object_rot_range=object_rot_range,
            sample_groups=sample_groups,
            classes=classes)
        self.blending_type = blending_type
        self.depth_consistent = depth_consistent
        self.check_2D_collision = check_2D_collision
        self.collision_thr = collision_thr
        self.collision_in_classes = collision_in_classes

    def sample_all(self, gt_bboxes_3d, gt_names, gt_bboxes_2d=None, img=None):
        sampled_num_dict = {}
        sample_num_per_class = []
        for class_name, max_sample_num in zip(self.sample_classes,
                                              self.sample_max_nums):
            sampled_num = int(max_sample_num -
                              np.sum([n == class_name for n in gt_names]))
            sampled_num = np.round(self.rate * sampled_num).astype(np.int64)
            sampled_num_dict[class_name] = sampled_num
            sample_num_per_class.append(sampled_num)

        sampled = []
        sampled_gt_bboxes_3d = []
        sampled_gt_bboxes_2d = []
        avoid_coll_boxes_3d = gt_bboxes_3d
        avoid_coll_boxes_2d = gt_bboxes_2d

        for class_name, sampled_num in zip(self.sample_classes,
                                           sample_num_per_class):
            if sampled_num > 0:
                sampled_cls = self.sample_class_v2(class_name, sampled_num,
                                                   avoid_coll_boxes_3d,
                                                   avoid_coll_boxes_2d)

                sampled += sampled_cls
                if len(sampled_cls) > 0:
                    if len(sampled_cls) == 1:
                        sampled_gt_box_3d = sampled_cls[0]['box3d_lidar'][
                            np.newaxis, ...]
                        sampled_gt_box_2d = sampled_cls[0]['box2d_camera'][
                            np.newaxis, ...]
                    else:
                        sampled_gt_box_3d = np.stack(
                            [s['box3d_lidar'] for s in sampled_cls], axis=0)
                        sampled_gt_box_2d = np.stack(
                            [s['box2d_camera'] for s in sampled_cls], axis=0)

                    sampled_gt_bboxes_3d += [sampled_gt_box_3d]
                    sampled_gt_bboxes_2d += [sampled_gt_box_2d]
                    if self.collision_in_classes:
                        # TODO: check whether check collision check among
                        # classes is necessary
                        avoid_coll_boxes_3d = np.concatenate(
                            [avoid_coll_boxes_3d, sampled_gt_box_3d], axis=0)
                        avoid_coll_boxes_2d = np.concatenate(
                            [avoid_coll_boxes_2d, sampled_gt_box_2d], axis=0)

        ret = None
        if len(sampled) > 0:
            sampled_gt_bboxes_3d = np.concatenate(sampled_gt_bboxes_3d, axis=0)
            sampled_gt_bboxes_2d = np.concatenate(sampled_gt_bboxes_2d, axis=0)

            s_points_list = []
            count = 0

            if self.depth_consistent:
                # change the paster order based on distance
                center = sampled_gt_bboxes_3d[:, 0:3]
                paste_order = np.argsort(
                    -np.power(np.sum(np.power(center, 2), axis=-1), 1 / 2),
                    axis=-1)

            for idx in range(len(sampled)):
                if self.depth_consistent:
                    inds = np.where(paste_order == idx)[0][0]
                    info = sampled[inds]
                else:
                    info = sampled[idx]

                pcd_file_path = os.path.join(
                    self.data_root,
                    info['path']) if self.data_root else info['path']
                img_file_path = pcd_file_path + '.png'
                mask_file_path = pcd_file_path + '.mask.png'

                s_points = np.fromfile(
                    pcd_file_path, dtype=np.float32).reshape([-1, 4])
                s_patch = mmcv.imread(img_file_path)
                s_mask = mmcv.imread(mask_file_path, 'grayscale')

                if 'rot_transform' in info:
                    rot = info['rot_transform']
                    s_points[:, :3] = box_np_ops.rotation_points_single_angle(
                        s_points[:, :3], rot, axis=2)
                    # TODO: might need to rot 2d bbox in the future

                # the points of each sample already minus the object center
                # so this time it needs to add the offset back
                s_points[:, :3] += info['box3d_lidar'][:3]
                img = self.paste_obj(
                    img,
                    s_patch,
                    s_mask,
                    bbox_2d=info['box2d_camera'].astype(np.int32))

                count += 1
                s_points_list.append(s_points)

            ret = dict(
                img=img,
                gt_names=np.array([s['name'] for s in sampled]),
                difficulty=np.array([s['difficulty'] for s in sampled]),
                gt_bboxes_3d=sampled_gt_bboxes_3d,
                gt_bboxes_2d=sampled_gt_bboxes_2d,
                points=np.concatenate(s_points_list, axis=0),
                group_ids=np.arange(gt_bboxes_3d.shape[0],
                                    gt_bboxes_3d.shape[0] + len(sampled)))

        return ret

    def paste_obj(self, img, obj_img, obj_mask, bbox_2d):
        # paste the image patch back
        x1, y1, x2, y2 = bbox_2d
        # the bbox might exceed the img size because the img is different
        img_h, img_w = img.shape[:2]
        w = np.maximum(min(x2, img_w - 1) - x1 + 1, 1)
        h = np.maximum(min(y2, img_h - 1) - y1 + 1, 1)
        obj_mask = obj_mask[:h, :w]
        obj_img = obj_img[:h, :w]

        # choose a blend option
        if not self.blending_type:
            blending_op = 'none'
        else:
            blending_choice = np.random.randint(len(self.blending_type))
            blending_op = self.blending_type[blending_choice]

        if blending_op.find('poisson') != -1:
            # options: cv2.NORMAL_CLONE=1, or cv2.MONOCHROME_TRANSFER=3
            # cv2.MIXED_CLONE mixed the texture, thus is not used.
            if blending_op == 'poisson':
                mode = np.random.choice([1, 3], 1)[0]
            elif blending_op == 'poisson_normal':
                mode = cv2.NORMAL_CLONE
            elif blending_op == 'poisson_transfer':
                mode = cv2.MONOCHROME_TRANSFER
            else:
                raise NotImplementedError
            center = (int(x1 + w / 2), int(y1 + h / 2))
            img = cv2.seamlessClone(obj_img, img, obj_mask * 255, center, mode)
        else:
            if blending_op == 'gaussian':
                obj_mask = cv2.GaussianBlur(
                    obj_mask.astype(np.float32), (5, 5), 2)
            elif blending_op == 'box':
                obj_mask = cv2.blur(obj_mask.astype(np.float32), (3, 3))
            paste_mask = 1 - obj_mask
            img[y1:y1 + h, x1:x1 + w] = (
                img[y1:y1 + h, x1:x1 + w].astype(np.float32) *
                paste_mask[..., None]).astype(np.uint8)
            img[y1:y1 + h, x1:x1 + w] += (
                obj_img.astype(np.float32) *
                obj_mask[..., None]).astype(np.uint8)

        return img

    def sample_class_v2(self, name, num, gt_bboxes_3d, gt_bboxes_2d):
        sampled = self.sampler_dict[name].sample(num)
        sampled = copy.deepcopy(sampled)
        num_gt = gt_bboxes_3d.shape[0]
        num_sampled = len(sampled)

        # avoid collision in BEV first
        gt_bboxes_bv = box_np_ops.center_to_corner_box2d(
            gt_bboxes_3d[:, 0:2], gt_bboxes_3d[:, 3:5], gt_bboxes_3d[:, 6])
        sp_boxes = np.stack([i['box3d_lidar'] for i in sampled], axis=0)
        sp_boxes_bv = box_np_ops.center_to_corner_box2d(
            sp_boxes[:, 0:2], sp_boxes[:, 3:5], sp_boxes[:, 6])
        total_bv = np.concatenate([gt_bboxes_bv, sp_boxes_bv], axis=0)
        coll_mat = data_augment_utils.box_collision_test(total_bv, total_bv)

        # Then avoid collision in 2D space
        if self.check_2D_collision:
            sp_boxes_2d = np.stack([i['box2d_camera'] for i in sampled],
                                   axis=0)
            total_bbox_2d = np.concatenate([gt_bboxes_2d, sp_boxes_2d],
                                           axis=0)  # Nx4
            # random select a collision threshold
            if isinstance(self.collision_thr, float):
                collision_thr = self.collision_thr
            elif isinstance(self.collision_thr, list):
                collision_thr = np.random.choice(self.collision_thr)
            elif isinstance(self.collision_thr, dict):
                mode = self.collision_thr.get('mode', 'value')
                if mode == 'value':
                    collision_thr = np.random.choice(
                        self.collision_thr['thr_range'])
                elif mode == 'range':
                    collision_thr = np.random.uniform(
                        self.collision_thr['thr_range'][0],
                        self.collision_thr['thr_range'][1])

            if collision_thr == 0:
                # use similar collision test as BEV did
                # Nx4 (x1, y1, x2, y2) -> corners: Nx4x2
                # ((x1, y1), (x2, y1), (x1, y2), (x2, y2))
                x1y1 = total_bbox_2d[:, :2]
                x2y2 = total_bbox_2d[:, 2:]
                x1y2 = np.stack(
                    [total_bbox_2d[:, 0], total_bbox_2d[:, 3]], axis=-1)
                x2y1 = np.stack(
                    [total_bbox_2d[:, 2], total_bbox_2d[:, 1]], axis=-1)
                total_2d = np.stack([x1y1, x2y1, x1y2, x2y2], axis=1)
                coll_mat_2d = data_augment_utils.box_collision_test(
                    total_2d, total_2d)
            else:
                # use iof rather than iou to protect the foreground
                overlaps = box_np_ops.iou_jit(total_bbox_2d, total_bbox_2d,
                                              'iof')
                coll_mat_2d = overlaps > collision_thr
            coll_mat = coll_mat + coll_mat_2d

        diag = np.arange(total_bv.shape[0])
        coll_mat[diag, diag] = False

        valid_samples = []
        for i in range(num_gt, num_gt + num_sampled):
            if coll_mat[i].any():
                coll_mat[i] = False
                coll_mat[:, i] = False
            else:
                valid_samples.append(sampled[i - num_gt])
        return valid_samples
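The removed paste_obj above composites a sampled object patch onto the scene image with one of several blending modes ('box', 'gaussian', or Poisson seamless cloning). A self-contained sketch of the non-Poisson path, feathering the binary mask and alpha-blending it into the region of interest (inputs are assumed to be a uint8 image, a same-size uint8 patch, and a {0, 1} float mask):

import cv2
import numpy as np


def alpha_paste(img, patch, mask, x1, y1):
    # feather the hard {0, 1} mask so the pasted edge blends smoothly,
    # as the 'gaussian' option of the removed paste_obj does
    mask = cv2.GaussianBlur(mask.astype(np.float32), (5, 5), 2)
    h, w = patch.shape[:2]
    roi = img[y1:y1 + h, x1:x1 + w].astype(np.float32)
    blended = (roi * (1 - mask[..., None]) +
               patch.astype(np.float32) * mask[..., None])
    img[y1:y1 + h, x1:x1 + w] = blended.astype(np.uint8)
    return img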
mmdet3d/models/detectors/base.py

-from abc import ABCMeta, abstractmethod
-
-import torch.nn as nn
+from mmdet.models.detectors import BaseDetector


-class Base3DDetector(nn.Module, metaclass=ABCMeta):
+class Base3DDetector(BaseDetector):
     """Base class for detectors"""

-    def __init__(self):
-        super(Base3DDetector, self).__init__()
-        self.fp16_enabled = False
-
-    @property
-    def with_neck(self):
-        return hasattr(self, 'neck') and self.neck is not None
-
-    @property
-    def with_shared_head(self):
-        return hasattr(self, 'shared_head') and self.shared_head is not None
-
-    @property
-    def with_bbox(self):
-        return hasattr(self, 'bbox_head') and self.bbox_head is not None
-
-    @property
-    def with_mask(self):
-        return hasattr(self, 'mask_head') and self.mask_head is not None
-
-    @abstractmethod
-    def extract_feat(self, imgs):
-        pass
-
-    def extract_feats(self, imgs):
-        assert isinstance(imgs, list)
-        for img in imgs:
-            yield self.extract_feat(img)
-
-    @abstractmethod
-    def forward_train(self, **kwargs):
-        pass
-
-    @abstractmethod
-    def simple_test(self, **kwargs):
-        pass
-
-    @abstractmethod
-    def aug_test(self, **kwargs):
-        pass
-
-    def init_weights(self, pretrained=None):
-        if pretrained is not None:
-            from mmdet3d.utils import get_root_logger
-            logger = get_root_logger()
-            logger.info('load model from: {}'.format(pretrained))
-
-    def forward_test(self, points, img_metas, imgs=None, **kwargs):
+    def forward_test(self, points, img_metas, img=None, **kwargs):
         """
         Args:
             points (List[Tensor]): the outer list indicates test-time
@@ -62,7 +13,7 @@ class Base3DDetector(nn.Module, metaclass=ABCMeta):
             img_metas (List[List[dict]]): the outer list indicates test-time
                 augs (multiscale, flip, etc.) and the inner list indicates
                 images in a batch
-            imgs (List[Tensor], optional): the outer list indicates test-time
+            img (List[Tensor], optional): the outer list indicates test-time
                 augmentations and inner Tensor should have a shape NxCxHxW,
                 which contains all images in the batch. Defaults to None.
         """
@@ -81,10 +32,10 @@ class Base3DDetector(nn.Module, metaclass=ABCMeta):
         assert samples_per_gpu == 1

         if num_augs == 1:
-            imgs = [imgs] if imgs is None else imgs
-            return self.simple_test(points[0], img_metas[0], imgs[0], **kwargs)
+            img = [img] if img is None else img
+            return self.simple_test(points[0], img_metas[0], img[0], **kwargs)
         else:
-            return self.aug_test(points, img_metas, imgs, **kwargs)
+            return self.aug_test(points, img_metas, img, **kwargs)

     def forward(self, return_loss=True, **kwargs):
         """
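The refactor above removes roughly fifty lines because helpers such as with_bbox are now inherited from mmdet's BaseDetector instead of being redefined here. The inherited pattern is a plain attribute-presence check; a minimal standalone version for reference:

class Base:

    @property
    def with_bbox(self):
        # present-and-not-None check, in the style of mmdet's BaseDetector
        return hasattr(self, 'bbox_head') and self.bbox_head is not None


class Detector(Base):

    def __init__(self, bbox_head=None):
        self.bbox_head = bbox_head


assert Detector(bbox_head=object()).with_bbox
assert not Detector().with_bbox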
mmdet3d/models/detectors/mvx_two_stage.py

@@ -265,13 +265,13 @@ class MVXTwoStageDetector(Base3DDetector):
             proposal_list = proposals

-        # bbox head forward and loss
-        img_roi_losses = self.roi_head.forward_train(x, img_metas,
-                                                     proposal_list, gt_bboxes,
-                                                     gt_labels,
-                                                     gt_bboxes_ignore,
-                                                     **kwargs)
-        losses.update(img_roi_losses)
+        if self.with_img_bbox:
+            # bbox head forward and loss
+            img_roi_losses = self.img_roi_head.forward_train(
+                x, img_metas, proposal_list, gt_bboxes, gt_labels,
+                gt_bboxes_ignore, **kwargs)
+            losses.update(img_roi_losses)
         return losses

     def simple_test_img(self, x, img_metas, proposals=None, rescale=False):
mmdet3d/models/detectors/parta2.py

@@ -2,12 +2,17 @@ import torch
 import torch.nn.functional as F

 from mmdet3d.ops import Voxelization
-from mmdet.models import DETECTORS, TwoStageDetector
+from mmdet.models import DETECTORS
 from .. import builder
+from .two_stage import TwoStage3DDetector


 @DETECTORS.register_module()
-class PartA2(TwoStageDetector):
+class PartA2(TwoStage3DDetector):
+    """Part-A2 detector
+
+    Please refer to the `paper <https://arxiv.org/abs/1907.03670>`_
+    """

     def __init__(self,
                  voxel_layer,
@@ -111,41 +116,6 @@ class PartA2(TwoStage3DDetector):
         return losses

-    def forward_test(self, points, img_metas, imgs=None, **kwargs):
-        """
-        Args:
-            points (List[Tensor]): the outer list indicates test-time
-                augmentations and inner Tensor should have a shape NxC,
-                which contains all points in the batch.
-            img_metas (List[List[dict]]): the outer list indicates test-time
-                augs (multiscale, flip, etc.) and the inner list indicates
-                images in a batch
-        """
-        for var, name in [(points, 'points'), (img_metas, 'img_metas')]:
-            if not isinstance(var, list):
-                raise TypeError('{} must be a list, but got {}'.format(
-                    name, type(var)))
-
-        num_augs = len(points)
-        if num_augs != len(img_metas):
-            raise ValueError(
-                'num of augmentations ({}) != num of image meta ({})'.format(
-                    len(points), len(img_metas)))
-        # TODO: remove the restriction of imgs_per_gpu == 1 when prepared
-        samples_per_gpu = len(points[0])
-        assert samples_per_gpu == 1
-
-        if num_augs == 1:
-            return self.simple_test(points[0], img_metas[0], **kwargs)
-        else:
-            return self.aug_test(points, img_metas, **kwargs)
-
-    def forward(self, return_loss=True, **kwargs):
-        if return_loss:
-            return self.forward_train(**kwargs)
-        else:
-            return self.forward_test(**kwargs)
-
     def simple_test(self, points, img_metas, proposals=None, rescale=False):
         feats_dict, voxels_dict = self.extract_feat(points, img_metas)
@@ -159,6 +129,3 @@ class PartA2(TwoStage3DDetector):
         return self.roi_head.simple_test(feats_dict, voxels_dict, img_metas,
                                          proposal_list)
-
-    def aug_test(self, **kwargs):
-        raise NotImplementedError
mmdet3d/models/detectors/single_stage.py

@@ -6,6 +6,21 @@ from .base import Base3DDetector
 @DETECTORS.register_module()
 class SingleStage3DDetector(Base3DDetector):
+    """SingleStage3DDetector
+
+    This class serves as a base class for single-stage 3D detectors.
+
+    Args:
+        backbone (dict): Config dict of detector's backbone.
+        neck (dict, optional): Config dict of neck. Defaults to None.
+        bbox_head (dict, optional): Config dict of box head. Defaults to None.
+        train_cfg (dict, optional): Config dict of training hyper-parameters.
+            Defaults to None.
+        test_cfg (dict, optional): Config dict of test hyper-parameters.
+            Defaults to None.
+        pretrained (str, optional): Path of pretrained models.
+            Defaults to None.
+    """

     def __init__(self,
                  backbone,
mmdet3d/models/detectors/two_stage.py  (new file, 0 → 100644)

from mmdet.models import DETECTORS, TwoStageDetector

from .base import Base3DDetector


@DETECTORS.register_module()
class TwoStage3DDetector(Base3DDetector, TwoStageDetector):
    """Base class of two-stage 3D detector

    It inherits original ``:class:TwoStageDetector`` and
    ``:class:Base3DDetector``. This class could serve as a base class for
    all two-stage 3D detectors.
    """

    def __init__(self, **kwargs):
        super(TwoStage3DDetector, self).__init__(**kwargs)
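Because TwoStage3DDetector lists Base3DDetector before TwoStageDetector, Python's MRO resolves any method defined on both parents to the 3D variant while the 2D-only helpers remain inherited. A toy demonstration of that lookup order (the classes here are stand-ins, not the real detectors):

class Base3D:  # stand-in for Base3DDetector

    def forward_test(self):
        return '3D forward_test'


class TwoStage2D:  # stand-in for mmdet's TwoStageDetector

    def forward_test(self):
        return '2D forward_test'

    def with_rpn(self):
        return True


class TwoStage3D(Base3D, TwoStage2D):
    pass


det = TwoStage3D()
assert det.forward_test() == '3D forward_test'  # the 3D override wins
assert det.with_rpn()  # 2D-only helpers are still inherited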
mmdet3d/models/fusion_layers/point_fusion.py

@@ -287,7 +287,7 @@ class PointFusion(nn.Module):
             pts.new_tensor(img_meta['pcd_rotation']) if 'pcd_rotation'
             in img_meta.keys() else torch.eye(3).type_as(pts).to(pts.device))
         img_scale_factor = (
-            img_meta['scale_factor']
+            pts.new_tensor(img_meta['scale_factor'][:2])
             if 'scale_factor' in img_meta.keys() else 1)
         pcd_flip = img_meta['pcd_flip'] if 'pcd_flip' in img_meta.keys(
         ) else False
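The one-line fix above turns the per-image scale factor into a tensor with the same dtype and device as the points and keeps only its first two components, so it can rescale projected 2D coordinates in a single broadcasted multiply. A minimal sketch of the effect (the metadata values are illustrative):

import torch

pts = torch.rand(5, 4)
img_meta = dict(scale_factor=[0.5, 0.5, 0.5, 0.5])
img_scale_factor = (
    pts.new_tensor(img_meta['scale_factor'][:2])
    if 'scale_factor' in img_meta.keys() else 1)
# an Nx2 tensor of projected pixel coordinates can now be rescaled directly
uv = torch.rand(5, 2) * 100
uv_scaled = uv * img_scale_factor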
requirements/runtime.txt

 matplotlib
-mmcv>=0.5.1
+mmcv>=0.6.0
 numba==0.48.0
 numpy
 # need older pillow until torchvision is fixed
tests/test_config.py

@@ -115,10 +115,8 @@ def test_config_data_pipeline():
     print('Found config_dpath = {!r}'.format(config_dpath))

     # Only tests a representative subset of configurations
-    # TODO: test pipelines using Albu, current Albu throw None given empty GT
     config_names = [
         'mvxnet/faster_rcnn_r50_fpn_caffe_2x8_1x_nus.py',
-        'mvxnet/retinanet_r50_fpn_caffe_2x8_1x_nus.py',
         'mvxnet/'
         'faster_rcnn_r50_fpn_caffe_1x_kitti-2d-3class_coco-3x-pretrain.py',
     ]
tests/test_forward.py

@@ -82,10 +82,10 @@ def _test_two_stage_forward(cfg_file):
         gt_masks=gt_masks,
         return_loss=True)
     assert isinstance(losses, dict)
-    loss, _ = detector._parse_losses(losses)
-    loss.requires_grad_(True)
-    assert float(loss.item()) > 0
-    loss.backward()
+    from mmdet.apis.train import parse_losses
+    total_loss = parse_losses(losses)[0].requires_grad_(True)
+    assert float(total_loss.item()) > 0
+    total_loss.backward()

     # Test forward train with an empty truth batch
     mm_inputs = _demo_mm_inputs(input_shape, num_items=[0])
@@ -102,10 +102,9 @@ def _test_two_stage_forward(cfg_file):
         gt_masks=gt_masks,
         return_loss=True)
     assert isinstance(losses, dict)
-    loss, _ = detector._parse_losses(losses)
-    assert float(loss.item()) > 0
-    loss.backward()
+    from mmdet.apis.train import parse_losses
+    total_loss = parse_losses(losses)[0].requires_grad_(True)
+    assert float(total_loss.item()) > 0
+    total_loss.backward()

     # Test forward test
     with torch.no_grad():
@@ -140,6 +139,8 @@ def _test_single_stage_forward(cfg_file):
         gt_labels=gt_labels,
         return_loss=True)
     assert isinstance(losses, dict)
+    loss, _ = detector._parse_losses(losses)
+    assert float(loss.item()) > 0

     # Test forward test
     with torch.no_grad():
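The two-stage tests now reduce the raw loss dict through mmdet's parse_losses instead of the detector's private _parse_losses. As a rough sketch of what that reduction does (simplified; the real helper also assembles log_vars for logging), summing every loss term into one scalar looks like this:

import torch


def parse_losses_sketch(losses):
    # sum each tensor, or list of tensors, in the loss dict into one scalar
    return sum(
        v.mean() if isinstance(v, torch.Tensor) else sum(x.mean() for x in v)
        for v in losses.values())


losses = dict(loss_cls=torch.tensor(0.7), loss_bbox=[torch.tensor(0.2)])
assert float(parse_losses_sketch(losses)) > 0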
tools/train.py

@@ -12,11 +12,10 @@ from mmcv import Config, DictAction
 from mmcv.runner import init_dist

 from mmdet3d import __version__
-from mmdet3d.apis import train_detector
 from mmdet3d.datasets import build_dataset
 from mmdet3d.models import build_detector
 from mmdet3d.utils import collect_env, get_root_logger
-from mmdet.apis import set_random_seed
+from mmdet.apis import set_random_seed, train_detector


 def parse_args():