Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
mmdetection3d
Commits
7c6810e3
Commit
7c6810e3
authored
Jun 08, 2022
by
VVsssssk
Committed by
ChaimZhu
Jul 20, 2022
Browse files
[Refactor]Refactor pointpillars model interface
parent
49a1e555
Changes
13
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
957 additions
and
354 deletions
+957
-354
configs/_base_/models/hv_pointpillars_secfpn_kitti.py
configs/_base_/models/hv_pointpillars_secfpn_kitti.py
+11
-9
configs/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py
...ntpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py
+2
-2
mmdet3d/core/bbox/assigners/__init__.py
mmdet3d/core/bbox/assigners/__init__.py
+2
-1
mmdet3d/core/bbox/assigners/max_3d_iou_assigner.py
mmdet3d/core/bbox/assigners/max_3d_iou_assigner.py
+156
-0
mmdet3d/core/bbox/samplers/__init__.py
mmdet3d/core/bbox/samplers/__init__.py
+2
-2
mmdet3d/core/bbox/samplers/pseudosample.py
mmdet3d/core/bbox/samplers/pseudosample.py
+61
-0
mmdet3d/core/post_processing/merge_augs.py
mmdet3d/core/post_processing/merge_augs.py
+11
-10
mmdet3d/models/dense_heads/anchor3d_head.py
mmdet3d/models/dense_heads/anchor3d_head.py
+227
-86
mmdet3d/models/dense_heads/train_mixins.py
mmdet3d/models/dense_heads/train_mixins.py
+62
-58
mmdet3d/models/detectors/base.py
mmdet3d/models/detectors/base.py
+183
-104
mmdet3d/models/detectors/single_stage.py
mmdet3d/models/detectors/single_stage.py
+21
-15
mmdet3d/models/detectors/voxelnet.py
mmdet3d/models/detectors/voxelnet.py
+80
-67
tests/test_models/test_detectors.py
tests/test_models/test_detectors.py
+139
-0
No files found.
configs/_base_/models/hv_pointpillars_secfpn_kitti.py
View file @
7c6810e3
...
...
@@ -48,34 +48,36 @@ model = dict(
diff_rad_by_sin
=
True
,
bbox_coder
=
dict
(
type
=
'DeltaXYZWLHRBBoxCoder'
),
loss_cls
=
dict
(
type
=
'FocalLoss'
,
type
=
'
mmdet.
FocalLoss'
,
use_sigmoid
=
True
,
gamma
=
2.0
,
alpha
=
0.25
,
loss_weight
=
1.0
),
loss_bbox
=
dict
(
type
=
'SmoothL1Loss'
,
beta
=
1.0
/
9.0
,
loss_weight
=
2.0
),
loss_bbox
=
dict
(
type
=
'mmdet.SmoothL1Loss'
,
beta
=
1.0
/
9.0
,
loss_weight
=
2.0
),
loss_dir
=
dict
(
type
=
'CrossEntropyLoss'
,
use_sigmoid
=
False
,
loss_weight
=
0.2
)),
type
=
'mmdet.CrossEntropyLoss'
,
use_sigmoid
=
False
,
loss_weight
=
0.2
)),
# model training and testing settings
train_cfg
=
dict
(
assigner
=
[
dict
(
# for Pedestrian
type
=
'MaxIoUAssigner'
,
iou_calculator
=
dict
(
type
=
'BboxOverlapsNearest3D'
),
type
=
'Max
3D
IoUAssigner'
,
iou_calculator
=
dict
(
type
=
'
mmdet3d.
BboxOverlapsNearest3D'
),
pos_iou_thr
=
0.5
,
neg_iou_thr
=
0.35
,
min_pos_iou
=
0.35
,
ignore_iof_thr
=-
1
),
dict
(
# for Cyclist
type
=
'MaxIoUAssigner'
,
iou_calculator
=
dict
(
type
=
'BboxOverlapsNearest3D'
),
type
=
'Max
3D
IoUAssigner'
,
iou_calculator
=
dict
(
type
=
'
mmdet3d.
BboxOverlapsNearest3D'
),
pos_iou_thr
=
0.5
,
neg_iou_thr
=
0.35
,
min_pos_iou
=
0.35
,
ignore_iof_thr
=-
1
),
dict
(
# for Car
type
=
'MaxIoUAssigner'
,
iou_calculator
=
dict
(
type
=
'BboxOverlapsNearest3D'
),
type
=
'Max
3D
IoUAssigner'
,
iou_calculator
=
dict
(
type
=
'
mmdet3d.
BboxOverlapsNearest3D'
),
pos_iou_thr
=
0.6
,
neg_iou_thr
=
0.45
,
min_pos_iou
=
0.45
,
...
...
configs/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py
View file @
7c6810e3
...
...
@@ -17,8 +17,8 @@ model = dict(
train_cfg
=
dict
(
_delete_
=
True
,
assigner
=
dict
(
type
=
'MaxIoUAssigner'
,
iou_calculator
=
dict
(
type
=
'BboxOverlapsNearest3D'
),
type
=
'Max
3D
IoUAssigner'
,
iou_calculator
=
dict
(
type
=
'
mmdet3d.
BboxOverlapsNearest3D'
),
pos_iou_thr
=
0.6
,
neg_iou_thr
=
0.45
,
min_pos_iou
=
0.45
,
...
...
mmdet3d/core/bbox/assigners/__init__.py
View file @
7c6810e3
# Copyright (c) OpenMMLab. All rights reserved.
from
mmdet.core.bbox
import
AssignResult
,
BaseAssigner
,
MaxIoUAssigner
from
mmdet.core.bbox
import
AssignResult
,
BaseAssigner
from
.max_3d_iou_assigner
import
MaxIoUAssigner
__all__
=
[
'BaseAssigner'
,
'MaxIoUAssigner'
,
'AssignResult'
]
mmdet3d/core/bbox/assigners/max_3d_iou_assigner.py
0 → 100644
View file @
7c6810e3
# Copyright (c) OpenMMLab. All rights reserved.
from
typing
import
Optional
,
Union
from
mmengine.data
import
InstanceData
from
mmdet3d.registry
import
TASK_UTILS
from
mmdet.core.bbox.assigners
import
MaxIoUAssigner
from
..assigners
import
AssignResult
@TASK_UTILS.register_module()
class Max3DIoUAssigner(MaxIoUAssigner):
    # TODO: This is a temporary box assigner.
    """Assign a corresponding gt bbox or background to each bbox.

    Each proposal will be assigned with `-1`, or a semi-positive integer
    indicating the ground truth index.

    - -1: negative sample, no assigned gt
    - semi-positive integer: positive sample, index (0-based) of assigned gt

    Args:
        pos_iou_thr (float): IoU threshold for positive bboxes.
        neg_iou_thr (float or tuple): IoU threshold for negative bboxes.
        min_pos_iou (float): Minimum iou for a bbox to be considered as a
            positive bbox. Positive samples can have smaller IoU than
            pos_iou_thr due to the 4th step (assign max IoU sample to each
            gt). `min_pos_iou` is set to avoid assigning bboxes that have
            extremely small iou with GT as positive samples.
        gt_max_assign_all (bool): Whether to assign all bboxes with the same
            highest overlap with some gt to that gt.
        ignore_iof_thr (float): IoF threshold for ignoring bboxes (if
            `gt_bboxes_ignore` is specified). Negative values mean not
            ignoring any bboxes.
        ignore_wrt_candidates (bool): Whether to compute the iof between
            `bboxes` and `gt_bboxes_ignore`, or the contrary.
        match_low_quality (bool): Whether to allow low quality matches. This
            is usually allowed for RPN and single stage detectors, but not
            allowed in the second stage. Details are demonstrated in Step 4.
        gpu_assign_thr (int): The upper bound of the number of GT for GPU
            assign. When the number of gt is above this threshold, will
            assign on CPU device. Negative values mean not assign on CPU.
        iou_calculator (dict): Config of overlaps Calculator.
    """

    def __init__(self,
                 pos_iou_thr: float,
                 neg_iou_thr: Union[float, tuple],
                 min_pos_iou: float = 0.0,
                 gt_max_assign_all: bool = True,
                 ignore_iof_thr: float = -1,
                 ignore_wrt_candidates: bool = True,
                 match_low_quality: bool = True,
                 gpu_assign_thr: float = -1,
                 iou_calculator: dict = dict(type='BboxOverlaps2D')):
        # NOTE: the config-dict default mirrors the mmdet convention and is
        # never mutated, so sharing one default instance is safe here.
        self.pos_iou_thr = pos_iou_thr
        self.neg_iou_thr = neg_iou_thr
        self.min_pos_iou = min_pos_iou
        self.gt_max_assign_all = gt_max_assign_all
        self.ignore_iof_thr = ignore_iof_thr
        self.ignore_wrt_candidates = ignore_wrt_candidates
        self.gpu_assign_thr = gpu_assign_thr
        self.match_low_quality = match_low_quality
        # Build the IoU calculator through the registry so configs can swap
        # in 2D or 3D overlap implementations.
        self.iou_calculator = TASK_UTILS.build(iou_calculator)

    def assign(self,
               pred_instances: InstanceData,
               gt_instances: InstanceData,
               gt_instances_ignore: Optional[InstanceData] = None,
               **kwargs) -> AssignResult:
        """Assign gt to bboxes.

        This method assigns a gt bbox to every bbox (proposal/anchor); each
        bbox will be assigned with -1, or a semi-positive number. -1 means
        negative sample, semi-positive number is the index (0-based) of the
        assigned gt. The assignment is done in the following steps, and the
        order matters:

        1. assign every bbox to the background
        2. assign proposals whose iou with all gts < neg_iou_thr to 0
        3. for each bbox, if the iou with its nearest gt >= pos_iou_thr,
           assign it to that bbox
        4. for each gt bbox, assign its nearest proposals (may be more than
           one) to itself

        Args:
            pred_instances (:obj:`InstanceData`): Instances of model
                predictions. It includes ``priors``, and the priors can
                be anchors or points, or the bboxes predicted by the
                previous stage, has shape (n, 4). The bboxes predicted by
                the current model or stage will be named ``bboxes``,
                ``labels``, and ``scores``, the same as the ``InstanceData``
                in other places.
            gt_instances (:obj:`InstanceData`): Ground truth of instance
                annotations. It usually includes ``bboxes``, with shape
                (k, 4), and ``labels``, with shape (k, ).
            gt_instances_ignore (:obj:`InstanceData`, optional): Instances
                to be ignored during training. It includes ``bboxes``
                attribute data that is ignored during training and testing.
                Defaults to None.

        Returns:
            :obj:`AssignResult`: The assign result.

        Example:
            >>> from mmengine.data import InstanceData
            >>> self = MaxIoUAssigner(0.5, 0.5)
            >>> pred_instances = InstanceData()
            >>> pred_instances.priors = torch.Tensor([[0, 0, 10, 10],
            ...                                       [10, 10, 20, 20]])
            >>> gt_instances = InstanceData()
            >>> gt_instances.bboxes = torch.Tensor([[0, 0, 10, 9]])
            >>> gt_instances.labels = torch.Tensor([0])
            >>> assign_result = self.assign(pred_instances, gt_instances)
            >>> expected_gt_inds = torch.LongTensor([1, 0])
            >>> assert torch.all(assign_result.gt_inds == expected_gt_inds)
        """
        # 3D instances carry their boxes/labels under *_3d attribute names.
        gt_bboxes = gt_instances.bboxes_3d
        priors = pred_instances.priors
        gt_labels = gt_instances.labels_3d
        if gt_instances_ignore is not None:
            gt_bboxes_ignore = gt_instances_ignore.bboxes_3d
        else:
            gt_bboxes_ignore = None

        # Direct boolean expression instead of `True if ... else False`.
        assign_on_cpu = (self.gpu_assign_thr > 0
                         and gt_bboxes.shape[0] > self.gpu_assign_thr)
        # compute overlap and assign gt on CPU when number of GT is large
        if assign_on_cpu:
            device = priors.device
            priors = priors.cpu()
            gt_bboxes = gt_bboxes.cpu()
            gt_labels = gt_labels.cpu()
            if gt_bboxes_ignore is not None:
                gt_bboxes_ignore = gt_bboxes_ignore.cpu()

        overlaps = self.iou_calculator(gt_bboxes, priors)

        # Mask out priors that overlap ignored gt boxes beyond the IoF
        # threshold so they contribute neither positives nor negatives.
        if (self.ignore_iof_thr > 0 and gt_bboxes_ignore is not None
                and gt_bboxes_ignore.numel() > 0 and priors.numel() > 0):
            if self.ignore_wrt_candidates:
                ignore_overlaps = self.iou_calculator(
                    priors, gt_bboxes_ignore, mode='iof')
                ignore_max_overlaps, _ = ignore_overlaps.max(dim=1)
            else:
                ignore_overlaps = self.iou_calculator(
                    gt_bboxes_ignore, priors, mode='iof')
                ignore_max_overlaps, _ = ignore_overlaps.max(dim=0)
            overlaps[:, ignore_max_overlaps > self.ignore_iof_thr] = -1

        assign_result = self.assign_wrt_overlaps(overlaps, gt_labels)
        if assign_on_cpu:
            # Move results back to the original device after CPU assignment.
            assign_result.gt_inds = assign_result.gt_inds.to(device)
            assign_result.max_overlaps = assign_result.max_overlaps.to(device)
            if assign_result.labels is not None:
                assign_result.labels = assign_result.labels.to(device)
        return assign_result
mmdet3d/core/bbox/samplers/__init__.py
View file @
7c6810e3
...
...
@@ -2,9 +2,9 @@
from
mmdet.core.bbox.samplers
import
(
BaseSampler
,
CombinedSampler
,
InstanceBalancedPosSampler
,
IoUBalancedNegSampler
,
OHEMSampler
,
PseudoSampler
,
RandomSampler
,
SamplingResult
)
RandomSampler
,
SamplingResult
)
from
.iou_neg_piecewise_sampler
import
IoUNegPiecewiseSampler
from
.pseudosample
import
PseudoSampler
__all__
=
[
'BaseSampler'
,
'PseudoSampler'
,
'RandomSampler'
,
...
...
mmdet3d/core/bbox/samplers/pseudosample.py
0 → 100644
View file @
7c6810e3
# Copyright (c) OpenMMLab. All rights reserved.
import
torch
from
mmengine.data
import
InstanceData
from
mmdet3d.registry
import
TASK_UTILS
from
mmdet.core.bbox.assigners
import
AssignResult
from
..samplers
import
BaseSampler
,
SamplingResult
@TASK_UTILS.register_module()
class PseudoSampler(BaseSampler):
    """A pseudo sampler that does not do sampling actually."""
    # TODO: This is a temporary pseudo sampler.

    def __init__(self, **kwargs):
        # Intentionally no state and no super().__init__(): this sampler
        # performs no subsampling, so none of the base configuration applies.
        pass

    def _sample_pos(self, **kwargs):
        """Sample positive samples."""
        raise NotImplementedError

    def _sample_neg(self, **kwargs):
        """Sample negative samples."""
        raise NotImplementedError

    def sample(self, assign_result: AssignResult,
               pred_instances: InstanceData, gt_instances: InstanceData,
               *args, **kwargs):
        """Directly returns the positive and negative indices of samples.

        Args:
            assign_result (:obj:`AssignResult`): Bbox assigning results.
            pred_instances (:obj:`InstaceData`): Instances of model
                predictions. It includes ``priors``, and the priors can
                be anchors, points, or bboxes predicted by the model,
                shape(n, 4).
            gt_instances (:obj:`InstaceData`): Ground truth of instance
                annotations. It usually includes ``bboxes`` and ``labels``
                attributes.

        Returns:
            :obj:`SamplingResult`: sampler results
        """
        boxes_gt = gt_instances.bboxes_3d
        anchor_priors = pred_instances.priors

        # Every anchor matched to a gt (index > 0) is a positive; every
        # anchor explicitly matched to background (index == 0) is a negative.
        assigned = assign_result.gt_inds
        positive_idx = torch.nonzero(
            assigned > 0, as_tuple=False).squeeze(-1).unique()
        negative_idx = torch.nonzero(
            assigned == 0, as_tuple=False).squeeze(-1).unique()

        # No priors originate from gt boxes here, so the flag vector is all
        # zeros.
        flags = anchor_priors.new_zeros(
            anchor_priors.shape[0], dtype=torch.uint8)
        return SamplingResult(
            pos_inds=positive_idx,
            neg_inds=negative_idx,
            priors=anchor_priors,
            gt_bboxes=boxes_gt,
            assign_result=assign_result,
            gt_flags=flags,
            avg_factor_with_neg=False)
mmdet3d/core/post_processing/merge_augs.py
View file @
7c6810e3
...
...
@@ -5,14 +5,14 @@ from mmdet3d.core.post_processing import nms_bev, nms_normal_bev
from
..bbox
import
bbox3d2result
,
bbox3d_mapping_back
,
xywhr2xyxyr
def
merge_aug_bboxes_3d
(
aug_results
,
img
_metas
,
test_cfg
):
def
merge_aug_bboxes_3d
(
aug_results
,
aug_batch_input
_metas
,
test_cfg
):
"""Merge augmented detection 3D bboxes and scores.
Args:
aug_results (list[dict]): The dict of detection results.
The dict contains the following keys
- boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox.
-
b
boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox.
- scores_3d (torch.Tensor): Detection scores.
- labels_3d (torch.Tensor): Predicted box labels.
img_metas (list[dict]): Meta information of each sample.
...
...
@@ -21,26 +21,27 @@ def merge_aug_bboxes_3d(aug_results, img_metas, test_cfg):
Returns:
dict: Bounding boxes results in cpu mode, containing merged results.
- boxes_3d (:obj:`BaseInstance3DBoxes`): Merged detection bbox.
-
b
boxes_3d (:obj:`BaseInstance3DBoxes`): Merged detection bbox.
- scores_3d (torch.Tensor): Merged detection scores.
- labels_3d (torch.Tensor): Merged predicted box labels.
"""
assert
len
(
aug_results
)
==
len
(
img
_metas
),
\
assert
len
(
aug_results
)
==
len
(
aug_batch_input
_metas
),
\
'"aug_results" should have the same length as "img_metas", got len('
\
f
'aug_results)=
{
len
(
aug_results
)
}
and len(img_metas)=
{
len
(
img_metas
)
}
'
f
'aug_results)=
{
len
(
aug_results
)
}
and '
\
f
'len(img_metas)=
{
len
(
aug_batch_input_metas
)
}
'
recovered_bboxes
=
[]
recovered_scores
=
[]
recovered_labels
=
[]
for
bboxes
,
i
mg
_info
in
zip
(
aug_results
,
img
_metas
):
scale_factor
=
i
mg
_info
[
0
][
'pcd_scale_factor'
]
pcd_horizontal_flip
=
i
mg
_info
[
0
][
'pcd_horizontal_flip'
]
pcd_vertical_flip
=
i
mg
_info
[
0
][
'pcd_vertical_flip'
]
for
bboxes
,
i
nput
_info
in
zip
(
aug_results
,
aug_batch_input
_metas
):
scale_factor
=
i
nput
_info
[
'pcd_scale_factor'
]
pcd_horizontal_flip
=
i
nput
_info
[
'pcd_horizontal_flip'
]
pcd_vertical_flip
=
i
nput
_info
[
'pcd_vertical_flip'
]
recovered_scores
.
append
(
bboxes
[
'scores_3d'
])
recovered_labels
.
append
(
bboxes
[
'labels_3d'
])
bboxes
=
bbox3d_mapping_back
(
bboxes
[
'boxes_3d'
],
scale_factor
,
bboxes
=
bbox3d_mapping_back
(
bboxes
[
'
b
boxes_3d'
],
scale_factor
,
pcd_horizontal_flip
,
pcd_vertical_flip
)
recovered_bboxes
.
append
(
bboxes
)
...
...
mmdet3d/models/dense_heads/anchor3d_head.py
View file @
7c6810e3
This diff is collapsed.
Click to expand it.
mmdet3d/models/dense_heads/train_mixins.py
View file @
7c6810e3
# Copyright (c) OpenMMLab. All rights reserved.
import
numpy
as
np
import
torch
from
mmengine.data
import
InstanceData
from
mmdet3d.core
import
limit_period
from
mmdet.core
import
images_to_levels
,
multi_apply
...
...
@@ -11,10 +12,9 @@ class AnchorTrainMixin(object):
def
anchor_target_3d
(
self
,
anchor_list
,
gt_bboxes_list
,
input_metas
,
gt_bboxes_ignore_list
=
None
,
gt_labels_list
=
None
,
batch_gt_instances_3d
,
batch_input_metas
,
batch_gt_instances_ignore
=
None
,
label_channels
=
1
,
num_classes
=
1
,
sampling
=
True
):
...
...
@@ -22,11 +22,10 @@ class AnchorTrainMixin(object):
Args:
anchor_list (list[list]): Multi level anchors of each image.
gt_bboxes_list
(list[:obj:`
Base
Instance
3DBoxes
`]): Ground truth
batch_gt_instances_3d
(list[:obj:`Instance
Data
`]): Ground truth
bboxes of each image.
input_metas (list[dict]): Meta info of each image.
gt_bboxes_ignore_list (list): Ignore list of gt bboxes.
gt_labels_list (list[torch.Tensor]): Gt labels of batches.
batch_input_metas (list[dict]): Meta info of each image.
batch_gt_instances_ignore (list): Ignore list of gt bboxes.
label_channels (int): The channel of labels.
num_classes (int): The number of classes.
sampling (bool): Whether to sample anchors.
...
...
@@ -38,8 +37,8 @@ class AnchorTrainMixin(object):
direction weights, number of positive anchors and
number of negative anchors.
"""
num_i
mg
s
=
len
(
input_metas
)
assert
len
(
anchor_list
)
==
num_i
mg
s
num_i
nput
s
=
len
(
batch_
input_metas
)
assert
len
(
anchor_list
)
==
num_i
nput
s
if
isinstance
(
anchor_list
[
0
][
0
],
list
):
# sizes of anchors are different
...
...
@@ -48,7 +47,7 @@ class AnchorTrainMixin(object):
sum
([
anchor
.
size
(
0
)
for
anchor
in
anchors
])
for
anchors
in
anchor_list
[
0
]
]
for
i
in
range
(
num_i
mg
s
):
for
i
in
range
(
num_i
nput
s
):
anchor_list
[
i
]
=
anchor_list
[
i
][
0
]
else
:
# anchor number of multi levels
...
...
@@ -57,24 +56,21 @@ class AnchorTrainMixin(object):
for
anchors
in
anchor_list
[
0
]
]
# concat all level anchors and flags to a single tensor
for
i
in
range
(
num_i
mg
s
):
for
i
in
range
(
num_i
nput
s
):
anchor_list
[
i
]
=
torch
.
cat
(
anchor_list
[
i
])
# compute targets for each image
if
gt_bboxes_ignore_list
is
None
:
gt_bboxes_ignore_list
=
[
None
for
_
in
range
(
num_imgs
)]
if
gt_labels_list
is
None
:
gt_labels_list
=
[
None
for
_
in
range
(
num_imgs
)]
if
batch_gt_instances_ignore
is
None
:
batch_gt_instances_ignore
=
[
None
for
_
in
range
(
num_inputs
)]
(
all_labels
,
all_label_weights
,
all_bbox_targets
,
all_bbox_weights
,
all_dir_targets
,
all_dir_weights
,
pos_inds_list
,
neg_inds_list
)
=
multi_apply
(
self
.
anchor_target_3d_single
,
anchor_list
,
gt_bboxes_list
,
gt_bboxes_ignore_list
,
gt_labels_list
,
input_metas
,
batch_gt_instances_3d
,
batch_gt_instances_ignore
,
batch_input_metas
,
label_channels
=
label_channels
,
num_classes
=
num_classes
,
sampling
=
sampling
)
...
...
@@ -101,9 +97,8 @@ class AnchorTrainMixin(object):
def
anchor_target_3d_single
(
self
,
anchors
,
gt_bboxes
,
gt_bboxes_ignore
,
gt_labels
,
gt_instance_3d
,
gt_instance_ignore
,
input_meta
,
label_channels
=
1
,
num_classes
=
1
,
...
...
@@ -112,9 +107,8 @@ class AnchorTrainMixin(object):
Args:
anchors (torch.Tensor): Concatenated multi-level anchor.
gt_bboxes (:obj:`BaseInstance3DBoxes`): Gt bboxes.
gt_bboxes_ignore (torch.Tensor): Ignored gt bboxes.
gt_labels (torch.Tensor): Gt class labels.
gt_instance_3d (:obj:`InstanceData`): Gt bboxes.
gt_instance_ignore (:obj:`InstanceData`): Ignored gt bboxes.
input_meta (dict): Meta info of each image.
label_channels (int): The channel of labels.
num_classes (int): The number of classes.
...
...
@@ -137,15 +131,19 @@ class AnchorTrainMixin(object):
-
1
,
self
.
box_code_size
)
current_anchor_num
+=
current_anchors
.
size
(
0
)
if
self
.
assign_per_class
:
gt_per_cls
=
(
gt_labels
==
i
)
gt_per_cls
=
(
gt_instance_3d
.
labels_3d
==
i
)
gt_per_cls_instance
=
InstanceData
()
gt_per_cls_instance
.
labels_3d
=
gt_instance_3d
.
labels_3d
[
gt_per_cls
]
gt_per_cls_instance
.
bboxes_3d
=
gt_instance_3d
.
bboxes_3d
[
gt_per_cls
,
:]
anchor_targets
=
self
.
anchor_target_single_assigner
(
assigner
,
current_anchors
,
gt_bboxes
[
gt_per_cls
,
:],
gt_bboxes_ignore
,
gt_labels
[
gt_per_cls
],
input_meta
,
num_classes
,
sampling
)
assigner
,
current_anchors
,
gt_per_cls_instance
,
gt_instance_ignore
,
input_meta
,
num_classes
,
sampling
)
else
:
anchor_targets
=
self
.
anchor_target_single_assigner
(
assigner
,
current_anchors
,
gt_
bboxes
,
gt_bboxes_ignore
,
gt_
labels
,
input_meta
,
num_classes
,
sampling
)
assigner
,
current_anchors
,
gt_
instance_3d
,
gt_
instance_ignore
,
input_meta
,
num_classes
,
sampling
)
(
labels
,
label_weights
,
bbox_targets
,
bbox_weights
,
dir_targets
,
dir_weights
,
pos_inds
,
neg_inds
)
=
anchor_targets
...
...
@@ -194,15 +192,19 @@ class AnchorTrainMixin(object):
current_anchors
=
anchors
[
i
]
current_anchor_num
+=
current_anchors
.
size
(
0
)
if
self
.
assign_per_class
:
gt_per_cls
=
(
gt_labels
==
i
)
gt_per_cls
=
(
gt_instance_3d
.
labels_3d
==
i
)
gt_per_cls_instance
=
InstanceData
()
gt_per_cls_instance
.
labels_3d
=
gt_instance_3d
.
labels_3d
[
gt_per_cls
]
gt_per_cls_instance
.
bboxes_3d
=
gt_instance_3d
.
bboxes_3d
[
gt_per_cls
,
:]
anchor_targets
=
self
.
anchor_target_single_assigner
(
assigner
,
current_anchors
,
gt_bboxes
[
gt_per_cls
,
:],
gt_bboxes_ignore
,
gt_labels
[
gt_per_cls
],
input_meta
,
num_classes
,
sampling
)
assigner
,
current_anchors
,
gt_per_cls_instance
,
gt_instance_ignore
,
input_meta
,
num_classes
,
sampling
)
else
:
anchor_targets
=
self
.
anchor_target_single_assigner
(
assigner
,
current_anchors
,
gt_
bboxes
,
gt_bboxes_ignore
,
gt_
labels
,
input_meta
,
num_classes
,
sampling
)
assigner
,
current_anchors
,
gt_
instance_3d
,
gt_
instance_ignore
,
input_meta
,
num_classes
,
sampling
)
(
labels
,
label_weights
,
bbox_targets
,
bbox_weights
,
dir_targets
,
dir_weights
,
pos_inds
,
neg_inds
)
=
anchor_targets
...
...
@@ -230,17 +232,16 @@ class AnchorTrainMixin(object):
total_pos_inds
,
total_neg_inds
)
else
:
return
self
.
anchor_target_single_assigner
(
self
.
bbox_assigner
,
anchors
,
gt_
bboxes
,
gt_
bboxes
_ignore
,
gt_labels
,
input_meta
,
num_classes
,
sampling
)
anchors
,
gt_
instance_3d
,
gt_
instance
_ignore
,
input_meta
,
num_classes
,
sampling
)
def
anchor_target_single_assigner
(
self
,
bbox_assigner
,
anchors
,
gt_bboxes
,
gt_bboxes_ignore
,
gt_labels
,
gt_instance_3d
,
gt_instance_ignore
,
input_meta
,
num_classes
=
1
,
sampling
=
True
):
...
...
@@ -249,9 +250,8 @@ class AnchorTrainMixin(object):
Args:
bbox_assigner (BaseAssigner): assign positive and negative boxes.
anchors (torch.Tensor): Concatenated multi-level anchor.
gt_bboxes (:obj:`BaseInstance3DBoxes`): Gt bboxes.
gt_bboxes_ignore (torch.Tensor): Ignored gt bboxes.
gt_labels (torch.Tensor): Gt class labels.
gt_instance_3d (:obj:`InstanceData`): Gt bboxes.
gt_instance_ignore (torch.Tensor): Ignored gt bboxes.
input_meta (dict): Meta info of each image.
num_classes (int): The number of classes.
sampling (bool): Whether to sample anchors.
...
...
@@ -267,13 +267,17 @@ class AnchorTrainMixin(object):
dir_weights
=
anchors
.
new_zeros
((
anchors
.
shape
[
0
]),
dtype
=
torch
.
float
)
labels
=
anchors
.
new_zeros
(
num_valid_anchors
,
dtype
=
torch
.
long
)
label_weights
=
anchors
.
new_zeros
(
num_valid_anchors
,
dtype
=
torch
.
float
)
if
len
(
gt_bboxes
)
>
0
:
if
not
isinstance
(
gt_bboxes
,
torch
.
Tensor
):
gt_bboxes
=
gt_bboxes
.
tensor
.
to
(
anchors
.
device
)
assign_result
=
bbox_assigner
.
assign
(
anchors
,
gt_bboxes
,
gt_bboxes_ignore
,
gt_labels
)
sampling_result
=
self
.
bbox_sampler
.
sample
(
assign_result
,
anchors
,
gt_bboxes
)
if
len
(
gt_instance_3d
.
bboxes_3d
)
>
0
:
if
not
isinstance
(
gt_instance_3d
.
bboxes_3d
,
torch
.
Tensor
):
gt_instance_3d
.
bboxes_3d
=
gt_instance_3d
.
bboxes_3d
.
tensor
.
to
(
anchors
.
device
)
pred_instance_3d
=
InstanceData
(
priors
=
anchors
)
assign_result
=
bbox_assigner
.
assign
(
pred_instance_3d
,
gt_instance_3d
,
gt_instance_ignore
)
sampling_result
=
self
.
bbox_sampler
.
sample
(
assign_result
,
pred_instance_3d
,
gt_instance_3d
)
pos_inds
=
sampling_result
.
pos_inds
neg_inds
=
sampling_result
.
neg_inds
else
:
...
...
@@ -284,7 +288,7 @@ class AnchorTrainMixin(object):
anchors
.
new_zeros
((
anchors
.
shape
[
0
],
),
dtype
=
torch
.
bool
)
==
0
,
as_tuple
=
False
).
squeeze
(
-
1
).
unique
()
if
gt_labels
is
not
None
:
if
gt_
instance_3d
.
labels
_3d
is
not
None
:
labels
+=
num_classes
if
len
(
pos_inds
)
>
0
:
pos_bbox_targets
=
self
.
bbox_coder
.
encode
(
...
...
@@ -300,10 +304,10 @@ class AnchorTrainMixin(object):
dir_targets
[
pos_inds
]
=
pos_dir_targets
dir_weights
[
pos_inds
]
=
1.0
if
gt_labels
is
None
:
if
gt_
instance_3d
.
labels
_3d
is
None
:
labels
[
pos_inds
]
=
1
else
:
labels
[
pos_inds
]
=
gt_labels
[
labels
[
pos_inds
]
=
gt_
instance_3d
.
labels
_3d
[
sampling_result
.
pos_assigned_gt_inds
]
if
self
.
train_cfg
.
pos_weight
<=
0
:
label_weights
[
pos_inds
]
=
1.0
...
...
mmdet3d/models/detectors/base.py
View file @
7c6810e3
# Copyright (c) OpenMMLab. All rights reserved.
from
os
import
path
as
osp
from
typing
import
Dict
,
List
,
Optional
,
Union
import
mmcv
import
torch
from
mm
cv.parallel
import
DataContainer
as
DC
from
mmcv.runner
import
auto_fp16
from
mm
engine.data
import
InstanceData
from
torch.optim
import
Optimizer
from
mmdet3d.core
import
Box3DMode
,
Coord3DMode
,
show_result
from
mmdet3d.core
import
Det3DDataSample
from
mmdet3d.registry
import
MODELS
from
mmdet.core.utils
import
stack_batch
from
mmdet.models.detectors
import
BaseDetector
@
MODELS
.
register_module
()
class
Base3DDetector
(
BaseDetector
):
"""Base class for detectors.
"""
"""Base class for
3D
detectors.
def
forward_test
(
self
,
points
,
img_metas
,
img
=
None
,
**
kwargs
):
Args:
preprocess_cfg (dict, optional): Model preprocessing config
for processing the input data. it usually includes
``to_rgb``, ``pad_size_divisor``, ``pad_value``,
``mean`` and ``std``. Default to None.
init_cfg (dict, optional): the config to control the
initialization. Default to None.
"""
def
__init__
(
self
,
preprocess_cfg
:
Optional
[
dict
]
=
None
,
init_cfg
:
Optional
[
dict
]
=
None
)
->
None
:
super
(
Base3DDetector
,
self
).
__init__
(
preprocess_cfg
=
preprocess_cfg
,
init_cfg
=
init_cfg
)
def
forward_simple_test
(
self
,
batch_inputs_dict
:
Dict
[
List
,
torch
.
Tensor
],
batch_data_samples
:
List
[
Det3DDataSample
],
**
kwargs
)
->
List
[
Det3DDataSample
]:
"""
Args:
points (list[torch.Tensor]): the outer list indicates test-time
augmentations and inner torch.Tensor should have a shape NxC,
which contains all points in the batch.
img_metas (list[list[dict]]): the outer list indicates test-time
augs (multiscale, flip, etc.) and the inner list indicates
images in a batch
img (list[torch.Tensor], optional): the outer
list indicates test-time augmentations and inner
torch.Tensor should have a shape NxCxHxW, which contains
all images in the batch. Defaults to None.
batch_inputs_dict (dict): The model input dict which include
'points', 'img' keys.
- points (list[torch.Tensor]): Point cloud of each sample.
- imgs (torch.Tensor, optional): Image of each sample.
batch_data_samples (List[:obj:`DetDataSample`]): The Data
Samples. It usually includes information such as
`gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
Returns:
list(obj:`Det3DDataSample`): Detection results of the
input images. Each DetDataSample usually contains
``pred_instances_3d`` or ``pred_panoptic_seg_3d`` or
``pred_sem_seg_3d``.
"""
for
var
,
name
in
[(
points
,
'points'
),
(
img_metas
,
'img_metas'
)]:
batch_size
=
len
(
batch_data_samples
)
batch_input_metas
=
[]
if
batch_size
!=
len
(
batch_inputs_dict
[
'points'
]):
raise
ValueError
(
'num of augmentations ({}) != num of image meta ({})'
.
format
(
len
(
batch_inputs_dict
[
'points'
]),
len
(
batch_input_metas
)))
for
batch_index
in
range
(
batch_size
):
metainfo
=
batch_data_samples
[
batch_index
].
metainfo
batch_input_metas
.
append
(
metainfo
)
for
var
,
name
in
[(
batch_inputs_dict
[
'points'
],
'points'
),
(
batch_input_metas
,
'img_metas'
)]:
if
not
isinstance
(
var
,
list
):
raise
TypeError
(
'{} must be a list, but got {}'
.
format
(
name
,
type
(
var
)))
num_augs
=
len
(
points
)
if
num_augs
!=
len
(
img_metas
):
raise
ValueError
(
'num of augmentations ({}) != num of image meta ({})'
.
format
(
len
(
points
),
len
(
img_metas
)))
if
num_augs
==
1
:
img
=
[
img
]
if
img
is
None
else
img
return
self
.
simple_test
(
points
[
0
],
img_metas
[
0
],
img
[
0
],
**
kwargs
)
if
batch_size
==
1
:
return
self
.
simple_test
(
batch_inputs_dict
,
batch_input_metas
,
rescale
=
True
,
**
kwargs
)
else
:
return
self
.
aug_test
(
points
,
img_metas
,
img
,
**
kwargs
)
@
auto_fp16
(
apply_to
=
(
'img'
,
'points'
))
def
forward
(
self
,
return_loss
=
True
,
**
kwargs
):
"""Calls either forward_train or forward_test depending on whether
return_loss=True.
Note this setting will change the expected inputs. When
`return_loss=True`, img and img_metas are single-nested (i.e.
torch.Tensor and list[dict]), and when `resturn_loss=False`, img and
img_metas should be double nested (i.e. list[torch.Tensor],
list[list[dict]]), with the outer list indicating test time
augmentations.
return
self
.
aug_test
(
batch_inputs_dict
,
batch_input_metas
,
rescale
=
True
,
**
kwargs
)
def forward(self,
            data: List[dict],
            optimizer: Optional[Union[Optimizer, dict]] = None,
            return_loss: bool = False,
            **kwargs):
    """The iteration step during training and testing.

    This method defines an iteration step during training and testing,
    except for the back propagation and optimizer updating during training,
    which are done in an optimizer scheduler.

    Args:
        data (list[dict]): The output of dataloader.
        optimizer (:obj:`torch.optim.Optimizer`, dict, Optional): The
            optimizer of runner. This argument is unused and reserved.
            Default to None.
        return_loss (bool): Whether to return loss. In general,
            it will be set to True during training and False
            during testing. Default to False.

    Returns:
        during training
            dict: It should contain at least 3 keys: ``loss``,
            ``log_vars``, ``num_samples``.

            - ``loss`` is a tensor for back propagation, which can be a
              weighted sum of multiple losses.
            - ``log_vars`` contains all the variables to be sent to the
              logger.
            - ``num_samples`` indicates the batch size (when the model
              is DDP, it means the batch size on each GPU), which is
              used for averaging the logs.

        during testing
            list(obj:`Det3DDataSample`): Detection results of the
            input samples. Each DetDataSample usually contains
            ``pred_instances_3d`` or ``pred_panoptic_seg_3d`` or
            ``pred_sem_seg_3d``.
    """
    # Move inputs / data samples to the model device and batch them.
    batch_inputs_dict, batch_data_samples = self.preprocess_data(data)

    # Guard clause: test path first keeps the training path unnested.
    if not return_loss:
        return self.forward_simple_test(batch_inputs_dict,
                                        batch_data_samples, **kwargs)

    raw_losses = self.forward_train(batch_inputs_dict, batch_data_samples,
                                    **kwargs)
    loss, log_vars = self._parse_losses(raw_losses)
    return dict(
        loss=loss,
        log_vars=log_vars,
        num_samples=len(batch_data_samples))
def preprocess_data(self, data: List[dict]) -> tuple:
    """Process input data during training and simple testing phases.

    Args:
        data (list[dict]): The data to be processed, which
            comes from dataloader.

    Returns:
        tuple: It should contain 2 items.

            - batch_inputs_dict (dict): The model input dict which includes
              'points', 'imgs' keys.

                - points (list[torch.Tensor]): Point cloud of each sample.
                - imgs (torch.Tensor, optional): Image of each sample.

            - batch_data_samples (list[:obj:`Det3DDataSample`]): The Data
              Samples. It usually includes information such as
              `gt_instance_3d`, `gt_instances`.

    Raises:
        KeyError: If the per-sample input dict does not carry a
            'points' entry.
    """
    # Move every data sample onto the model's device.
    batch_data_samples = [
        data_['data_sample'].to(self.device) for data_ in data
    ]

    if 'points' in data[0]['inputs'].keys():
        points = [
            data_['inputs']['points'].to(self.device) for data_ in data
        ]
    else:
        raise KeyError("Model input dict needs to include the 'points' key.")

    if 'img' in data[0]['inputs'].keys():
        imgs = [data_['inputs']['img'].to(self.device) for data_ in data]
    else:
        imgs = None

    # No image preprocessing configured: stack images (if any) as-is.
    if self.preprocess_cfg is None:
        batch_inputs_dict = {
            'points': points,
            'imgs': stack_batch(imgs).float() if imgs is not None else None
        }
        return batch_inputs_dict, batch_data_samples

    # BUGFIX: the original unconditionally normalized ``imgs`` below, which
    # raised a TypeError when ``preprocess_cfg`` was set but the samples
    # carried no images. Short-circuit to a points-only batch instead.
    if imgs is None:
        return {'points': points, 'imgs': None}, batch_data_samples

    # Channels-first BGR -> RGB conversion when requested.
    if self.to_rgb and imgs[0].size(0) == 3:
        imgs = [_img[[2, 1, 0], ...] for _img in imgs]
    # Per-pixel normalization with the configured statistics.
    imgs = [(_img - self.pixel_mean) / self.pixel_std for _img in imgs]
    batch_img = stack_batch(imgs, self.pad_size_divisor, self.pad_value)
    batch_inputs_dict = {'points': points, 'imgs': batch_img}
    return batch_inputs_dict, batch_data_samples
def postprocess_result(self, results_list: List[InstanceData]) \
        -> List[Det3DDataSample]:
    """Convert a list of results to `Det3DDataSample` objects.

    Args:
        results_list (list[:obj:`InstanceData`]): Detection results of
            each sample.

    Returns:
        list[:obj:`Det3DDataSample`]: Detection results of the
        input sample. Each Det3DDataSample usually contains
        'pred_instances_3d'. And the ``pred_instances_3d`` usually
        contains following keys.

        - scores_3d (Tensor): Classification scores, has a shape
          (num_instances, )
        - labels_3d (Tensor): Labels of bboxes, has a shape
          (num_instances, ).
        - bboxes_3d (:obj:`BaseInstance3DBoxes`): Prediction of bboxes,
          contains a tensor with shape (num_instances, 7).
    """
    # Wrap each InstanceData into a Det3DDataSample in place.
    for idx, pred_instances in enumerate(results_list):
        packed = Det3DDataSample()
        packed.pred_instances_3d = pred_instances
        results_list[idx] = packed
    return results_list
def show_results(self, data, result, out_dir, show=False, score_thr=None):
    """Results visualization.

    Placeholder kept for interface compatibility; the visualization logic
    has not been ported to the refactored interface yet.
    """
    # TODO
    pass
mmdet3d/models/detectors/single_stage.py
View file @
7c6810e3
# Copyright (c) OpenMMLab. All rights reserved.
from
typing
import
List
,
Optional
import
torch
from
mmdet3d.registry
import
MODELS
from
.base
import
Base3DDetector
...
...
@@ -23,13 +27,15 @@ class SingleStage3DDetector(Base3DDetector):
def
__init__
(
self
,
backbone
,
neck
=
None
,
bbox_head
=
None
,
train_cfg
=
None
,
test_cfg
=
None
,
init_cfg
=
None
,
pretrained
=
None
):
super
(
SingleStage3DDetector
,
self
).
__init__
(
init_cfg
)
neck
:
Optional
[
dict
]
=
None
,
bbox_head
:
Optional
[
dict
]
=
None
,
train_cfg
:
Optional
[
dict
]
=
None
,
test_cfg
:
Optional
[
dict
]
=
None
,
preprocess_cfg
:
Optional
[
dict
]
=
None
,
init_cfg
:
Optional
[
dict
]
=
None
,
pretrained
:
Optional
[
str
]
=
None
)
->
None
:
super
(
SingleStage3DDetector
,
self
).
__init__
(
preprocess_cfg
=
preprocess_cfg
,
init_cfg
=
init_cfg
)
self
.
backbone
=
MODELS
.
build
(
backbone
)
if
neck
is
not
None
:
self
.
neck
=
MODELS
.
build
(
neck
)
...
...
@@ -39,12 +45,12 @@ class SingleStage3DDetector(Base3DDetector):
self
.
train_cfg
=
train_cfg
self
.
test_cfg
=
test_cfg
def
forward_dummy
(
self
,
points
)
:
def
forward_dummy
(
self
,
batch_inputs
:
dict
)
->
tuple
:
"""Used for computing network flops.
See `mmdetection/tools/analysis_tools/get_flops.py`
"""
x
=
self
.
extract_feat
(
points
)
x
=
self
.
extract_feat
(
batch_inputs
[
'
points
'
]
)
try
:
sample_mod
=
self
.
train_cfg
.
sample_mod
outs
=
self
.
bbox_head
(
x
,
sample_mod
)
...
...
@@ -52,20 +58,20 @@ class SingleStage3DDetector(Base3DDetector):
outs
=
self
.
bbox_head
(
x
)
return
outs
def extract_feat(self, points: List[torch.Tensor]) -> list:
    """Directly extract features from the backbone+neck.

    Args:
        points (List[torch.Tensor]): Input points.
    """
    # Only the first element of ``points`` is fed to the backbone.
    feats = self.backbone(points[0])
    return self.neck(feats) if self.with_neck else feats
def extract_feats(self, batch_inputs_dict: dict) -> list:
    """Extract features of multiple samples."""
    per_sample_feats = []
    # One extract_feat call per point cloud in the batch.
    for pts in batch_inputs_dict['points']:
        per_sample_feats.append(self.extract_feat([pts]))
    return per_sample_feats
mmdet3d/models/detectors/voxelnet.py
View file @
7c6810e3
# Copyright (c) OpenMMLab. All rights reserved.
from
typing
import
Dict
,
List
,
Optional
import
torch
from
mmcv.ops
import
Voxelization
from
mmcv.runner
import
force_fp32
from
torch.nn
import
functional
as
F
from
mmdet3d.core
import
bbox3d2result
,
merge_aug_bboxes_3d
from
mmdet3d.core
import
Det3DDataSample
from
mmdet3d.registry
import
MODELS
from
.single_stage
import
SingleStage3DDetector
...
...
@@ -14,16 +16,16 @@ class VoxelNet(SingleStage3DDetector):
r
"""`VoxelNet <https://arxiv.org/abs/1711.06396>`_ for 3D detection."""
def
__init__
(
self
,
voxel_layer
,
voxel_encoder
,
middle_encoder
,
backbone
,
neck
=
None
,
bbox_head
=
None
,
train_cfg
=
None
,
test_cfg
=
None
,
init_cfg
=
None
,
pretrained
=
None
)
:
voxel_layer
:
dict
,
voxel_encoder
:
dict
,
middle_encoder
:
dict
,
backbone
:
dict
,
neck
:
Optional
[
dict
]
=
None
,
bbox_head
:
Optional
[
dict
]
=
None
,
train_cfg
:
Optional
[
dict
]
=
None
,
test_cfg
:
Optional
[
dict
]
=
None
,
init_cfg
:
Optional
[
dict
]
=
None
,
pretrained
:
Optional
[
str
]
=
None
)
->
None
:
super
(
VoxelNet
,
self
).
__init__
(
backbone
=
backbone
,
neck
=
neck
,
...
...
@@ -36,7 +38,7 @@ class VoxelNet(SingleStage3DDetector):
self
.
voxel_encoder
=
MODELS
.
build
(
voxel_encoder
)
self
.
middle_encoder
=
MODELS
.
build
(
middle_encoder
)
def
extract_feat
(
self
,
points
,
img_metas
=
None
)
:
def
extract_feat
(
self
,
points
:
List
[
torch
.
Tensor
])
->
list
:
"""Extract features from points."""
voxels
,
num_points
,
coors
=
self
.
voxelize
(
points
)
voxel_features
=
self
.
voxel_encoder
(
voxels
,
num_points
,
coors
)
...
...
@@ -49,7 +51,7 @@ class VoxelNet(SingleStage3DDetector):
@
torch
.
no_grad
()
@
force_fp32
()
def
voxelize
(
self
,
points
)
:
def
voxelize
(
self
,
points
:
List
[
torch
.
Tensor
])
->
tuple
:
"""Apply hard voxelization to points."""
voxels
,
coors
,
num_points
=
[],
[],
[]
for
res
in
points
:
...
...
@@ -66,64 +68,75 @@ class VoxelNet(SingleStage3DDetector):
coors_batch
=
torch
.
cat
(
coors_batch
,
dim
=
0
)
return
voxels
,
num_points
,
coors_batch
def forward_train(self, batch_inputs_dict: Dict[str, List[torch.Tensor]],
                  batch_data_samples: List[Det3DDataSample],
                  **kwargs) -> dict:
    """
    Args:
        batch_inputs_dict (dict): The model input dict. It should contain
            ``points`` and ``img`` keys.

            - points (list[torch.Tensor]): Point cloud of each sample.
            - imgs (torch.Tensor, optional): Image of each sample.
        batch_data_samples (list[:obj:`Det3DDataSample`]): The batch
            data samples. It usually includes information such
            as `gt_instance_3d` or `gt_panoptic_seg_3d` or `gt_sem_seg_3d`.

    Returns:
        dict[str, Tensor]: A dictionary of loss components.
    """
    # FIX: the annotation was ``Dict[list, torch.Tensor]``, which is not a
    # valid mapping type for the actual ``{'points': [...]}`` input.
    x = self.extract_feat(batch_inputs_dict['points'])
    losses = self.bbox_head.forward_train(x, batch_data_samples, **kwargs)
    return losses
def simple_test(self, batch_inputs_dict: Dict[str, List[torch.Tensor]],
                batch_input_metas: List[dict],
                rescale: bool = False) -> list:
    """Test function without test-time augmentation.

    Args:
        batch_inputs_dict (dict): The model input dict. It should contain
            ``points`` and ``img`` keys.

            - points (list[torch.Tensor]): Point cloud of single
              sample.
            - imgs (torch.Tensor, optional): Image of single sample.
        batch_input_metas (list[dict]): List of input information.
        rescale (bool, optional): Whether to rescale the results.
            Defaults to False.

    Returns:
        list[:obj:`Det3DDataSample`]: Detection results of the \
        inputs. Each Det3DDataSample usually contain \
        'pred_instances_3d'. And the ``pred_instances_3d`` usually \
        contains following keys.

        - scores_3d (Tensor): Classification scores, has a shape
          (num_instances, )
        - labels_3d (Tensor): Labels of bboxes, has a shape
          (num_instances, ).
        - bboxes_3d (:obj:`BaseInstance3DBoxes`): Prediction of bboxes,
          contains a tensor with shape (num_instances, 7).
    """
    # FIX: annotation was the invalid ``Dict[list, torch.Tensor]``.
    x = self.extract_feat(batch_inputs_dict['points'])
    bboxes_list = self.bbox_head.simple_test(
        x, batch_input_metas, rescale=rescale)

    # Convert raw InstanceData results to Det3DDataSample.
    results_list = self.postprocess_result(bboxes_list)
    return results_list
def aug_test(self, aug_batch_inputs_dict: Dict[str, List[torch.Tensor]],
             aug_batch_input_metas: List[dict],
             rescale: bool = False) -> list:
    """Test function with augmentation.

    Args:
        aug_batch_inputs_dict (dict): Augmented model inputs; its
            ``points`` entry holds the point clouds of each augmentation.
        aug_batch_input_metas (list[dict]): Input information of each
            augmentation.
        rescale (bool, optional): Whether to rescale the results.
            Defaults to False.

    Returns:
        list: Merged detection results over the augmentations.
    """
    # FIX: docstring typo ("augmentaiton") and the invalid annotation
    # ``Dict[list, torch.Tensor]`` corrected.
    # TODO Refactor this after mmdet update
    feats = self.extract_feats(aug_batch_inputs_dict)
    aug_bboxes = self.bbox_head.aug_test(
        feats, aug_batch_input_metas, rescale=rescale)
    return aug_bboxes
tests/test_models/test_detectors.py
0 → 100644
View file @
7c6810e3
# Copyright (c) OpenMMLab. All rights reserved.
import
copy
import
random
from
os.path
import
dirname
,
exists
,
join
import
numpy
as
np
import
pytest
import
torch
from
mmengine.data
import
InstanceData
from
mmdet3d.core
import
Det3DDataSample
from
mmdet3d.core.bbox
import
LiDARInstance3DBoxes
from
mmdet3d.registry
import
MODELS
def _setup_seed(seed):
    """Seed python, numpy and torch RNGs and make cudnn deterministic."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
def _get_config_directory():
    """Find the predefined detector config directory."""
    try:
        # Assume we are running inside the source mmdetection3d repo.
        repo_dir = dirname(dirname(dirname(__file__)))
    except NameError:
        # For IPython development where __file__ is undefined.
        import mmdet3d
        repo_dir = dirname(dirname(mmdet3d.__file__))
    cfg_dir = join(repo_dir, 'configs')
    if not exists(cfg_dir):
        raise Exception('Cannot find config path')
    return cfg_dir
def _get_config_module(fname):
    """Load a configuration as a python module."""
    from mmcv import Config
    return Config.fromfile(join(_get_config_directory(), fname))
def _get_model_cfg(fname):
    """Grab configs necessary to create a model.

    These are deep copied to allow for safe modification of parameters
    without influencing other tests.
    """
    return copy.deepcopy(_get_config_module(fname).model)
def _get_detector_cfg(fname):
    """Grab configs necessary to create a detector.

    These are deep copied to allow for safe modification of parameters
    without influencing other tests.
    """
    import mmcv
    cfg = _get_config_module(fname)
    model = copy.deepcopy(cfg.model)
    # Attach deep-copied train/test configs wrapped as mmcv Configs.
    model.update(
        train_cfg=mmcv.Config(copy.deepcopy(cfg.model.train_cfg)),
        test_cfg=mmcv.Config(copy.deepcopy(cfg.model.test_cfg)))
    return model
def test_voxel_net():
    """Smoke-test VoxelNet (PointPillars config): simple_test, forward_train
    and aug_test paths on CUDA."""
    import mmdet3d.models

    assert hasattr(mmdet3d.models, 'VoxelNet')

    # The model is built on GPU below, so skip on CPU-only machines.
    if not torch.cuda.is_available():
        pytest.skip('test requires GPU and torch+cuda')
    _setup_seed(0)
    voxel_net_cfg = _get_detector_cfg(
        'pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class.py')
    model = MODELS.build(voxel_net_cfg).cuda()

    # Two random point clouds of slightly different sizes (N, xyzr).
    input_dict0 = dict(points=torch.rand([2010, 4], device='cuda'))
    input_dict1 = dict(points=torch.rand([2020, 4], device='cuda'))

    # Sample 0: 20 random GT boxes with labels in [0, 3).
    gt_instance_3d_0 = InstanceData()
    gt_instance_3d_0.bboxes_3d = LiDARInstance3DBoxes(
        torch.rand([20, 7], device='cuda'))
    gt_instance_3d_0.labels_3d = torch.randint(0, 3, [20], device='cuda')
    data_sample_0 = Det3DDataSample(
        metainfo=dict(box_type_3d=LiDARInstance3DBoxes))
    data_sample_0.gt_instances_3d = gt_instance_3d_0

    # Sample 1: 50 random GT boxes.
    gt_instance_3d_1 = InstanceData()
    gt_instance_3d_1.bboxes_3d = LiDARInstance3DBoxes(
        torch.rand([50, 7], device='cuda'))
    gt_instance_3d_1.labels_3d = torch.randint(0, 3, [50], device='cuda')
    data_sample_1 = Det3DDataSample(
        metainfo=dict(box_type_3d=LiDARInstance3DBoxes))
    data_sample_1.gt_instances_3d = gt_instance_3d_1
    data = [dict(inputs=input_dict0, data_sample=data_sample_0)]

    # test simple_test
    with torch.no_grad():
        results = model.forward(data, return_loss=False)
    bboxes_3d = results[0].pred_instances_3d['bboxes_3d']
    scores_3d = results[0].pred_instances_3d['scores_3d']
    labels_3d = results[0].pred_instances_3d['labels_3d']
    # 50 predictions expected — presumably the config's NMS/post max_num;
    # TODO confirm against the test_cfg of the pointpillars config.
    assert bboxes_3d.tensor.shape == (50, 7)
    assert scores_3d.shape == torch.Size([50])
    assert labels_3d.shape == torch.Size([50])

    # test forward_train
    data = [
        dict(inputs=input_dict0, data_sample=data_sample_0),
        dict(inputs=input_dict1, data_sample=data_sample_1)
    ]
    losses = model.forward(data, return_loss=True)
    # All loss components must be non-negative.
    assert losses['log_vars']['loss_cls'] >= 0
    assert losses['log_vars']['loss_bbox'] >= 0
    assert losses['log_vars']['loss_dir'] >= 0
    assert losses['log_vars']['loss'] >= 0

    # test_aug_test
    # Metainfo required by the augmented-test bbox merging.
    metainfo = {
        'pcd_scale_factor': 1,
        'pcd_horizontal_flip': 1,
        'pcd_vertical_flip': 1,
        'box_type_3d': LiDARInstance3DBoxes
    }
    data_sample_0.set_metainfo(metainfo)
    data_sample_1.set_metainfo(metainfo)
    data = [
        dict(inputs=input_dict0, data_sample=data_sample_0),
        dict(inputs=input_dict1, data_sample=data_sample_1)
    ]
    results = model.forward(data, return_loss=False)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment