Commit c9ad3605 authored by jshilong's avatar jshilong Committed by ChaimZhu
Browse files

[Refactor]New version VoteNet

parent db44cc50
# dataset settings # dataset settings
dataset_type = 'ScanNetDataset' dataset_type = 'ScanNetDataset'
data_root = './data/scannet/' data_root = './data/scannet/'
class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
'bookshelf', 'picture', 'counter', 'desk', 'curtain', metainfo = dict(
'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub', CLASSES=('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
'garbagebin') 'bookshelf', 'picture', 'counter', 'desk', 'curtain',
'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
'garbagebin'))
train_pipeline = [ train_pipeline = [
dict( dict(
type='LoadPointsFromFile', type='LoadPointsFromFile',
...@@ -35,9 +37,8 @@ train_pipeline = [ ...@@ -35,9 +37,8 @@ train_pipeline = [
rot_range=[-0.087266, 0.087266], rot_range=[-0.087266, 0.087266],
scale_ratio_range=[1.0, 1.0], scale_ratio_range=[1.0, 1.0],
shift_height=True), shift_height=True),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict( dict(
type='Collect3D', type='Pack3DDetInputs',
keys=[ keys=[
'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask', 'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask',
'pts_instance_mask' 'pts_instance_mask'
...@@ -68,61 +69,51 @@ test_pipeline = [ ...@@ -68,61 +69,51 @@ test_pipeline = [
flip_ratio_bev_horizontal=0.5, flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5), flip_ratio_bev_vertical=0.5),
dict(type='PointSample', num_points=40000), dict(type='PointSample', num_points=40000),
dict( ]),
type='DefaultFormatBundle3D', dict(type='Pack3DDetInputs', keys=['points'])
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
# construct a pipeline for data and gt loading in show function
# please keep its loading function consistent with test_pipeline (e.g. client)
eval_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
shift_height=False,
load_dim=6,
use_dim=[0, 1, 2]),
dict(type='GlobalAlignment', rotation_axis=2),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
] ]
data = dict( train_dataloader = dict(
samples_per_gpu=8, batch_size=8,
workers_per_gpu=4, num_workers=4,
train=dict( sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='RepeatDataset', type='RepeatDataset',
times=5, times=5,
dataset=dict( dataset=dict(
type=dataset_type, type=dataset_type,
data_root=data_root, data_root=data_root,
ann_file=data_root + 'scannet_infos_train.pkl', ann_file='scannet_infos_train.pkl',
pipeline=train_pipeline, pipeline=train_pipeline,
filter_empty_gt=False, filter_empty_gt=False,
classes=class_names, metainfo=metainfo,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset. # and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='Depth')), box_type_3d='Depth')))
val=dict(
val_dataloader = dict(
batch_size=1,
num_workers=1,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type, type=dataset_type,
data_root=data_root, data_root=data_root,
ann_file=data_root + 'scannet_infos_val.pkl', ann_file='scannet_infos_val.pkl',
pipeline=test_pipeline, pipeline=test_pipeline,
classes=class_names, metainfo=metainfo,
test_mode=True, test_mode=True,
box_type_3d='Depth'), box_type_3d='Depth'))
test=dict( test_dataloader = dict(
batch_size=1,
num_workers=1,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type, type=dataset_type,
data_root=data_root, data_root=data_root,
ann_file=data_root + 'scannet_infos_val.pkl', ann_file='scannet_infos_val.pkl',
pipeline=test_pipeline, pipeline=test_pipeline,
classes=class_names, metainfo=metainfo,
test_mode=True, test_mode=True,
box_type_3d='Depth')) box_type_3d='Depth'))
val_evaluator = dict(type='IndoorMetric')
evaluation = dict(pipeline=eval_pipeline) test_evaluator = val_evaluator
default_scope = 'mmdet3d' default_scope = 'mmdet3d'
default_hooks = dict( default_hooks = dict(
optimizer=dict(type='OptimizerHook', grad_clip=None),
timer=dict(type='IterTimerHook'), timer=dict(type='IterTimerHook'),
logger=dict(type='LoggerHook', interval=50), logger=dict(type='LoggerHook', interval=50),
param_scheduler=dict(type='ParamSchedulerHook'), param_scheduler=dict(type='ParamSchedulerHook'),
......
model = dict( model = dict(
type='VoteNet', type='VoteNet',
data_preprocessor=dict(type='Det3DDataPreprocessor'),
backbone=dict( backbone=dict(
type='PointNet2SASSG', type='PointNet2SASSG',
in_channels=4, in_channels=4,
...@@ -40,10 +41,8 @@ model = dict( ...@@ -40,10 +41,8 @@ model = dict(
normalize_xyz=True), normalize_xyz=True),
pred_layer_cfg=dict( pred_layer_cfg=dict(
in_channels=128, shared_conv_channels=(128, 128), bias=True), in_channels=128, shared_conv_channels=(128, 128), bias=True),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
objectness_loss=dict( objectness_loss=dict(
type='CrossEntropyLoss', type='mmdet.CrossEntropyLoss',
class_weight=[0.2, 0.8], class_weight=[0.2, 0.8],
reduction='sum', reduction='sum',
loss_weight=5.0), loss_weight=5.0),
...@@ -54,20 +53,21 @@ model = dict( ...@@ -54,20 +53,21 @@ model = dict(
loss_src_weight=10.0, loss_src_weight=10.0,
loss_dst_weight=10.0), loss_dst_weight=10.0),
dir_class_loss=dict( dir_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0),
dir_res_loss=dict( dir_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0), type='mmdet.SmoothL1Loss', reduction='sum', loss_weight=10.0),
size_class_loss=dict( size_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0),
size_res_loss=dict( size_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0 / 3.0), type='mmdet.SmoothL1Loss', reduction='sum',
loss_weight=10.0 / 3.0),
semantic_loss=dict( semantic_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)), type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
# model training and testing settings # model training and testing settings
train_cfg=dict( train_cfg=dict(
pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote'), pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mode='vote'),
test_cfg=dict( test_cfg=dict(
sample_mod='seed', sample_mode='seed',
nms_thr=0.25, nms_thr=0.25,
score_thr=0.05, score_thr=0.05,
per_class_proposal=True)) per_class_proposal=True))
...@@ -3,22 +3,47 @@ ...@@ -3,22 +3,47 @@
# interval to be 20. Please change the interval accordingly if you do not # interval to be 20. Please change the interval accordingly if you do not
# use a default schedule. # use a default schedule.
# optimizer # optimizer
lr = 1e-4
iter_num_in_epoch = 3712
# This schedule is mainly used by models on nuScenes dataset # This schedule is mainly used by models on nuScenes dataset
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=0.01)
# max_norm=10 is better for SECOND # max_norm=10 is better for SECOND
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) optim_wrapper = dict(
lr_config = dict( type='OptimWrapper',
policy='cyclic', optimizer=dict(type='AdamW', lr=lr, weight_decay=0.01),
target_ratio=(10, 1e-4), clip_grad=dict(max_norm=35, norm_type=2))
cyclic_times=1, # learning rate
step_ratio_up=0.4, param_scheduler = [
) dict(
momentum_config = dict( type='CosineAnnealingLR',
policy='cyclic', T_max=8 * iter_num_in_epoch,
target_ratio=(0.85 / 0.95, 1), eta_min=lr * 10,
cyclic_times=1, by_epoch=False,
step_ratio_up=0.4, begin=0,
) end=8 * iter_num_in_epoch),
dict(
type='CosineAnnealingLR',
T_max=12 * iter_num_in_epoch,
eta_min=lr * 1e-4,
by_epoch=False,
begin=8 * iter_num_in_epoch,
end=20 * iter_num_in_epoch),
dict(
type='CosineAnnealingBetas',
T_max=8 * iter_num_in_epoch,
eta_min=0.85 / 0.95,
by_epoch=False,
begin=0,
end=8 * iter_num_in_epoch),
dict(
type='CosineAnnealingBetas',
T_max=12 * iter_num_in_epoch,
eta_min=1,
by_epoch=False,
begin=8 * iter_num_in_epoch,
end=20 * iter_num_in_epoch)
]
# runtime settings # runtime settings
runner = dict(type='EpochBasedRunner', max_epochs=20) train_cfg = dict(by_epoch=True, max_epochs=20)
val_cfg = dict(interval=1)
test_cfg = dict()
# The schedule is usually used by models trained on KITTI dataset # The schedule is usually used by models trained on KITTI dataset
# The learning rate set in the cyclic schedule is the initial learning rate # The learning rate set in the cyclic schedule is the initial learning rate
# rather than the max learning rate. Since the target_ratio is (10, 1e-4), # rather than the max learning rate. Since the target_ratio is (10, 1e-4),
# the learning rate will change from 0.0018 to 0.018, than go to 0.0018*1e-4 # the learning rate will change from 0.0018 to 0.018, than go to 0.0018*1e-4
lr = 0.0018 lr = 0.0018
iter_num_in_epoch = 3712
# The optimizer follows the setting in SECOND.Pytorch, but here we use # The optimizer follows the setting in SECOND.Pytorch, but here we use
# the official AdamW optimizer implemented by PyTorch. # the official AdamW optimizer implemented by PyTorch.
optimizer = dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01) optim_wrapper = dict(
optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) type='OptimWrapper',
# We use cyclic learning rate and momentum schedule following SECOND.Pytorch optimizer=dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01),
# https://github.com/traveller59/second.pytorch/blob/3aba19c9688274f75ebb5e576f65cfe54773c021/torchplus/train/learning_schedules_fastai.py#L69 # noqa clip_grad=dict(max_norm=10, norm_type=2))
# We implement them in mmcv, for more details, please refer to # learning rate
# https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/lr_updater.py#L327 # noqa param_scheduler = [
# https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/momentum_updater.py#L130 # noqa dict(
lr_config = dict( type='CosineAnnealingLR',
policy='cyclic', T_max=16 * iter_num_in_epoch,
target_ratio=(10, 1e-4), eta_min=lr * 10,
cyclic_times=1, by_epoch=False,
step_ratio_up=0.4, begin=0,
) end=16 * iter_num_in_epoch),
momentum_config = dict( dict(
policy='cyclic', type='CosineAnnealingLR',
target_ratio=(0.85 / 0.95, 1), T_max=24 * iter_num_in_epoch,
cyclic_times=1, eta_min=lr * 1e-4,
step_ratio_up=0.4, by_epoch=False,
) begin=16 * iter_num_in_epoch,
end=40 * iter_num_in_epoch),
dict(
type='CosineAnnealingBetas',
T_max=16 * iter_num_in_epoch,
eta_min=0.85 / 0.95,
by_epoch=False,
begin=0,
end=16 * iter_num_in_epoch),
dict(
type='CosineAnnealingBetas',
T_max=24 * iter_num_in_epoch,
eta_min=1,
by_epoch=False,
begin=16 * iter_num_in_epoch,
end=40 * iter_num_in_epoch)
]
# Runtime settings,training schedule for 40e
# Although the max_epochs is 40, this schedule is usually used we # Although the max_epochs is 40, this schedule is usually used we
# RepeatDataset with repeat ratio N, thus the actual max epoch # RepeatDataset with repeat ratio N, thus the actual max epoch
# number could be Nx40 # number could be Nx40
runner = dict(type='EpochBasedRunner', max_epochs=40) train_cfg = dict(by_epoch=True, max_epochs=40)
val_cfg = dict(interval=1)
test_cfg = dict()
...@@ -2,8 +2,24 @@ ...@@ -2,8 +2,24 @@
# This schedule is mainly used by models on indoor dataset, # This schedule is mainly used by models on indoor dataset,
# e.g., VoteNet on SUNRGBD and ScanNet # e.g., VoteNet on SUNRGBD and ScanNet
lr = 0.008 # max learning rate lr = 0.008 # max learning rate
optimizer = dict(type='AdamW', lr=lr, weight_decay=0.01) optim_wrapper = dict(
optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) type='OptimWrapper',
lr_config = dict(policy='step', warmup=None, step=[24, 32]) optimizer=dict(type='AdamW', lr=lr, weight_decay=0.01),
# runtime settings clip_grad=dict(max_norm=10, norm_type=2),
runner = dict(type='EpochBasedRunner', max_epochs=36) )
# Training schedule: 36 epochs with validation every epoch.
# (Not a "1x" schedule — 1x conventionally means 12 epochs; this is the
# 36-epoch indoor schedule used by e.g. VoteNet on SUNRGBD/ScanNet.)
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=36, val_interval=1)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
# Learning rate: step decay (x0.1) at epochs 24 and 32.
param_scheduler = [
    dict(
        type='MultiStepLR',
        begin=0,
        end=36,
        by_epoch=True,  # milestones are interpreted as epoch indices
        milestones=[24, 32],
        gamma=0.1)
]
...@@ -31,6 +31,4 @@ model = dict( ...@@ -31,6 +31,4 @@ model = dict(
[1.1511526, 1.0546296, 0.49706793], [1.1511526, 1.0546296, 0.49706793],
[0.47535285, 0.49249494, 0.5802117]]))) [0.47535285, 0.49249494, 0.5802117]])))
# yapf:disable default_hooks = dict(logger=dict(type='LoggerHook', interval=30))
log_config = dict(interval=30)
# yapf:enable
...@@ -51,6 +51,8 @@ class Det3DDataSample(DetDataSample): ...@@ -51,6 +51,8 @@ class Det3DDataSample(DetDataSample):
panoptic segmentation. panoptic segmentation.
- ``pred_pts_panoptic_seg``(PixelData): Predicted of point cloud - ``pred_pts_panoptic_seg``(PixelData): Predicted of point cloud
panoptic segmentation. panoptic segmentation.
- ``eval_ann_info``(dict): Raw annotation, which will be passed to
evaluator and do the online evaluation.
Examples: Examples:
>>> from mmengine.data import InstanceData, PixelData >>> from mmengine.data import InstanceData, PixelData
......
...@@ -205,7 +205,6 @@ def indoor_eval(gt_annos, ...@@ -205,7 +205,6 @@ def indoor_eval(gt_annos,
metric, metric,
label2cat, label2cat,
logger=None, logger=None,
box_type_3d=None,
box_mode_3d=None): box_mode_3d=None):
"""Indoor Evaluation. """Indoor Evaluation.
...@@ -217,11 +216,11 @@ def indoor_eval(gt_annos, ...@@ -217,11 +216,11 @@ def indoor_eval(gt_annos,
includes the following keys includes the following keys
- labels_3d (torch.Tensor): Labels of boxes. - labels_3d (torch.Tensor): Labels of boxes.
- boxes_3d (:obj:`BaseInstance3DBoxes`): - bboxes_3d (:obj:`BaseInstance3DBoxes`):
3D bounding boxes in Depth coordinate. 3D bounding boxes in Depth coordinate.
- scores_3d (torch.Tensor): Scores of boxes. - scores_3d (torch.Tensor): Scores of boxes.
metric (list[float]): IoU thresholds for computing average precisions. metric (list[float]): IoU thresholds for computing average precisions.
label2cat (dict): Map from label to category. label2cat (tuple): Map from label to category.
logger (logging.Logger | str, optional): The way to print the mAP logger (logging.Logger | str, optional): The way to print the mAP
summary. See `mmdet.utils.print_log()` for details. Default: None. summary. See `mmdet.utils.print_log()` for details. Default: None.
...@@ -236,7 +235,7 @@ def indoor_eval(gt_annos, ...@@ -236,7 +235,7 @@ def indoor_eval(gt_annos,
det_anno = dt_annos[img_id] det_anno = dt_annos[img_id]
for i in range(len(det_anno['labels_3d'])): for i in range(len(det_anno['labels_3d'])):
label = det_anno['labels_3d'].numpy()[i] label = det_anno['labels_3d'].numpy()[i]
bbox = det_anno['boxes_3d'].convert_to(box_mode_3d)[i] bbox = det_anno['bboxes_3d'].convert_to(box_mode_3d)[i]
score = det_anno['scores_3d'].numpy()[i] score = det_anno['scores_3d'].numpy()[i]
if label not in pred: if label not in pred:
pred[int(label)] = {} pred[int(label)] = {}
...@@ -250,15 +249,9 @@ def indoor_eval(gt_annos, ...@@ -250,15 +249,9 @@ def indoor_eval(gt_annos,
# parse gt annotations # parse gt annotations
gt_anno = gt_annos[img_id] gt_anno = gt_annos[img_id]
if gt_anno['gt_num'] != 0:
gt_boxes = box_type_3d( gt_boxes = gt_anno['gt_bboxes_3d']
gt_anno['gt_boxes_upright_depth'], labels_3d = gt_anno['gt_labels_3d']
box_dim=gt_anno['gt_boxes_upright_depth'].shape[-1],
origin=(0.5, 0.5, 0.5)).convert_to(box_mode_3d)
labels_3d = gt_anno['class']
else:
gt_boxes = box_type_3d(np.array([], dtype=np.float32))
labels_3d = np.array([], dtype=np.int64)
for i in range(len(labels_3d)): for i in range(len(labels_3d)):
label = labels_3d[i] label = labels_3d[i]
......
...@@ -51,7 +51,7 @@ def merge_aug_bboxes_3d(aug_results, aug_batch_input_metas, test_cfg): ...@@ -51,7 +51,7 @@ def merge_aug_bboxes_3d(aug_results, aug_batch_input_metas, test_cfg):
aug_labels = torch.cat(recovered_labels, dim=0) aug_labels = torch.cat(recovered_labels, dim=0)
# TODO: use a more elegent way to deal with nms # TODO: use a more elegent way to deal with nms
if test_cfg.use_rotate_nms: if test_cfg.get('use_rotate_nms', False):
nms_func = nms_bev nms_func = nms_bev
else: else:
nms_func = nms_normal_bev nms_func = nms_normal_bev
...@@ -83,7 +83,7 @@ def merge_aug_bboxes_3d(aug_results, aug_batch_input_metas, test_cfg): ...@@ -83,7 +83,7 @@ def merge_aug_bboxes_3d(aug_results, aug_batch_input_metas, test_cfg):
merged_labels = torch.cat(merged_labels, dim=0) merged_labels = torch.cat(merged_labels, dim=0)
_, order = merged_scores.sort(0, descending=True) _, order = merged_scores.sort(0, descending=True)
num = min(test_cfg.max_num, len(aug_bboxes)) num = min(test_cfg.get('max_num', 500), len(aug_bboxes))
order = order[:num] order = order[:num]
merged_bboxes = merged_bboxes[order] merged_bboxes = merged_bboxes[order]
......
...@@ -47,10 +47,15 @@ class Det3DDataset(BaseDataset): ...@@ -47,10 +47,15 @@ class Det3DDataset(BaseDataset):
- 'Camera': Box in camera coordinates, usually - 'Camera': Box in camera coordinates, usually
for vision-based 3d detection. for vision-based 3d detection.
filter_empty_gt (bool, optional): Whether to filter the data with filter_empty_gt (bool): Whether to filter the data with
empty GT. Defaults to True. empty GT. Defaults to True.
test_mode (bool, optional): Whether the dataset is in test mode. test_mode (bool): Whether the dataset is in test mode.
Defaults to False. Defaults to False.
load_eval_anns (bool): Whether to load annotations
in test_mode, the annotation will be save in
`eval_ann_infos`, which can be use in Evaluator.
file_client_args (dict): Configuration of file client.
Defaults to `dict(backend='disk')`.
""" """
def __init__(self, def __init__(self,
...@@ -63,11 +68,13 @@ class Det3DDataset(BaseDataset): ...@@ -63,11 +68,13 @@ class Det3DDataset(BaseDataset):
box_type_3d: dict = 'LiDAR', box_type_3d: dict = 'LiDAR',
filter_empty_gt: bool = True, filter_empty_gt: bool = True,
test_mode: bool = False, test_mode: bool = False,
load_eval_anns=True,
file_client_args: dict = dict(backend='disk'), file_client_args: dict = dict(backend='disk'),
**kwargs): **kwargs):
# init file client # init file client
self.file_client = mmcv.FileClient(**file_client_args) self.file_client = mmcv.FileClient(**file_client_args)
self.filter_empty_gt = filter_empty_gt self.filter_empty_gt = filter_empty_gt
self.load_eval_anns = load_eval_anns
_default_modality_keys = ('use_lidar', 'use_camera') _default_modality_keys = ('use_lidar', 'use_camera')
if modality is None: if modality is None:
modality = dict() modality = dict()
...@@ -82,7 +89,6 @@ class Det3DDataset(BaseDataset): ...@@ -82,7 +89,6 @@ class Det3DDataset(BaseDataset):
f', `use_camera`) for {self.__class__.__name__}') f', `use_camera`) for {self.__class__.__name__}')
self.box_type_3d, self.box_mode_3d = get_box_type(box_type_3d) self.box_type_3d, self.box_mode_3d = get_box_type(box_type_3d)
if metainfo is not None and 'CLASSES' in metainfo: if metainfo is not None and 'CLASSES' in metainfo:
# we allow to train on subset of self.METAINFO['CLASSES'] # we allow to train on subset of self.METAINFO['CLASSES']
# map unselected labels to -1 # map unselected labels to -1
...@@ -101,6 +107,10 @@ class Det3DDataset(BaseDataset): ...@@ -101,6 +107,10 @@ class Det3DDataset(BaseDataset):
} }
self.label_mapping[-1] = -1 self.label_mapping[-1] = -1
# can be accessed by other component in runner
metainfo['box_type_3d'] = box_type_3d
metainfo['label_mapping'] = self.label_mapping
super().__init__( super().__init__(
ann_file=ann_file, ann_file=ann_file,
metainfo=metainfo, metainfo=metainfo,
...@@ -221,7 +231,10 @@ class Det3DDataset(BaseDataset): ...@@ -221,7 +231,10 @@ class Det3DDataset(BaseDataset):
self.data_prefix.get('img', ''), img_info['img_path']) self.data_prefix.get('img', ''), img_info['img_path'])
if not self.test_mode: if not self.test_mode:
# used in traing
info['ann_info'] = self.parse_ann_info(info) info['ann_info'] = self.parse_ann_info(info)
if self.test_mode and self.load_eval_anns:
info['eval_ann_info'] = self.parse_ann_info(info)
return info return info
......
# Copyright (c) OpenMMLab. All rights reserved. # Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Union
import numpy as np import numpy as np
from mmcv import BaseTransform from mmcv import BaseTransform
from mmcv.transforms import to_tensor from mmcv.transforms import to_tensor
...@@ -45,14 +47,16 @@ class Pack3DDetInputs(BaseTransform): ...@@ -45,14 +47,16 @@ class Pack3DDetInputs(BaseTransform):
key = key[3:] key = key[3:]
return key return key
def transform(self, results: dict) -> dict: def transform(self, results: Union[dict,
"""Method to pack the input data. List[dict]]) -> Union[dict, List[dict]]:
"""Method to pack the input data. when the value in this dict is a
list, it usually is in Augmentations Testing.
Args: Args:
results (dict): Result dict from the data pipeline. results (dict | list[dict]): Result dict from the data pipeline.
Returns: Returns:
dict: dict | List[dict]:
- 'inputs' (dict): The forward data of models. It usually contains - 'inputs' (dict): The forward data of models. It usually contains
following keys: following keys:
...@@ -63,12 +67,41 @@ class Pack3DDetInputs(BaseTransform): ...@@ -63,12 +67,41 @@ class Pack3DDetInputs(BaseTransform):
- 'data_sample' (obj:`Det3DDataSample`): The annotation info of the - 'data_sample' (obj:`Det3DDataSample`): The annotation info of the
sample. sample.
""" """
packed_results = dict() # augtest
if isinstance(results, list):
pack_results = []
for single_result in results:
pack_results.append(self.pack_single_results(single_result))
return pack_results
# norm training and simple testing
elif isinstance(results, dict):
return self.pack_single_results(results)
else:
raise NotImplementedError
def pack_single_results(self, results):
"""Method to pack the single input data. when the value in this dict is
a list, it usually is in Augmentations Testing.
Args:
results (dict): Result dict from the data pipeline.
Returns:
dict: A dict contains
- 'inputs' (dict): The forward data of models. It usually contains
following keys:
- points
- img
- 'data_sample' (obj:`Det3DDataSample`): The annotation info of the
sample.
"""
# Format 3D data # Format 3D data
if 'points' in results: if 'points' in results:
assert isinstance(results['points'], BasePoints) if isinstance(results['points'], BasePoints):
results['points'] = results['points'].tensor results['points'] = results['points'].tensor
if 'img' in results: if 'img' in results:
if isinstance(results['img'], list): if isinstance(results['img'], list):
...@@ -134,6 +167,12 @@ class Pack3DDetInputs(BaseTransform): ...@@ -134,6 +167,12 @@ class Pack3DDetInputs(BaseTransform):
data_sample.gt_instances_3d = gt_instances_3d data_sample.gt_instances_3d = gt_instances_3d
data_sample.gt_instances = gt_instances data_sample.gt_instances = gt_instances
data_sample.seg_data = seg_data data_sample.seg_data = seg_data
if 'eval_ann_info' in results:
data_sample.eval_ann_info = results['eval_ann_info']
else:
data_sample.eval_ann_info = None
packed_results = dict()
packed_results['data_sample'] = data_sample packed_results['data_sample'] = data_sample
packed_results['inputs'] = inputs packed_results['inputs'] = inputs
......
...@@ -684,6 +684,9 @@ class LoadAnnotations3D(LoadAnnotations): ...@@ -684,6 +684,9 @@ class LoadAnnotations3D(LoadAnnotations):
pts_instance_mask_path, dtype=np.int64) pts_instance_mask_path, dtype=np.int64)
results['pts_instance_mask'] = pts_instance_mask results['pts_instance_mask'] = pts_instance_mask
# 'eval_ann_info' will be passed to evaluator
if 'eval_ann_info' in results:
results['eval_ann_info']['pts_instance_mask'] = pts_instance_mask
return results return results
def _load_semantic_seg_3d(self, results: dict) -> dict: def _load_semantic_seg_3d(self, results: dict) -> dict:
...@@ -710,6 +713,9 @@ class LoadAnnotations3D(LoadAnnotations): ...@@ -710,6 +713,9 @@ class LoadAnnotations3D(LoadAnnotations):
pts_semantic_mask_path, dtype=np.int64) pts_semantic_mask_path, dtype=np.int64)
results['pts_semantic_mask'] = pts_semantic_mask results['pts_semantic_mask'] = pts_semantic_mask
# 'eval_ann_info' will be passed to evaluator
if 'eval_ann_info' in results:
results['eval_ann_info']['pts_semantic_mask'] = pts_semantic_mask
return results return results
def transform(self, results: dict) -> dict: def transform(self, results: dict) -> dict:
......
# Copyright (c) OpenMMLab. All rights reserved. # Copyright (c) OpenMMLab. All rights reserved.
import warnings import warnings
from copy import deepcopy from copy import deepcopy
from typing import Dict, List, Optional, Tuple, Union
import mmcv import mmcv
from mmcv import BaseTransform
from mmengine.dataset import Compose
from mmdet3d.registry import TRANSFORMS from mmdet3d.registry import TRANSFORMS
from .compose import Compose
@TRANSFORMS.register_module() @TRANSFORMS.register_module()
class MultiScaleFlipAug3D(object): class MultiScaleFlipAug3D(BaseTransform):
"""Test-time augmentation with multiple scales and flipping. """Test-time augmentation with multiple scales and flipping.
Args: Args:
...@@ -33,13 +35,13 @@ class MultiScaleFlipAug3D(object): ...@@ -33,13 +35,13 @@ class MultiScaleFlipAug3D(object):
""" """
def __init__(self, def __init__(self,
transforms, transforms: List[dict],
img_scale, img_scale: Optional[Union[Tuple[int], List[Tuple[int]]]],
pts_scale_ratio, pts_scale_ratio: Union[float, List[float]],
flip=False, flip: bool = False,
flip_direction='horizontal', flip_direction: str = 'horizontal',
pcd_horizontal_flip=False, pcd_horizontal_flip: bool = False,
pcd_vertical_flip=False): pcd_vertical_flip: bool = False) -> None:
self.transforms = Compose(transforms) self.transforms = Compose(transforms)
self.img_scale = img_scale if isinstance(img_scale, self.img_scale = img_scale if isinstance(img_scale,
list) else [img_scale] list) else [img_scale]
...@@ -65,17 +67,17 @@ class MultiScaleFlipAug3D(object): ...@@ -65,17 +67,17 @@ class MultiScaleFlipAug3D(object):
warnings.warn( warnings.warn(
'flip has no effect when RandomFlip is not in transforms') 'flip has no effect when RandomFlip is not in transforms')
def __call__(self, results): def transform(self, results: Dict) -> List[Dict]:
"""Call function to augment common fields in results. """Call function to augment common fields in results.
Args: Args:
results (dict): Result dict contains the data to augment. results (dict): Result dict contains the data to augment.
Returns: Returns:
dict: The result dict contains the data that is augmented with List[dict]: The list contains the data that is augmented with
different scales and flips. different scales and flips.
""" """
aug_data = [] aug_data_list = []
# modified from `flip_aug = [False, True] if self.flip else [False]` # modified from `flip_aug = [False, True] if self.flip else [False]`
# to reduce unnecessary scenes when using double flip augmentation # to reduce unnecessary scenes when using double flip augmentation
...@@ -104,13 +106,9 @@ class MultiScaleFlipAug3D(object): ...@@ -104,13 +106,9 @@ class MultiScaleFlipAug3D(object):
_results['pcd_vertical_flip'] = \ _results['pcd_vertical_flip'] = \
pcd_vertical_flip pcd_vertical_flip
data = self.transforms(_results) data = self.transforms(_results)
aug_data.append(data) aug_data_list.append(data)
# list of dict to dict of list
aug_data_dict = {key: [] for key in aug_data[0]} return aug_data_list
for data in aug_data:
for key, val in data.items():
aug_data_dict[key].append(val)
return aug_data_dict
def __repr__(self): def __repr__(self):
"""str: Return a string that describes the module.""" """str: Return a string that describes the module."""
......
# Copyright (c) OpenMMLab. All rights reserved. # Copyright (c) OpenMMLab. All rights reserved.
from .indoor_metric import IndoorMetric # noqa: F401,F403
from .kitti_metric import KittiMetric # noqa: F401,F403 from .kitti_metric import KittiMetric # noqa: F401,F403
__all_ = ['KittiMetric'] __all_ = ['KittiMetric', 'IndoorMetric']
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Dict, List, Optional, Sequence
from mmengine.evaluator import BaseMetric
from mmengine.logging import MMLogger
from mmdet3d.core import get_box_type, indoor_eval
from mmdet3d.registry import METRICS
@METRICS.register_module()
class IndoorMetric(BaseMetric):
    """Indoor scene 3D detection evaluation metric (e.g. ScanNet, SUNRGBD).

    Collects per-sample ground-truth annotations and CPU copies of the
    predicted instances, then evaluates them with :func:`indoor_eval` over
    the IoU thresholds in ``iou_thr``.

    Args:
        iou_thr (list[float]): IoU thresholds used when computing the
            metric. Defaults to [0.25, 0.5].
        collect_device (str): Device name used for collecting results from
            different ranks during distributed training. Must be 'cpu' or
            'gpu'. Defaults to 'cpu'.
        prefix (str, optional): The prefix that will be added in the metric
            names to disambiguate homonymous metrics of different
            evaluators. If prefix is not provided in the argument,
            ``self.default_prefix`` will be used instead. Defaults to None.
    """

    def __init__(self,
                 iou_thr: List[float] = [0.25, 0.5],
                 collect_device: str = 'cpu',
                 prefix: Optional[str] = None,
                 **kwargs):
        # NOTE(review): extra **kwargs are accepted but silently discarded;
        # confirm no caller expects them to reach BaseMetric.
        super().__init__(prefix=prefix, collect_device=collect_device)
        self.iou_thr = iou_thr

    def process(self, data_batch: Sequence[dict],
                predictions: Sequence[dict]) -> None:
        """Process one batch of data samples and predictions.

        The processed results are stored in ``self.results``, which will be
        used to compute the metrics when all batches have been processed.

        Args:
            data_batch (Sequence[dict]): A batch of data from the
                dataloader; each item must carry
                ``data_sample['eval_ann_info']``.
            predictions (Sequence[dict]): A batch of outputs from the model;
                each item must carry ``pred_instances_3d``.
        """
        batch_eval_anns = [
            item['data_sample']['eval_ann_info'] for item in data_batch
        ]
        for eval_ann, pred_dict in zip(batch_eval_anns, predictions):
            pred_3d = pred_dict['pred_instances_3d']
            # Move every tensor-like value to CPU so the results can be
            # collected across ranks without holding GPU memory.
            cpu_pred_3d = {
                k: v.to('cpu') if hasattr(v, 'to') else v
                for k, v in pred_3d.items()
            }
            self.results.append((eval_ann, cpu_pred_3d))

    def compute_metrics(self, results: list) -> Dict[str, float]:
        """Compute the metrics from processed results.

        Args:
            results (list): The processed results of each batch, i.e. a
                list of ``(eval_ann_info, prediction_dict)`` tuples
                appended by :meth:`process`.

        Returns:
            Dict[str, float]: The computed metrics. The keys are the names
            of the metrics, and the values are corresponding results.
        """
        logger: MMLogger = MMLogger.get_current_instance()
        ann_infos = []
        pred_results = []
        for eval_ann, single_pred_results in results:
            ann_infos.append(eval_ann)
            pred_results.append(single_pred_results)
        # The box mode is taken from the dataset metainfo, which the
        # dataset is expected to expose under 'box_type_3d'.
        box_type_3d, box_mode_3d = get_box_type(
            self.dataset_meta['box_type_3d'])
        ret_dict = indoor_eval(
            ann_infos,
            pred_results,
            self.iou_thr,
            self.dataset_meta['CLASSES'],
            logger=logger,
            box_mode_3d=box_mode_3d)
        return ret_dict
# Copyright (c) OpenMMLab. All rights reserved. # Copyright (c) OpenMMLab. All rights reserved.
from numbers import Number from numbers import Number
from typing import Dict, List, Optional, Sequence, Tuple, Union from typing import List, Optional, Sequence, Tuple, Union
import numpy as np import numpy as np
from mmengine.data import BaseDataElement from mmengine.data import BaseDataElement
...@@ -66,19 +66,41 @@ class Det3DDataPreprocessor(DetDataPreprocessor): ...@@ -66,19 +66,41 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
batch_augments=batch_augments) batch_augments=batch_augments)
def forward(self, def forward(self,
data: Sequence[dict], data: List[Union[dict, List[dict]]],
training: bool = False) -> Tuple[Dict, Optional[list]]: training: bool = False
) -> Tuple[Union[dict, List[dict]], Optional[list]]:
"""Perform normalization、padding and bgr2rgb conversion based on """Perform normalization、padding and bgr2rgb conversion based on
``BaseDataPreprocessor``. ``BaseDataPreprocessor``.
Args: Args:
data (Sequence[dict]): data sampled from dataloader. data (List[dict] | List[List[dict]]): data from dataloader.
The outer list always represents the batch size; when it is
a list[list[dict]], the inner list indicates test-time
augmentation.
training (bool): Whether to enable training time augmentation. training (bool): Whether to enable training time augmentation.
Returns: Returns:
Tuple[Dict, Optional[list]]: Data in the same format as the Tuple[Dict, Optional[list]] |
model input. Tuple[List[Dict], Optional[list[list]]]:
Data in the same format as the model input.
""" """
if isinstance(data[0], list):
num_augs = len(data[0])
aug_batch_data = []
aug_batch_data_sample = []
for aug_id in range(num_augs):
single_aug_batch_data, \
single_aug_batch_data_sample = self.simple_process(
[item[aug_id] for item in data], training)
aug_batch_data.append(single_aug_batch_data)
aug_batch_data_sample.append(single_aug_batch_data_sample)
return aug_batch_data, aug_batch_data_sample
else:
return self.simple_process(data, training)
def simple_process(self, data: Sequence[dict], training: bool = False):
inputs_dict, batch_data_samples = self.collate_data(data) inputs_dict, batch_data_samples = self.collate_data(data)
if 'points' in inputs_dict[0].keys(): if 'points' in inputs_dict[0].keys():
......
This diff is collapsed.
# Copyright (c) OpenMMLab. All rights reserved. # Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Union
from mmdet3d.core import Det3DDataSample from mmdet3d.core import Det3DDataSample
from mmdet3d.core.utils import (ForwardResults, InstanceList, OptConfigType, from mmdet3d.core.utils import (ForwardResults, InstanceList, OptConfigType,
OptMultiConfig, OptSampleList, SampleList) OptMultiConfig, OptSampleList, SampleList)
...@@ -24,8 +26,8 @@ class Base3DDetector(BaseDetector): ...@@ -24,8 +26,8 @@ class Base3DDetector(BaseDetector):
super().__init__(data_preprocessor=data_processor, init_cfg=init_cfg) super().__init__(data_preprocessor=data_processor, init_cfg=init_cfg)
def forward(self, def forward(self,
batch_inputs_dict: dict, inputs: Union[dict, List[dict]],
batch_data_samples: OptSampleList = None, data_samples: OptSampleList = None,
mode: str = 'tensor', mode: str = 'tensor',
**kwargs) -> ForwardResults: **kwargs) -> ForwardResults:
"""The unified entry for a forward process in both training and test. """The unified entry for a forward process in both training and test.
...@@ -43,10 +45,19 @@ class Base3DDetector(BaseDetector): ...@@ -43,10 +45,19 @@ class Base3DDetector(BaseDetector):
optimizer updating, which are done in the :meth:`train_step`. optimizer updating, which are done in the :meth:`train_step`.
Args: Args:
batch_inputs (torch.Tensor): The input tensor with shape inputs (dict | list[dict]): When it is a list[dict], the
(N, C, ...) in general. outer list indicate the test time augmentation. Each
batch_data_samples (list[:obj:`DetDataSample`], optional): The dict contains batch inputs
annotation data of every samples. Defaults to None. which include 'points' and 'imgs' keys.
- points (list[torch.Tensor]): Point cloud of each sample.
- imgs (torch.Tensor): Image tensor has shape (B, C, H, W).
data_samples (list[:obj:`DetDataSample`],
list[list[:obj:`DetDataSample`]], optional): The
annotation data of every samples. When it is a list[list], the
outer list indicate the test time augmentation, and the
inter list indicate the batch. Otherwise, the list simply
indicate the batch. Defaults to None.
mode (str): Return what kind of value. Defaults to 'tensor'. mode (str): Return what kind of value. Defaults to 'tensor'.
Returns: Returns:
...@@ -57,13 +68,20 @@ class Base3DDetector(BaseDetector): ...@@ -57,13 +68,20 @@ class Base3DDetector(BaseDetector):
- If ``mode="loss"``, return a dict of tensor. - If ``mode="loss"``, return a dict of tensor.
""" """
if mode == 'loss': if mode == 'loss':
return self.loss(batch_inputs_dict, batch_data_samples, **kwargs) return self.loss(inputs, data_samples, **kwargs)
elif mode == 'predict': elif mode == 'predict':
return self.predict(batch_inputs_dict, batch_data_samples, if isinstance(data_samples[0], list):
**kwargs) # aug test
assert len(data_samples[0]) == 1, 'Only support ' \
'batch_size 1 ' \
'in mmdet3d when ' \
'do the test' \
'time augmentation.'
return self.aug_test(inputs, data_samples, **kwargs)
else:
return self.predict(inputs, data_samples, **kwargs)
elif mode == 'tensor': elif mode == 'tensor':
return self._forward(batch_inputs_dict, batch_data_samples, return self._forward(inputs, data_samples, **kwargs)
**kwargs)
else: else:
raise RuntimeError(f'Invalid mode "{mode}". ' raise RuntimeError(f'Invalid mode "{mode}". '
'Only supports loss, predict and tensor mode') 'Only supports loss, predict and tensor mode')
......
# Copyright (c) OpenMMLab. All rights reserved. # Copyright (c) OpenMMLab. All rights reserved.
import torch from typing import Dict, List, Optional, Union
from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d from mmengine import InstanceData
from torch import Tensor
from mmdet3d.core import Det3DDataSample, merge_aug_bboxes_3d
from mmdet3d.registry import MODELS from mmdet3d.registry import MODELS
from .single_stage import SingleStage3DDetector from .single_stage import SingleStage3DDetector
@MODELS.register_module() @MODELS.register_module()
class VoteNet(SingleStage3DDetector): class VoteNet(SingleStage3DDetector):
r"""`VoteNet <https://arxiv.org/pdf/1904.09664.pdf>`_ for 3D detection.""" r"""`VoteNet <https://arxiv.org/pdf/1904.09664.pdf>`_ for 3D detection.
Args:
backbone (dict): Config dict of detector's backbone.
bbox_head (dict, optional): Config dict of box head. Defaults to None.
train_cfg (dict, optional): Config dict of training hyper-parameters.
Defaults to None.
test_cfg (dict, optional): Config dict of test hyper-parameters.
Defaults to None.
init_cfg (dict, optional): the config to control the
initialization. Default to None.
data_preprocessor (dict or ConfigDict, optional): The pre-process
config of :class:`BaseDataPreprocessor`. it usually includes,
``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``.
"""
def __init__(self, def __init__(self,
backbone, backbone: dict,
bbox_head=None, bbox_head: Optional[dict] = None,
train_cfg=None, train_cfg: Optional[dict] = None,
test_cfg=None, test_cfg: Optional[dict] = None,
init_cfg=None, init_cfg: Optional[dict] = None,
pretrained=None): data_preprocessor: Optional[dict] = None,
**kwargs):
super(VoteNet, self).__init__( super(VoteNet, self).__init__(
backbone=backbone, backbone=backbone,
bbox_head=bbox_head, bbox_head=bbox_head,
train_cfg=train_cfg, train_cfg=train_cfg,
test_cfg=test_cfg, test_cfg=test_cfg,
init_cfg=None, init_cfg=init_cfg,
pretrained=pretrained) data_preprocessor=data_preprocessor,
**kwargs)
def forward_train(self,
points,
img_metas,
gt_bboxes_3d,
gt_labels_3d,
pts_semantic_mask=None,
pts_instance_mask=None,
gt_bboxes_ignore=None):
"""Forward of training.
def loss(self, batch_inputs_dict: Dict[str, Union[List, Tensor]],
batch_data_samples: List[Det3DDataSample],
**kwargs) -> List[Det3DDataSample]:
"""
Args: Args:
points (list[torch.Tensor]): Points of each batch. batch_inputs_dict (dict): The model input dict which include
img_metas (list): Image metas. 'points' keys.
gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): gt bboxes of each batch.
gt_labels_3d (list[torch.Tensor]): gt class labels of each batch. - points (list[torch.Tensor]): Point cloud of each sample.
pts_semantic_mask (list[torch.Tensor]): point-wise semantic
label of each batch. batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
pts_instance_mask (list[torch.Tensor]): point-wise instance Samples. It usually includes information such as
label of each batch. `gt_instance_3d`.
gt_bboxes_ignore (list[torch.Tensor]): Specify
which bounding.
Returns: Returns:
dict: Losses. dict[str, Tensor]: A dictionary of loss components.
""" """
points_cat = torch.stack(points) feat_dict = self.extract_feat(batch_inputs_dict)
points = batch_inputs_dict['points']
x = self.extract_feat(points_cat) losses = self.bbox_head.loss(points, feat_dict, batch_data_samples,
bbox_preds = self.bbox_head(x, self.train_cfg.sample_mod) **kwargs)
loss_inputs = (points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask,
pts_instance_mask, img_metas)
losses = self.bbox_head.loss(
bbox_preds, *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
return losses return losses
def simple_test(self, points, img_metas, imgs=None, rescale=False): def predict(self, batch_inputs_dict: Dict[str, Optional[Tensor]],
batch_data_samples: List[Det3DDataSample],
**kwargs) -> List[Det3DDataSample]:
"""Forward of testing. """Forward of testing.
Args: Args:
points (list[torch.Tensor]): Points of each sample. batch_inputs_dict (dict): The model input dict which include
img_metas (list): Image metas. 'points' keys.
rescale (bool): Whether to rescale results.
- points (list[torch.Tensor]): Point cloud of each sample.
batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
Samples. It usually includes information such as
`gt_instance_3d`.
Returns: Returns:
list: Predicted 3d boxes. list[:obj:`Det3DDataSample`]: Detection results of the
input sample. Each Det3DDataSample usually contain
'pred_instances_3d'. And the ``pred_instances_3d`` usually
contains following keys.
- scores_3d (Tensor): Classification scores, has a shape
(num_instances, )
- labels_3d (Tensor): Labels of bboxes, has a shape
(num_instances, ).
- bboxes_3d (:obj:`BaseInstance3DBoxes`): Prediction of bboxes,
contains a tensor with shape (num_instances, 7).
""" """
points_cat = torch.stack(points) feats_dict = self.extract_feat(batch_inputs_dict)
points = batch_inputs_dict['points']
x = self.extract_feat(points_cat) results_list = self.bbox_head.predict(points, feats_dict,
bbox_preds = self.bbox_head(x, self.test_cfg.sample_mod) batch_data_samples, **kwargs)
bbox_list = self.bbox_head.get_bboxes( data_3d_samples = self.convert_to_datasample(results_list)
points_cat, bbox_preds, img_metas, rescale=rescale) return data_3d_samples
bbox_results = [
bbox3d2result(bboxes, scores, labels)
for bboxes, scores, labels in bbox_list
]
return bbox_results
def aug_test(self, points, img_metas, imgs=None, rescale=False):
"""Test with augmentation."""
points_cat = [torch.stack(pts) for pts in points]
feats = self.extract_feats(points_cat, img_metas)
# only support aug_test for one sample
aug_bboxes = []
for x, pts_cat, img_meta in zip(feats, points_cat, img_metas):
bbox_preds = self.bbox_head(x, self.test_cfg.sample_mod)
bbox_list = self.bbox_head.get_bboxes(
pts_cat, bbox_preds, img_meta, rescale=rescale)
bbox_list = [
dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels)
for bboxes, scores, labels in bbox_list
]
aug_bboxes.append(bbox_list[0])
def aug_test(self, aug_inputs_list: List[dict],
aug_data_samples: List[List[dict]], **kwargs):
"""Test with augmentation.
Batch size is always 1 when doing the aug test.
Args:
aug_inputs_list (List[dict]): The list indicates the same data
under different augmentations.
aug_data_samples (List[List[dict]]): The outer list
indicates different augmentations, and the inner
list indicates the batch size.
"""
num_augs = len(aug_inputs_list)
if num_augs == 1:
return self.predict(aug_inputs_list[0], aug_data_samples[0])
batch_size = len(aug_data_samples[0])
assert batch_size == 1
multi_aug_results = []
for aug_id in range(num_augs):
batch_inputs_dict = aug_inputs_list[aug_id]
batch_data_samples = aug_data_samples[aug_id]
feats_dict = self.extract_feat(batch_inputs_dict)
points = batch_inputs_dict['points']
results_list = self.bbox_head.predict(points, feats_dict,
batch_data_samples, **kwargs)
multi_aug_results.append(results_list[0])
aug_input_metas_list = []
for aug_index in range(num_augs):
metainfo = aug_data_samples[aug_id][0].metainfo
aug_input_metas_list.append(metainfo)
aug_results_list = [item.to_dict() for item in multi_aug_results]
# after merging, bboxes will be rescaled to the original image size # after merging, bboxes will be rescaled to the original image size
merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas, merged_results_dict = merge_aug_bboxes_3d(aug_results_list,
self.bbox_head.test_cfg) aug_input_metas_list,
self.bbox_head.test_cfg)
return [merged_bboxes] merged_results = InstanceData(**merged_results_dict)
data_3d_samples = self.convert_to_datasample([merged_results])
return data_3d_samples
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment