"vscode:/vscode.git/clone" did not exist on "407df37dfb109b8e2410e9afb0b72615aa4d3b07"
Commit f63a62b8 authored by zhangshilong, committed by ChaimZhu

[Refactor] ImVoteNet

parent edb6b369
......@@ -2,6 +2,9 @@ dataset_type = 'SUNRGBDDataset'
data_root = 'data/sunrgbd/'
class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser',
'night_stand', 'bookshelf', 'bathtub')
metainfo = dict(CLASSES=class_names)
train_pipeline = [
dict(
type='LoadPointsFromFile',
......@@ -21,8 +24,9 @@ train_pipeline = [
scale_ratio_range=[0.85, 1.15],
shift_height=True),
dict(type='PointSample', num_points=20000),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
dict(
type='Pack3DDetInputs',
keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(
......@@ -47,61 +51,52 @@ test_pipeline = [
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
),
dict(type='PointSample', num_points=20000),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
# construct a pipeline for data and gt loading in show function
# please keep its loading function consistent with test_pipeline (e.g. client)
eval_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
shift_height=False,
load_dim=6,
use_dim=[0, 1, 2]),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
dict(type='PointSample', num_points=20000)
]),
dict(type='Pack3DDetInputs', keys=['points'])
]
data = dict(
samples_per_gpu=16,
workers_per_gpu=4,
train=dict(
train_dataloader = dict(
batch_size=16,
num_workers=4,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='RepeatDataset',
times=5,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'sunrgbd_infos_train.pkl',
ann_file='sunrgbd_infos_train.pkl',
pipeline=train_pipeline,
classes=class_names,
filter_empty_gt=False,
metainfo=metainfo,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='Depth')),
val=dict(
box_type_3d='Depth')))
val_dataloader = dict(
batch_size=1,
num_workers=1,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'sunrgbd_infos_val.pkl',
ann_file='sunrgbd_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
metainfo=metainfo,
test_mode=True,
box_type_3d='Depth'),
test=dict(
box_type_3d='Depth'))
test_dataloader = dict(
batch_size=1,
num_workers=1,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'sunrgbd_infos_val.pkl',
ann_file='sunrgbd_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
metainfo=metainfo,
test_mode=True,
box_type_3d='Depth'))
evaluation = dict(pipeline=eval_pipeline)
val_evaluator = dict(type='IndoorMetric')
test_evaluator = val_evaluator
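For orientation, a minimal sketch of loading the refactored dataset settings with MMEngine (the config path below is an assumption; the printed values are the ones configured above):
from mmengine.config import Config
cfg = Config.fromfile('configs/_base_/datasets/sunrgbd-3d-10class.py')  # assumed path
print(cfg.train_dataloader.batch_size)  # 16, as configured above
print(cfg.val_evaluator)                # {'type': 'IndoorMetric'}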
model = dict(
type='ImVoteNet',
data_preprocessor=dict(
type='Det3DDataPreprocessor',
# use caffe img_norm
mean=[103.530, 116.280, 123.675],
std=[1.0, 1.0, 1.0],
bgr_to_rgb=False,
pad_size_divisor=32),
img_backbone=dict(
type='ResNet',
type='mmdet.ResNet',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
......@@ -10,11 +17,12 @@ model = dict(
norm_eval=True,
style='caffe'),
img_neck=dict(
type='FPN',
type='mmdet.FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=5),
img_rpn_head=dict(
_scope_='mmdet',
type='RPNHead',
in_channels=256,
feat_channels=256,
......@@ -31,6 +39,7 @@ model = dict(
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
img_roi_head=dict(
_scope_='mmdet',
type='StandardRoIHead',
bbox_roi_extractor=dict(
type='SingleRoIExtractor',
......@@ -56,6 +65,7 @@ model = dict(
train_cfg=dict(
img_rpn=dict(
assigner=dict(
_scope_='mmdet',
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.3,
......@@ -63,7 +73,7 @@ model = dict(
match_low_quality=True,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
type='mmdet.RandomSampler',
num=256,
pos_fraction=0.5,
neg_pos_ub=-1,
......@@ -80,6 +90,7 @@ model = dict(
min_bbox_size=0),
img_rcnn=dict(
assigner=dict(
_scope_='mmdet',
type='MaxIoUAssigner',
pos_iou_thr=0.5,
neg_iou_thr=0.5,
......@@ -87,7 +98,7 @@ model = dict(
match_low_quality=False,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
type='mmdet.RandomSampler',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
......
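The model settings above now reference mmdet components through scoped type names ('mmdet.ResNet', 'mmdet.FPN') and `_scope_='mmdet'` keys. A minimal sketch of instantiating such a config, following the pattern of the unit tests further below (the config path is assumed):
from mmengine import DefaultScope
from mmengine.config import Config
from mmdet3d.registry import MODELS
DefaultScope.get_instance('demo', scope_name='mmdet3d')  # default registry scope, as in the tests below
cfg = Config.fromfile('configs/_base_/models/imvotenet_image.py')  # assumed path
model = MODELS.build(cfg.model)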
......@@ -3,56 +3,71 @@ _base_ = [
'../_base_/models/imvotenet_image.py'
]
# use caffe img_norm
img_norm_cfg = dict(
mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True),
dict(
type='Resize',
img_scale=[(1333, 480), (1333, 504), (1333, 528), (1333, 552),
(1333, 576), (1333, 600)],
multiscale_mode='value',
type='LoadAnnotations3D',
with_bbox=True,
with_label=True,
with_bbox_3d=False,
with_label_3d=False),
dict(
type='RandomChoiceResize',
scales=[(1333, 480), (1333, 504), (1333, 528), (1333, 552),
(1333, 576), (1333, 600)],
keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
dict(type='RandomFlip', prob=0.5),
dict(
type='Pack3DDetInputs', keys=['img', 'gt_bboxes', 'gt_bboxes_labels']),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
# online evaluation
dict(
type='LoadAnnotations3D',
with_bbox=True,
with_label=True,
with_bbox_3d=False,
with_label_3d=False),
dict(type='Resize', scale=(1333, 600), keep_ratio=True),
dict(
type='MultiScaleFlipAug',
img_scale=(1333, 600),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
type='Pack3DDetInputs',
keys=(['img']),
meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
'scale_factor'))
]
train_dataloader = dict(
batch_size=2,
num_workers=2,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='RepeatDataset', times=1, dataset=dict(pipeline=train_pipeline)))
data = dict(
samples_per_gpu=2,
workers_per_gpu=2,
train=dict(times=1, dataset=dict(pipeline=train_pipeline)),
val=dict(pipeline=test_pipeline),
test=dict(pipeline=test_pipeline))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=8, val_interval=1)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
# learning rate
param_scheduler = [
dict(
type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
dict(
type='MultiStepLR',
begin=0,
end=8,
by_epoch=True,
milestones=[6],
gamma=0.1)
]
val_evaluator = dict(type='Indoor2DMetric')
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=None)
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[6])
runner = dict(type='EpochBasedRunner', max_epochs=8)
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
load_from = 'http://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth' # noqa
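Read together, the training settings above mean: with the base lr of 0.01, the LinearLR stage warms the learning rate up from 0.01 * 0.001 = 1e-5 to 0.01 over the first 500 iterations, and MultiStepLR then scales it by gamma=0.1 at epoch 6, so the last two of the 8 epochs run at lr 0.001.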
......@@ -7,10 +7,6 @@ _base_ = [
class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser',
'night_stand', 'bookshelf', 'bathtub')
# use caffe img_norm
img_norm_cfg = dict(
mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
model = dict(
pts_backbone=dict(
type='PointNet2SASSG',
......@@ -48,10 +44,8 @@ model = dict(
[0.76584, 1.398258, 0.472728]]),
pred_layer_cfg=dict(
in_channels=128, shared_conv_channels=(128, 128), bias=True),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
objectness_loss=dict(
type='CrossEntropyLoss',
type='mmdet.CrossEntropyLoss',
class_weight=[0.2, 0.8],
reduction='sum',
loss_weight=5.0),
......@@ -62,15 +56,23 @@ model = dict(
loss_src_weight=10.0,
loss_dst_weight=10.0),
dir_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
type='mmdet.CrossEntropyLoss',
reduction='sum',
loss_weight=1.0),
dir_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
type='mmdet.SmoothL1Loss', reduction='sum', loss_weight=10.0),
size_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
type='mmdet.CrossEntropyLoss',
reduction='sum',
loss_weight=1.0),
size_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0 / 3.0),
type='mmdet.SmoothL1Loss',
reduction='sum',
loss_weight=10.0 / 3.0),
semantic_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
type='mmdet.CrossEntropyLoss',
reduction='sum',
loss_weight=1.0)),
joint=dict(
vote_module_cfg=dict(
in_channels=512,
......@@ -154,11 +156,11 @@ model = dict(
# model training and testing settings
train_cfg=dict(
pts=dict(
pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote')),
pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mode='vote')),
test_cfg=dict(
img_rcnn=dict(score_thr=0.1),
pts=dict(
sample_mod='seed',
sample_mode='seed',
nms_thr=0.25,
score_thr=0.05,
per_class_proposal=True)))
......@@ -171,12 +173,13 @@ train_pipeline = [
load_dim=6,
use_dim=[0, 1, 2]),
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations3D'),
dict(type='LoadAnnotations', with_bbox=True),
dict(type='Resize', img_scale=(1333, 600), keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.0),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(
type='LoadAnnotations3D',
with_bbox=True,
with_label=True,
with_bbox_3d=True,
with_label_3d=True),
dict(type='Resize', scale=(1333, 600), keep_ratio=True),
dict(
type='RandomFlip3D',
sync_2d=False,
......@@ -188,15 +191,13 @@ train_pipeline = [
scale_ratio_range=[0.85, 1.15],
shift_height=True),
dict(type='PointSample', num_points=20000),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
keys=[
'img', 'gt_bboxes', 'gt_labels', 'points', 'gt_bboxes_3d',
type='Pack3DDetInputs',
keys=([
'img', 'gt_bboxes', 'gt_bboxes_labels', 'points', 'gt_bboxes_3d',
'gt_labels_3d'
])
]))
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
......@@ -205,56 +206,15 @@ test_pipeline = [
shift_height=True,
load_dim=6,
use_dim=[0, 1, 2]),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 600),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.0),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
),
dict(type='PointSample', num_points=20000),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['img', 'points'])
]),
]
# construct a pipeline for data and gt loading in show function
# please keep its loading function consistent with test_pipeline (e.g. client)
eval_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
shift_height=False,
load_dim=6,
use_dim=[0, 1, 2]),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['img', 'points'])
dict(type='Resize', scale=(1333, 600), keep_ratio=True),
dict(type='PointSample', num_points=20000),
dict(type='Pack3DDetInputs', keys=['img', 'points'])
]
data = dict(
train=dict(dataset=dict(pipeline=train_pipeline)),
val=dict(pipeline=test_pipeline),
test=dict(pipeline=test_pipeline))
evaluation = dict(pipeline=eval_pipeline)
train_dataloader = dict(dataset=dict(dataset=dict(pipeline=train_pipeline)))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
# may also use your own pre-trained image branch
load_from = 'https://download.openmmlab.com/mmdetection3d/v0.1.0_models/imvotenet/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class_20210323_173222-cad62aeb.pth' # noqa
......@@ -178,10 +178,11 @@ class Det3DDataset(BaseDataset):
dict | None: Processed `ann_info`
"""
# add s or gt prefix for most keys after concat
# we only process 3d annotations here; the corresponding
# 2d annotation processing is done in `LoadAnnotations3D`
# in the pipelines
name_mapping = {
'bbox_label': 'gt_labels',
'bbox_label_3d': 'gt_labels_3d',
'bbox': 'gt_bboxes',
'bbox_3d': 'gt_bboxes_3d',
'depth': 'depths',
'center_2d': 'centers_2d',
......@@ -196,16 +197,18 @@ class Det3DDataset(BaseDataset):
keys = list(instances[0].keys())
ann_info = dict()
for ann_name in keys:
temp_anns = [item[ann_name] for item in instances]
# map the original dataset label to training label
if 'label' in ann_name:
temp_anns = [
self.label_mapping[item] for item in temp_anns
]
temp_anns = np.array(temp_anns)
if ann_name in name_mapping:
temp_anns = [item[ann_name] for item in instances]
# map the original dataset label to training label
if 'label' in ann_name:
temp_anns = [
self.label_mapping[item] for item in temp_anns
]
temp_anns = np.array(temp_anns)
ann_name = name_mapping[ann_name]
ann_info[ann_name] = temp_anns
ann_info[ann_name] = temp_anns
ann_info['instances'] = info['instances']
return ann_info
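# Illustrative example (not part of this commit): with the mapping above, an
# instance dict like dict(bbox_label_3d=2, bbox_3d=[...]) contributes to
# ann_info['gt_labels_3d'] and ann_info['gt_bboxes_3d'], with labels remapped
# through self.label_mapping and the per-instance values stacked into numpy arrays.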
def parse_data_info(self, info: dict) -> dict:
......
......@@ -87,7 +87,8 @@ class KittiDataset(Det3DDataset):
if 'plane' in info:
# convert ground plane to velodyne coordinates
plane = np.array(info['plane'])
lidar2cam = np.array(info['images']['CAM2']['lidar2cam'])
lidar2cam = np.array(
info['images']['CAM2']['lidar2cam'], dtype=np.float32)
reverse = np.linalg.inv(lidar2cam)
(plane_norm_cam, plane_off_cam) = (plane[:3],
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Union
from typing import List, Sequence, Union
import mmcv
import numpy as np
import torch
from mmcv import BaseTransform
from mmcv.transforms import to_tensor
from mmengine import InstanceData
from numpy import dtype
from mmdet3d.core import Det3DDataSample, PointData
from mmdet3d.core.bbox import BaseInstance3DBoxes
......@@ -12,6 +14,38 @@ from mmdet3d.core.points import BasePoints
from mmdet3d.registry import TRANSFORMS
def to_tensor(
data: Union[torch.Tensor, np.ndarray, Sequence, int,
float]) -> torch.Tensor:
"""Convert objects of various python types to :obj:`torch.Tensor`.
Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
:class:`Sequence`, :class:`int` and :class:`float`.
Args:
data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to
be converted.
Returns:
torch.Tensor: the converted data.
"""
if isinstance(data, torch.Tensor):
return data
elif isinstance(data, np.ndarray):
if data.dtype is dtype('float64'):
data = data.astype(np.float32)
return torch.from_numpy(data)
elif isinstance(data, Sequence) and not mmcv.is_str(data):
return torch.tensor(data)
elif isinstance(data, int):
return torch.LongTensor([data])
elif isinstance(data, float):
return torch.FloatTensor([data])
else:
raise TypeError(f'type {type(data)} cannot be converted to tensor.')
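# Usage sketch for the helper above (illustrative only, not added by this commit):
#   to_tensor(np.zeros((2, 3), dtype=np.float64))  # float64 arrays are cast to float32
#   to_tensor([1, 2, 3])                           # non-string sequences go through torch.tensor
#   to_tensor(7)                                   # ints become torch.LongTensor([7])
#   to_tensor(0.5)                                 # floats become torch.FloatTensor([0.5])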
@TRANSFORMS.register_module()
class Pack3DDetInputs(BaseTransform):
INPUTS_KEYS = ['points', 'img']
......@@ -20,7 +54,7 @@ class Pack3DDetInputs(BaseTransform):
]
INSTANCEDATA_2D_KEYS = [
'gt_bboxes',
'gt_labels',
'gt_bboxes_labels',
]
SEG_KEYS = [
......@@ -121,8 +155,8 @@ class Pack3DDetInputs(BaseTransform):
for key in [
'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels',
'gt_labels_3d', 'attr_labels', 'pts_instance_mask',
'pts_semantic_mask', 'centers_2d', 'depths'
'gt_bboxes_labels', 'attr_labels', 'pts_instance_mask',
'pts_semantic_mask', 'centers_2d', 'depths', 'gt_labels_3d'
]:
if key not in results:
continue
......@@ -159,7 +193,10 @@ class Pack3DDetInputs(BaseTransform):
elif key in self.INSTANCEDATA_3D_KEYS:
gt_instances_3d[self._remove_prefix(key)] = results[key]
elif key in self.INSTANCEDATA_2D_KEYS:
gt_instances[self._remove_prefix(key)] = results[key]
if key == 'gt_bboxes_labels':
gt_instances['labels'] = results[key]
else:
gt_instances[self._remove_prefix(key)] = results[key]
elif key in self.SEG_KEYS:
gt_pts_seg[self._remove_prefix(key)] = results[key]
else:
......
......@@ -632,34 +632,6 @@ class LoadAnnotations3D(LoadAnnotations):
self.with_seg_3d = with_seg_3d
self.seg_3d_dtype = seg_3d_dtype
def _load_bboxes(self, results: dict) -> None:
"""Private function to load bounding box annotations.
Rewrite '_load_bboxes` since mmdet3d uses 'parse_anno_info' in
datasets.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict contains loaded bounding box annotations.
"""
results['gt_bboxes'] = results['ann_info']['gt_bboxes']
def _load_labels(self, results: dict) -> None:
"""Private function to load label annotations.
Rewrite '_load_bboxes` since mmdet3d uses 'parse_anno_info' in
datasets.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict contains loaded label annotations.
"""
results['gt_labels'] = results['ann_info']['gt_labels']
def _load_bboxes_3d(self, results: dict) -> dict:
"""Private function to move the 3D bounding box annotation from
`ann_info` field to the root of `results`.
......@@ -769,6 +741,56 @@ class LoadAnnotations3D(LoadAnnotations):
results['eval_ann_info']['pts_semantic_mask'] = pts_semantic_mask
return results
def _load_bboxes(self, results: dict) -> None:
"""Private function to load bounding box annotations.
The only difference is that it removes the processing of
`ignore_flag`.
Args:
results (dict): Result dict from :obj:``mmcv.BaseDataset``.
Returns:
dict: The dict contains loaded bounding box annotations.
"""
gt_bboxes = []
for instance in results['instances']:
gt_bboxes.append(instance['bbox'])
if len(gt_bboxes) == 0:
results['gt_bboxes'] = np.zeros((0, 4), dtype=np.float32)
else:
results['gt_bboxes'] = np.array(
gt_bboxes, dtype=np.float32).reshape((-1, 4))
if self.denorm_bbox:
bbox_num = results['gt_bboxes'].shape[0]
if bbox_num != 0:
h, w = results['img_shape']
results['gt_bboxes'][:, 0::2] *= w
results['gt_bboxes'][:, 1::2] *= h
if 'eval_ann_info' in results:
results['eval_ann_info']['gt_bboxes'] = results['gt_bboxes']
def _load_labels(self, results: dict) -> None:
"""Private function to load label annotations.
Args:
results (dict): Result dict from :obj:``mmcv.BaseDataset``.
Returns:
dict: The dict contains loaded label annotations.
"""
gt_bboxes_labels = []
for instance in results['instances']:
gt_bboxes_labels.append(instance['bbox_label'])
if len(gt_bboxes_labels) == 0:
results['gt_bboxes_labels'] = np.zeros((0, ), dtype=np.int64)
else:
results['gt_bboxes_labels'] = np.array(
gt_bboxes_labels, dtype=np.int64)
if 'eval_ann_info' in results:
results['eval_ann_info']['gt_bboxes_labels'] = results[
'gt_bboxes_labels']
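# For example (illustrative): two instances with bbox_label 0 and 3 yield
# results['gt_bboxes_labels'] = np.array([0, 3], dtype=np.int64); when
# 'eval_ann_info' is present, the same array is mirrored there for evaluation.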
def transform(self, results: dict) -> dict:
"""Function to load multiple types annotations.
......
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
from typing import Callable, List, Optional, Union
import numpy as np
......@@ -22,13 +24,15 @@ class SUNRGBDDataset(Det3DDataset):
ann_file (str): Path of annotation file.
metainfo (dict, optional): Meta information for dataset, such as class
information. Defaults to None.
data_prefix (dict, optional): Prefix for data. Defaults to
data_prefix (dict): Prefix for data. Defaults to
`dict(pts='points',img='sunrgbd_trainval')`.
pipeline (list[dict], optional): Pipeline used for data processing.
Defaults to None.
modality (dict, optional): Modality to specify the sensor data used
as input. Defaults to `dict(use_camera=True, use_lidar=True)`.
box_type_3d (str, optional): Type of 3D box of this dataset.
default_cam_key (str): The default camera name adopted.
Defaults to "CAM0".
box_type_3d (str): Type of 3D box of this dataset.
Based on the `box_type_3d`, the dataset will encapsulate the box
to its original format then converted them to `box_type_3d`.
Defaults to 'Depth' in this dataset. Available options includes
......@@ -36,9 +40,9 @@ class SUNRGBDDataset(Det3DDataset):
- 'LiDAR': Box in LiDAR coordinates.
- 'Depth': Box in depth coordinates, usually for indoor dataset.
- 'Camera': Box in camera coordinates.
filter_empty_gt (bool, optional): Whether to filter empty GT.
filter_empty_gt (bool): Whether to filter empty GT.
Defaults to True.
test_mode (bool, optional): Whether the dataset is in test mode.
test_mode (bool): Whether the dataset is in test mode.
Defaults to False.
"""
METAINFO = {
......@@ -51,8 +55,9 @@ class SUNRGBDDataset(Det3DDataset):
ann_file: str,
metainfo: Optional[dict] = None,
data_prefix: dict = dict(
pts='points', img='sunrgbd_trainval'),
pts='points', img='sunrgbd_trainval/image'),
pipeline: List[Union[dict, Callable]] = [],
default_cam_key: str = 'CAM0',
modality=dict(use_camera=True, use_lidar=True),
box_type_3d: str = 'Depth',
filter_empty_gt: bool = True,
......@@ -64,6 +69,7 @@ class SUNRGBDDataset(Det3DDataset):
metainfo=metainfo,
data_prefix=data_prefix,
pipeline=pipeline,
default_cam_key=default_cam_key,
modality=modality,
box_type_3d=box_type_3d,
filter_empty_gt=filter_empty_gt,
......@@ -73,6 +79,47 @@ class SUNRGBDDataset(Det3DDataset):
'use_lidar' in self.modality
assert self.modality['use_camera'] or self.modality['use_lidar']
def parse_data_info(self, info: dict) -> dict:
"""Process the raw data info.
Convert all relative paths of the needed modality data files to
absolute paths, and process the `instances` field into
`ann_info` in the training stage.
Args:
info (dict): Raw info dict.
Returns:
dict: Data info with `ann_info` in the training stage, and with
all paths converted to absolute paths.
"""
if self.modality['use_lidar']:
info['lidar_points']['lidar_path'] = \
osp.join(
self.data_prefix.get('pts', ''),
info['lidar_points']['lidar_path'])
if self.modality['use_camera']:
for cam_id, img_info in info['images'].items():
if 'img_path' in img_info:
img_info['img_path'] = osp.join(
self.data_prefix.get('img', ''), img_info['img_path'])
if self.default_cam_key is not None:
info['img_path'] = info['images'][
self.default_cam_key]['img_path']
info['depth2img'] = np.array(
info['images'][self.default_cam_key]['depth2img'],
dtype=np.float32)
if not self.test_mode:
# used in training
info['ann_info'] = self.parse_ann_info(info)
if self.test_mode and self.load_eval_anns:
info['eval_ann_info'] = self.parse_ann_info(info)
return info
def parse_ann_info(self, info: dict) -> dict:
"""Process the `instances` in data info to `ann_info`
......@@ -83,12 +130,11 @@ class SUNRGBDDataset(Det3DDataset):
dict: Processed `ann_info`
"""
ann_info = super().parse_ann_info(info)
# empty gt
# process data without any annotations
if ann_info is None:
ann_info = dict()
ann_info['gt_bboxes_3d'] = np.zeros((0, 6), dtype=np.float32)
ann_info['gt_labels_3d'] = np.zeros((0, ), dtype=np.int64)
# to target box structure
ann_info['gt_bboxes_3d'] = DepthInstance3DBoxes(
ann_info['gt_bboxes_3d'],
......
# Copyright (c) OpenMMLab. All rights reserved.
from collections import OrderedDict
from typing import Dict, List, Optional, Sequence
import numpy as np
from mmengine.evaluator import BaseMetric
from mmengine.logging import MMLogger
from mmdet3d.core import get_box_type, indoor_eval
from mmdet3d.registry import METRICS
from mmdet.core import eval_map
@METRICS.register_module()
class IndoorMetric(BaseMetric):
"""Kitti evaluation metric.
"""Indoor scene evaluation metric.
Args:
iou_thr (list[float]): List of iou threshold when calculate the
......@@ -90,3 +93,91 @@ class IndoorMetric(BaseMetric):
box_mode_3d=box_mode_3d)
return ret_dict
@METRICS.register_module()
class Indoor2DMetric(BaseMetric):
"""indoor 2d predictions evaluation metric.
Args:
iou_thr (list[float]): List of iou threshold when calculate the
metric. Defaults to [0.5].
collect_device (str, optional): Device name used for collecting
results from different ranks during distributed training.
Must be 'cpu' or 'gpu'. Defaults to 'cpu'.
prefix (str): The prefix that will be added in the metric
names to disambiguate homonymous metrics of different evaluators.
If prefix is not provided in the argument, self.default_prefix
will be used instead. Default: None
"""
def __init__(self,
iou_thr: List[float] = [0.5],
collect_device: str = 'cpu',
prefix: Optional[str] = None,
**kwargs):
super(Indoor2DMetric, self).__init__(
prefix=prefix, collect_device=collect_device)
self.iou_thr = iou_thr
def process(self, data_batch: Sequence[dict],
predictions: Sequence[dict]) -> None:
"""Process one batch of data samples and predictions.
The processed results should be stored in ``self.results``,
which will be used to compute the metrics when all batches
have been processed.
Args:
data_batch (Sequence[dict]): A batch of data
from the dataloader.
predictions (Sequence[dict]): A batch of outputs from
the model.
"""
batch_eval_anns = [
item['data_sample']['eval_ann_info'] for item in data_batch
]
for eval_ann, pred_dict in zip(batch_eval_anns, predictions):
pred = pred_dict['pred_instances']
ann = dict(
labels=eval_ann['gt_bboxes_labels'],
bboxes=eval_ann['gt_bboxes'])
pred_bboxes = pred['bboxes'].cpu().numpy()
pred_scores = pred['scores'].cpu().numpy()
pred_labels = pred['labels'].cpu().numpy()
dets = []
for label in range(len(self.dataset_meta['CLASSES'])):
index = np.where(pred_labels == label)[0]
pred_bbox_scores = np.hstack(
[pred_bboxes[index], pred_scores[index].reshape((-1, 1))])
dets.append(pred_bbox_scores)
self.results.append((ann, dets))
def compute_metrics(self, results: list) -> Dict[str, float]:
"""Compute the metrics from processed results.
Args:
results (list): The processed results of each batch.
Returns:
Dict[str, float]: The computed metrics. The keys are the names of
the metrics, and the values are corresponding results.
"""
logger: MMLogger = MMLogger.get_current_instance()
annotations, preds = zip(*results)
eval_results = OrderedDict()
iou_thr_2d = (self.iou_thr) if isinstance(self.iou_thr,
float) else self.iou_thr
for iou_thr_2d_single in iou_thr_2d:
mean_ap, _ = eval_map(
preds,
annotations,
scale_ranges=None,
iou_thr=iou_thr_2d_single,
dataset=self.dataset_meta['CLASSES'],
logger=logger)
eval_results['mAP_' + str(iou_thr_2d_single)] = mean_ap
return eval_results
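A minimal sketch of wiring this metric into a config, matching the image-branch config earlier in this commit (the explicit iou_thr simply restates the default):
val_evaluator = dict(type='Indoor2DMetric', iou_thr=[0.5])
test_evaluator = val_evaluator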
......@@ -754,15 +754,16 @@ class VoteHead(BaseModule):
batch_size = bbox3d.shape[0]
results_list = list()
if use_nms:
for b in range(batch_size):
for batch_index in range(batch_size):
temp_results = InstanceData()
bbox_selected, score_selected, labels = \
self.multiclass_nms_single(obj_scores[b],
sem_scores[b],
bbox3d[b],
stack_points[b, ..., :3],
batch_input_metas[b])
bbox = batch_input_metas[b]['box_type_3d'](
self.multiclass_nms_single(
obj_scores[batch_index],
sem_scores[batch_index],
bbox3d[batch_index],
stack_points[batch_index, ..., :3],
batch_input_metas[batch_index])
bbox = batch_input_metas[batch_index]['box_type_3d'](
bbox_selected,
box_dim=bbox_selected.shape[-1],
with_yaw=self.bbox_coder.with_rot)
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Union
from typing import List, Optional, Union
from mmengine import InstanceData
......@@ -91,8 +91,8 @@ class Base3DDetector(BaseDetector):
def convert_to_datasample(
self,
results_list_3d: InstanceList,
results_list_2d: InstanceList = None,
results_list_3d: Optional[InstanceList] = None,
results_list_2d: Optional[InstanceList] = None,
) -> SampleList:
"""Convert results list to `Det3DDataSample`.
......@@ -128,10 +128,18 @@ class Base3DDetector(BaseDetector):
"""
data_sample_list = []
assert (results_list_2d is not None) or \
(results_list_3d is not None),\
'please pass at least one type of results_list'
if results_list_2d is None:
results_list_2d = [
InstanceData() for _ in range(len(results_list_3d))
]
if results_list_3d is None:
results_list_3d = [
InstanceData() for _ in range(len(results_list_2d))
]
for i in range(len(results_list_3d)):
result = Det3DDataSample()
result.pred_instances_3d = results_list_3d[i]
......
# Copyright (c) OpenMMLab. All rights reserved.
import warnings
import copy
from typing import Dict, List, Optional, Sequence, Tuple, Union
import numpy as np
import torch
from mmengine import InstanceData
from torch import Tensor
from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d
from mmdet3d.core import Det3DDataSample
from mmdet3d.models.utils import MLP
from mmdet3d.registry import MODELS
from .base import Base3DDetector
def sample_valid_seeds(mask, num_sampled_seed=1024):
def sample_valid_seeds(mask: Tensor, num_sampled_seed: int = 1024) -> Tensor:
r"""Randomly sample seeds from all imvotes.
Modified from `<https://github.com/facebookresearch/imvotenet/blob/a8856345146bacf29a57266a2f0b874406fd8823/models/imvotenet.py#L26>`_
......@@ -54,26 +56,34 @@ def sample_valid_seeds(mask, num_sampled_seed=1024):
@MODELS.register_module()
class ImVoteNet(Base3DDetector):
r"""`ImVoteNet <https://arxiv.org/abs/2001.10692>`_ for 3D detection."""
r"""`ImVoteNet <https://arxiv.org/abs/2001.10692>`_ for 3D detection.
ImVoteNet is based on fusing 2D votes in images and 3D votes in point
clouds, explicitly extracting both geometric and semantic features
from the 2D images. It leverages camera parameters to lift these
features to 3D. A multi-tower training scheme further improves the
synergy of 2D-3D feature fusion.
"""
def __init__(self,
pts_backbone=None,
pts_bbox_heads=None,
pts_neck=None,
img_backbone=None,
img_neck=None,
img_roi_head=None,
img_rpn_head=None,
img_mlp=None,
freeze_img_branch=False,
fusion_layer=None,
num_sampled_seed=None,
train_cfg=None,
test_cfg=None,
pretrained=None,
init_cfg=None):
super(ImVoteNet, self).__init__(init_cfg=init_cfg)
pts_backbone: Optional[dict] = None,
pts_bbox_heads: Optional[dict] = None,
pts_neck: Optional[dict] = None,
img_backbone: Optional[dict] = None,
img_neck: Optional[dict] = None,
img_roi_head: Optional[dict] = None,
img_rpn_head: Optional[dict] = None,
img_mlp: Optional[dict] = None,
freeze_img_branch: bool = False,
fusion_layer: Optional[dict] = None,
num_sampled_seed: Optional[int] = None,
train_cfg: Optional[dict] = None,
test_cfg: Optional[dict] = None,
init_cfg: Optional[dict] = None,
**kwargs) -> None:
super(ImVoteNet, self).__init__(init_cfg=init_cfg, **kwargs)
# point branch
if pts_backbone is not None:
......@@ -137,35 +147,8 @@ class ImVoteNet(Base3DDetector):
self.train_cfg = train_cfg
self.test_cfg = test_cfg
if pretrained is None:
img_pretrained = None
pts_pretrained = None
elif isinstance(pretrained, dict):
img_pretrained = pretrained.get('img', None)
pts_pretrained = pretrained.get('pts', None)
else:
raise ValueError(
f'pretrained should be a dict, got {type(pretrained)}')
if self.with_img_backbone:
if img_pretrained is not None:
warnings.warn('DeprecationWarning: pretrained is a deprecated '
'key, please consider using init_cfg.')
self.img_backbone.init_cfg = dict(
type='Pretrained', checkpoint=img_pretrained)
if self.with_img_roi_head:
if img_pretrained is not None:
warnings.warn('DeprecationWarning: pretrained is a deprecated '
'key, please consider using init_cfg.')
self.img_roi_head.init_cfg = dict(
type='Pretrained', checkpoint=img_pretrained)
if self.with_pts_backbone:
if img_pretrained is not None:
warnings.warn('DeprecationWarning: pretrained is a deprecated '
'key, please consider using init_cfg.')
self.pts_backbone.init_cfg = dict(
type='Pretrained', checkpoint=pts_pretrained)
def _forward(self):
raise NotImplementedError
def freeze_img_branch_params(self):
"""Freeze all image branch parameters."""
......@@ -267,28 +250,14 @@ class ImVoteNet(Base3DDetector):
"""Just to inherit from abstract method."""
pass
def extract_img_feat(self, img):
def extract_img_feat(self, img: Tensor) -> Sequence[Tensor]:
"""Directly extract features from the img backbone+neck."""
x = self.img_backbone(img)
if self.with_img_neck:
x = self.img_neck(x)
return x
def extract_img_feats(self, imgs):
"""Extract features from multiple images.
Args:
imgs (list[torch.Tensor]): A list of images. The images are
augmented from the same image but in different ways.
Returns:
list[torch.Tensor]: Features of different images
"""
assert isinstance(imgs, list)
return [self.extract_img_feat(img) for img in imgs]
def extract_pts_feat(self, pts):
def extract_pts_feat(self, pts: Tensor) -> Tuple[Tensor]:
"""Extract features of points."""
x = self.pts_backbone(pts)
if self.with_pts_neck:
......@@ -300,151 +269,93 @@ class ImVoteNet(Base3DDetector):
return (seed_points, seed_features, seed_indices)
def extract_pts_feats(self, pts):
"""Extract features of points from multiple samples."""
assert isinstance(pts, list)
return [self.extract_pts_feat(pt) for pt in pts]
@torch.no_grad()
def extract_bboxes_2d(self,
img,
img_metas,
train=True,
bboxes_2d=None,
**kwargs):
"""Extract bounding boxes from 2d detector.
Args:
img (torch.Tensor): of shape (N, C, H, W) encoding input images.
Typically these should be mean centered and std scaled.
img_metas (list[dict]): Image meta info.
train (bool): train-time or not.
bboxes_2d (list[torch.Tensor]): provided 2d bboxes,
not supported yet.
Return:
list[torch.Tensor]: a list of processed 2d bounding boxes.
def loss(self, batch_inputs_dict: Dict[str, Union[List, Tensor]],
batch_data_samples: List[Det3DDataSample],
**kwargs) -> List[Det3DDataSample]:
"""
if bboxes_2d is None:
x = self.extract_img_feat(img)
proposal_list = self.img_rpn_head.simple_test_rpn(x, img_metas)
rets = self.img_roi_head.simple_test(
x, proposal_list, img_metas, rescale=False)
rets_processed = []
for ret in rets:
tmp = np.concatenate(ret, axis=0)
sem_class = img.new_zeros((len(tmp)))
start = 0
for i, bboxes in enumerate(ret):
sem_class[start:start + len(bboxes)] = i
start += len(bboxes)
ret = img.new_tensor(tmp)
# append class index
ret = torch.cat([ret, sem_class[:, None]], dim=-1)
inds = torch.argsort(ret[:, 4], descending=True)
ret = ret.index_select(0, inds)
# drop half bboxes during training for better generalization
if train:
rand_drop = torch.randperm(len(ret))[:(len(ret) + 1) // 2]
rand_drop = torch.sort(rand_drop)[0]
ret = ret[rand_drop]
rets_processed.append(ret.float())
return rets_processed
else:
rets_processed = []
for ret in bboxes_2d:
if len(ret) > 0 and train:
rand_drop = torch.randperm(len(ret))[:(len(ret) + 1) // 2]
rand_drop = torch.sort(rand_drop)[0]
ret = ret[rand_drop]
rets_processed.append(ret.float())
return rets_processed
def forward_train(self,
points=None,
img=None,
img_metas=None,
gt_bboxes=None,
gt_labels=None,
gt_bboxes_ignore=None,
gt_masks=None,
proposals=None,
bboxes_2d=None,
gt_bboxes_3d=None,
gt_labels_3d=None,
pts_semantic_mask=None,
pts_instance_mask=None,
**kwargs):
"""Forwarding of train for image branch pretrain or stage 2 train.
Args:
points (list[torch.Tensor]): Points of each batch.
img (torch.Tensor): of shape (N, C, H, W) encoding input images.
Typically these should be mean centered and std scaled.
img_metas (list[dict]): list of image and point cloud meta info
dict. For example, keys include 'ori_shape', 'img_norm_cfg',
and 'transformation_3d_flow'. For details on the values of
the keys see `mmdet/datasets/pipelines/formatting.py:Collect`.
gt_bboxes (list[torch.Tensor]): Ground truth bboxes for each image
with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels (list[torch.Tensor]): class indices for each
2d bounding box.
gt_bboxes_ignore (list[torch.Tensor]): specify which
2d bounding boxes can be ignored when computing the loss.
gt_masks (torch.Tensor): true segmentation masks for each
2d bbox, used if the architecture supports a segmentation task.
proposals: override rpn proposals (2d) with custom proposals.
Use when `with_rpn` is False.
bboxes_2d (list[torch.Tensor]): provided 2d bboxes,
not supported yet.
gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): 3d gt bboxes.
gt_labels_3d (list[torch.Tensor]): gt class labels for 3d bboxes.
pts_semantic_mask (list[torch.Tensor]): point-wise semantic
label of each batch.
pts_instance_mask (list[torch.Tensor]): point-wise instance
label of each batch.
batch_inputs_dict (dict): The model input dict, which includes
'points' and 'imgs' keys.
- points (list[torch.Tensor]): Point cloud of each sample.
- imgs (list[torch.Tensor]): Image tensor with shape
(N, C, H ,W).
batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
Samples. It usually includes information such as
`gt_instance_3d`.
Returns:
dict[str, torch.Tensor]: a dictionary of loss components.
dict[str, Tensor]: A dictionary of loss components.
"""
imgs = batch_inputs_dict.get('imgs', None)
points = batch_inputs_dict.get('points', None)
if points is None:
x = self.extract_img_feat(img)
x = self.extract_img_feat(imgs)
losses = dict()
# RPN forward and loss
if self.with_img_rpn:
proposal_cfg = self.train_cfg.get('img_rpn_proposal',
self.test_cfg.img_rpn)
rpn_losses, proposal_list = self.img_rpn_head.forward_train(
x,
img_metas,
gt_bboxes,
gt_labels=None,
gt_bboxes_ignore=gt_bboxes_ignore,
proposal_cfg=proposal_cfg)
rpn_data_samples = copy.deepcopy(batch_data_samples)
# set cat_id of gt_labels to 0 in RPN
for data_sample in rpn_data_samples:
data_sample.gt_instances.labels = \
torch.zeros_like(data_sample.gt_instances.labels)
rpn_losses, rpn_results_list = \
self.img_rpn_head.loss_and_predict(
x, rpn_data_samples,
proposal_cfg=proposal_cfg, **kwargs)
# avoid get same name with roi_head loss
keys = rpn_losses.keys()
for key in keys:
if 'loss' in key and 'rpn' not in key:
rpn_losses[f'rpn_{key}'] = rpn_losses.pop(key)
losses.update(rpn_losses)
else:
proposal_list = proposals
roi_losses = self.img_roi_head.forward_train(
x, img_metas, proposal_list, gt_bboxes, gt_labels,
gt_bboxes_ignore, gt_masks, **kwargs)
assert batch_data_samples[0].get('proposals', None) is not None
# use pre-defined proposals in InstanceData for
# the second stage
# to extract ROI features.
rpn_results_list = [
data_sample.proposals for data_sample in batch_data_samples
]
roi_losses = self.img_roi_head.loss(x, rpn_results_list,
batch_data_samples, **kwargs)
losses.update(roi_losses)
return losses
else:
bboxes_2d = self.extract_bboxes_2d(
img, img_metas, bboxes_2d=bboxes_2d, **kwargs)
with torch.no_grad():
results_2d = self.predict_img_only(
batch_inputs_dict['imgs'],
batch_data_samples,
rescale=False)
# tensor with shape (n, 6); the 6 values are arranged
# as [x1, y1, x2, y2, score, label]
pred_bboxes_with_label_list = []
for single_results in results_2d:
cat_preds = torch.cat(
(single_results.bboxes, single_results.scores[:, None],
single_results.labels[:, None]),
dim=-1)
cat_preds = cat_preds[torch.argsort(
cat_preds[:, 4], descending=True)]
# drop half bboxes during training for better generalization
if self.training:
rand_drop = torch.randperm(
len(cat_preds))[:(len(cat_preds) + 1) // 2]
rand_drop = torch.sort(rand_drop)[0]
cat_preds = cat_preds[rand_drop]
points = torch.stack(points)
seeds_3d, seed_3d_features, seed_indices = \
self.extract_pts_feat(points)
pred_bboxes_with_label_list.append(cat_preds)
img_features, masks = self.fusion_layer(img, bboxes_2d, seeds_3d,
img_metas)
stack_points = torch.stack(points)
seeds_3d, seed_3d_features, seed_indices = \
self.extract_pts_feat(stack_points)
img_metas = [item.metainfo for item in batch_data_samples]
img_features, masks = self.fusion_layer(
imgs, pred_bboxes_with_label_list, seeds_3d, img_metas)
inds = sample_valid_seeds(masks, self.num_sampled_seed)
batch_size, img_feat_size = img_features.shape[:2]
......@@ -476,27 +387,13 @@ class ImVoteNet(Base3DDetector):
seed_features=img_features,
seed_indices=seed_indices)
loss_inputs = (points, gt_bboxes_3d, gt_labels_3d,
pts_semantic_mask, pts_instance_mask, img_metas)
bbox_preds_joints = self.pts_bbox_head_joint(
feat_dict_joint, self.train_cfg.pts.sample_mod)
bbox_preds_pts = self.pts_bbox_head_pts(
feat_dict_pts, self.train_cfg.pts.sample_mod)
bbox_preds_img = self.pts_bbox_head_img(
feat_dict_img, self.train_cfg.pts.sample_mod)
losses_towers = []
losses_joint = self.pts_bbox_head_joint.loss(
bbox_preds_joints,
*loss_inputs,
gt_bboxes_ignore=gt_bboxes_ignore)
losses_pts = self.pts_bbox_head_pts.loss(
bbox_preds_pts,
*loss_inputs,
gt_bboxes_ignore=gt_bboxes_ignore)
losses_img = self.pts_bbox_head_img.loss(
bbox_preds_img,
*loss_inputs,
gt_bboxes_ignore=gt_bboxes_ignore)
points, feat_dict_joint, batch_data_samples)
losses_pts = self.pts_bbox_head_pts.loss(points, feat_dict_pts,
batch_data_samples)
losses_img = self.pts_bbox_head_img.loss(points, feat_dict_img,
batch_data_samples)
losses_towers.append(losses_joint)
losses_towers.append(losses_pts)
losses_towers.append(losses_img)
......@@ -516,267 +413,50 @@ class ImVoteNet(Base3DDetector):
return combined_losses
def forward_test(self,
points=None,
img_metas=None,
img=None,
bboxes_2d=None,
**kwargs):
"""Forwarding of test for image branch pretrain or stage 2 train.
def predict(self, batch_inputs_dict: Dict[str, Optional[Tensor]],
batch_data_samples: List[Det3DDataSample],
**kwargs) -> List[Det3DDataSample]:
"""Forward of testing.
Args:
points (list[list[torch.Tensor]], optional): the outer
list indicates test-time augmentations and the inner
list contains all points in the batch, where each Tensor
should have a shape NxC. Defaults to None.
img_metas (list[list[dict]], optional): the outer list
indicates test-time augs (multiscale, flip, etc.)
and the inner list indicates images in a batch.
Defaults to None.
img (list[list[torch.Tensor]], optional): the outer
list indicates test-time augmentations and inner Tensor
should have a shape NxCxHxW, which contains all images
in the batch. Defaults to None. Defaults to None.
bboxes_2d (list[list[torch.Tensor]], optional):
Provided 2d bboxes, not supported yet. Defaults to None.
Returns:
list[list[torch.Tensor]]|list[dict]: Predicted 2d or 3d boxes.
batch_inputs_dict (dict): The model input dict, which includes
'points' and 'imgs' keys.
- points (list[torch.Tensor]): Point cloud of each sample.
- imgs (list[torch.Tensor]): Tensor of Images.
batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
Samples. It usually includes information such as
`gt_instance_3d`.
"""
points = batch_inputs_dict.get('points', None)
imgs = batch_inputs_dict.get('imgs', None)
if points is None:
for var, name in [(img, 'img'), (img_metas, 'img_metas')]:
if not isinstance(var, list):
raise TypeError(
f'{name} must be a list, but got {type(var)}')
num_augs = len(img)
if num_augs != len(img_metas):
raise ValueError(f'num of augmentations ({len(img)}) '
f'!= num of image meta ({len(img_metas)})')
if num_augs == 1:
# proposals (List[List[Tensor]]): the outer list indicates
# test-time augs (multiscale, flip, etc.) and the inner list
# indicates images in a batch.
# The Tensor should have a shape Px4, where P is the number of
# proposals.
if 'proposals' in kwargs:
kwargs['proposals'] = kwargs['proposals'][0]
return self.simple_test_img_only(
img=img[0], img_metas=img_metas[0], **kwargs)
else:
assert img[0].size(0) == 1, 'aug test does not support ' \
'inference with batch size ' \
f'{img[0].size(0)}'
# TODO: support test augmentation for predefined proposals
assert 'proposals' not in kwargs
return self.aug_test_img_only(
img=img, img_metas=img_metas, **kwargs)
assert imgs is not None
results_2d = self.predict_img_only(imgs, batch_data_samples)
return self.convert_to_datasample(results_list_2d=results_2d)
else:
for var, name in [(points, 'points'), (img_metas, 'img_metas')]:
if not isinstance(var, list):
raise TypeError('{} must be a list, but got {}'.format(
name, type(var)))
num_augs = len(points)
if num_augs != len(img_metas):
raise ValueError(
'num of augmentations ({}) != num of image meta ({})'.
format(len(points), len(img_metas)))
if num_augs == 1:
return self.simple_test(
points[0],
img_metas[0],
img[0],
bboxes_2d=bboxes_2d[0] if bboxes_2d is not None else None,
**kwargs)
else:
return self.aug_test(points, img_metas, img, bboxes_2d,
**kwargs)
def simple_test_img_only(self,
img,
img_metas,
proposals=None,
rescale=False):
r"""Test without augmentation, image network pretrain. May refer to
`<https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/two_stage.py>`_.
Args:
img (torch.Tensor): Should have a shape NxCxHxW, which contains
all images in the batch.
img_metas (list[dict]):
proposals (list[Tensor], optional): override rpn proposals
with custom proposals. Defaults to None.
rescale (bool, optional): Whether or not rescale bboxes to the
original shape of input image. Defaults to False.
Returns:
list[list[torch.Tensor]]: Predicted 2d boxes.
""" # noqa: E501
assert self.with_img_bbox, 'Img bbox head must be implemented.'
assert self.with_img_backbone, 'Img backbone must be implemented.'
assert self.with_img_rpn, 'Img rpn must be implemented.'
assert self.with_img_roi_head, 'Img roi head must be implemented.'
x = self.extract_img_feat(img)
if proposals is None:
proposal_list = self.img_rpn_head.simple_test_rpn(x, img_metas)
else:
proposal_list = proposals
ret = self.img_roi_head.simple_test(
x, proposal_list, img_metas, rescale=rescale)
return ret
def simple_test(self,
points=None,
img_metas=None,
img=None,
bboxes_2d=None,
rescale=False,
**kwargs):
"""Test without augmentation, stage 2.
Args:
points (list[torch.Tensor], optional): Elements in the list
should have a shape NxC, the list indicates all point-clouds
in the batch. Defaults to None.
img_metas (list[dict], optional): List indicates
images in a batch. Defaults to None.
img (torch.Tensor, optional): Should have a shape NxCxHxW,
which contains all images in the batch. Defaults to None.
bboxes_2d (list[torch.Tensor], optional):
Provided 2d bboxes, not supported yet. Defaults to None.
rescale (bool, optional): Whether or not rescale bboxes.
Defaults to False.
Returns:
list[dict]: Predicted 3d boxes.
"""
bboxes_2d = self.extract_bboxes_2d(
img, img_metas, train=False, bboxes_2d=bboxes_2d, **kwargs)
points = torch.stack(points)
seeds_3d, seed_3d_features, seed_indices = \
self.extract_pts_feat(points)
img_features, masks = self.fusion_layer(img, bboxes_2d, seeds_3d,
img_metas)
inds = sample_valid_seeds(masks, self.num_sampled_seed)
batch_size, img_feat_size = img_features.shape[:2]
pts_feat_size = seed_3d_features.shape[1]
inds_img = inds.view(batch_size, 1, -1).expand(-1, img_feat_size, -1)
img_features = img_features.gather(-1, inds_img)
inds = inds % inds.shape[1]
inds_seed_xyz = inds.view(batch_size, -1, 1).expand(-1, -1, 3)
seeds_3d = seeds_3d.gather(1, inds_seed_xyz)
inds_seed_feats = inds.view(batch_size, 1,
-1).expand(-1, pts_feat_size, -1)
seed_3d_features = seed_3d_features.gather(-1, inds_seed_feats)
seed_indices = seed_indices.gather(1, inds)
img_features = self.img_mlp(img_features)
fused_features = torch.cat([seed_3d_features, img_features], dim=1)
feat_dict = dict(
seed_points=seeds_3d,
seed_features=fused_features,
seed_indices=seed_indices)
bbox_preds = self.pts_bbox_head_joint(feat_dict,
self.test_cfg.pts.sample_mod)
bbox_list = self.pts_bbox_head_joint.get_bboxes(
points, bbox_preds, img_metas, rescale=rescale)
bbox_results = [
bbox3d2result(bboxes, scores, labels)
for bboxes, scores, labels in bbox_list
]
return bbox_results
def aug_test_img_only(self, img, img_metas, rescale=False):
r"""Test function with augmentation, image network pretrain. May refer
to `<https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/two_stage.py>`_.
Args:
img (list[list[torch.Tensor]], optional): the outer
list indicates test-time augmentations and inner Tensor
should have a shape NxCxHxW, which contains all images
in the batch. Defaults to None. Defaults to None.
img_metas (list[list[dict]], optional): the outer list
indicates test-time augs (multiscale, flip, etc.)
and the inner list indicates images in a batch.
Defaults to None.
rescale (bool, optional): Whether or not rescale bboxes to the
original shape of input image. If rescale is False, then
returned bboxes and masks will fit the scale of imgs[0].
Defaults to None.
Returns:
list[list[torch.Tensor]]: Predicted 2d boxes.
""" # noqa: E501
assert self.with_img_bbox, 'Img bbox head must be implemented.'
assert self.with_img_backbone, 'Img backbone must be implemented.'
assert self.with_img_rpn, 'Img rpn must be implemented.'
assert self.with_img_roi_head, 'Img roi head must be implemented.'
x = self.extract_img_feats(img)
proposal_list = self.img_rpn_head.aug_test_rpn(x, img_metas)
return self.img_roi_head.aug_test(
x, proposal_list, img_metas, rescale=rescale)
def aug_test(self,
points=None,
img_metas=None,
imgs=None,
bboxes_2d=None,
rescale=False,
**kwargs):
"""Test function with augmentation, stage 2.
Args:
points (list[list[torch.Tensor]], optional): the outer
list indicates test-time augmentations and the inner
list contains all points in the batch, where each Tensor
should have a shape NxC. Defaults to None.
img_metas (list[list[dict]], optional): the outer list
indicates test-time augs (multiscale, flip, etc.)
and the inner list indicates images in a batch.
Defaults to None.
imgs (list[list[torch.Tensor]], optional): the outer
list indicates test-time augmentations and inner Tensor
should have a shape NxCxHxW, which contains all images
in the batch. Defaults to None. Defaults to None.
bboxes_2d (list[list[torch.Tensor]], optional):
Provided 2d bboxes, not supported yet. Defaults to None.
rescale (bool, optional): Whether or not rescale bboxes.
Defaults to False.
Returns:
list[dict]: Predicted 3d boxes.
"""
points_cat = [torch.stack(pts) for pts in points]
feats = self.extract_pts_feats(points_cat, img_metas)
# only support aug_test for one sample
aug_bboxes = []
for x, pts_cat, img_meta, bbox_2d, img in zip(feats, points_cat,
img_metas, bboxes_2d,
imgs):
bbox_2d = self.extract_bboxes_2d(
img, img_metas, train=False, bboxes_2d=bbox_2d, **kwargs)
seeds_3d, seed_3d_features, seed_indices = x
results_2d = self.predict_img_only(
batch_inputs_dict['imgs'], batch_data_samples, rescale=False)
# tensor with shape (n, 6); the 6 values are arranged
# as [x1, y1, x2, y2, score, label]
pred_bboxes_with_label_list = []
for single_results in results_2d:
cat_preds = torch.cat(
(single_results.bboxes, single_results.scores[:, None],
single_results.labels[:, None]),
dim=-1)
cat_preds = cat_preds[torch.argsort(
cat_preds[:, 4], descending=True)]
pred_bboxes_with_label_list.append(cat_preds)
stack_points = torch.stack(points)
seeds_3d, seed_3d_features, seed_indices = \
self.extract_pts_feat(stack_points)
img_features, masks = self.fusion_layer(img, bbox_2d, seeds_3d,
img_metas)
img_features, masks = self.fusion_layer(
imgs, pred_bboxes_with_label_list, seeds_3d,
[item.metainfo for item in batch_data_samples])
inds = sample_valid_seeds(masks, self.num_sampled_seed)
batch_size, img_feat_size = img_features.shape[:2]
......@@ -800,19 +480,57 @@ class ImVoteNet(Base3DDetector):
seed_points=seeds_3d,
seed_features=fused_features,
seed_indices=seed_indices)
bbox_preds = self.pts_bbox_head_joint(feat_dict,
self.test_cfg.pts.sample_mod)
bbox_list = self.pts_bbox_head_joint.get_bboxes(
pts_cat, bbox_preds, img_metas, rescale=rescale)
bbox_list = [
dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels)
for bboxes, scores, labels in bbox_list
results_3d = self.pts_bbox_head_joint.predict(
batch_inputs_dict['points'],
feat_dict,
batch_data_samples,
rescale=True)
return self.convert_to_datasample(results_3d)
def predict_img_only(self,
imgs: Tensor,
batch_data_samples: List[Det3DDataSample],
rescale: bool = True) -> List[InstanceData]:
"""Predict results from a batch of imgs with post- processing.
Args:
imgs (Tensor): Inputs images with shape (N, C, H, W).
batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
Samples. It usually includes information such as
`gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
rescale (bool): Whether to rescale the results.
Defaults to True.
Returns:
list[:obj:`InstanceData`]: Return the list of detection
results of the input images, usually contains following keys.
- scores (Tensor): Classification scores, has a shape
(num_instance, )
- labels (Tensor): Labels of bboxes, has a shape
(num_instances, ).
- bboxes (Tensor): Has a shape (num_instances, 4),
the last dimension 4 arrange as (x1, y1, x2, y2).
"""
assert self.with_img_bbox, 'Img bbox head must be implemented.'
assert self.with_img_backbone, 'Img backbone must be implemented.'
assert self.with_img_rpn, 'Img rpn must be implemented.'
assert self.with_img_roi_head, 'Img roi head must be implemented.'
x = self.extract_img_feat(imgs)
# If there are no pre-defined proposals, use RPN to get proposals
if batch_data_samples[0].get('proposals', None) is None:
rpn_results_list = self.img_rpn_head.predict(
x, batch_data_samples, rescale=False)
else:
rpn_results_list = [
data_sample.proposals for data_sample in batch_data_samples
]
aug_bboxes.append(bbox_list[0])
# after merging, bboxes will be rescaled to the original image size
merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas,
self.bbox_head.test_cfg)
results_list = self.img_roi_head.predict(
x, rpn_results_list, batch_data_samples, rescale=rescale)
return [merged_bboxes]
return results_list
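A minimal sketch of exercising the refactored predict path, following the pattern used by the unit tests below (`data` is assumed to be a list of samples in the format the data preprocessor expects):
batch_inputs, data_samples = model.data_preprocessor(data, True)
results = model.forward(batch_inputs, data_samples, mode='predict')  # list of Det3DDataSample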
......@@ -105,8 +105,8 @@ def extract_2d_info(img_meta, tensor):
"""
img_shape = img_meta['img_shape']
ori_shape = img_meta['ori_shape']
img_h, img_w, _ = img_shape
ori_h, ori_w, _ = ori_shape
img_h, img_w = img_shape
ori_h, ori_w = ori_shape
img_scale_factor = (
tensor.new_tensor(img_meta['scale_factor'][:2])
......
......@@ -45,8 +45,6 @@ class VoteFusion(nn.Module):
seed_num = seed_3d_depth.shape[0]
img_shape = img_meta['img_shape']
img_h, img_w, _ = img_shape
# first reverse the data transformations
xyz_depth = apply_3d_transformation(
seed_3d_depth, 'DEPTH', img_meta, reverse=True)
......
import unittest
import torch
from mmengine import DefaultScope
from mmdet3d.registry import MODELS
from tests.utils.model_utils import (_create_detector_inputs,
_get_detector_cfg, _setup_seed)
class TestH3D(unittest.TestCase):
def test_h3dnet(self):
import mmdet3d.models
assert hasattr(mmdet3d.models, 'H3DNet')
DefaultScope.get_instance('test_H3DNet', scope_name='mmdet3d')
_setup_seed(0)
voxel_net_cfg = _get_detector_cfg(
'h3dnet/h3dnet_3x8_scannet-3d-18class.py')
model = MODELS.build(voxel_net_cfg)
num_gt_instance = 5
data = [
_create_detector_inputs(
num_gt_instance=num_gt_instance,
points_feat_dim=4,
bboxes_3d_type='depth',
with_pts_semantic_mask=True,
with_pts_instance_mask=True)
]
if torch.cuda.is_available():
model = model.cuda()
# test simple_test
with torch.no_grad():
batch_inputs, data_samples = model.data_preprocessor(
data, True)
results = model.forward(
batch_inputs, data_samples, mode='predict')
self.assertEqual(len(results), len(data))
self.assertIn('bboxes_3d', results[0].pred_instances_3d)
self.assertIn('scores_3d', results[0].pred_instances_3d)
self.assertIn('labels_3d', results[0].pred_instances_3d)
# to save memory
with torch.no_grad():
losses = model.forward(batch_inputs, data_samples, mode='loss')
self.assertGreater(losses['vote_loss'], 0)
self.assertGreater(losses['objectness_loss'], 0)
self.assertGreater(losses['center_loss'], 0)
import unittest
import torch
from mmengine import DefaultScope
from mmdet3d.registry import MODELS
from tests.utils.model_utils import (_create_detector_inputs,
_get_detector_cfg, _setup_seed)
class TestImvoteNet(unittest.TestCase):
def test_imvotenet_only_img(self):
import mmdet3d.models
assert hasattr(mmdet3d.models, 'ImVoteNet')
DefaultScope.get_instance('test_imvotenet_img', scope_name='mmdet3d')
_setup_seed(0)
votenet_net_cfg = _get_detector_cfg(
'imvotenet/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class.py'
)
model = MODELS.build(votenet_net_cfg)
data = [
_create_detector_inputs(
with_points=False, with_img=True, img_size=128)
]
if torch.cuda.is_available():
model = model.cuda()
# test simple_test
with torch.no_grad():
batch_inputs, data_samples = model.data_preprocessor(
data, True)
results = model.forward(
batch_inputs, data_samples, mode='predict')
self.assertEqual(len(results), len(data))
self.assertIn('bboxes', results[0].pred_instances)
self.assertIn('scores', results[0].pred_instances)
self.assertIn('labels', results[0].pred_instances)
# to save memory
with torch.no_grad():
losses = model.forward(batch_inputs, data_samples, mode='loss')
self.assertGreater(sum(losses['loss_rpn_cls']), 0)
self.assertGreater(losses['loss_cls'], 0)
self.assertGreater(losses['loss_bbox'], 0)
def test_imvotenet(self):
import mmdet3d.models
assert hasattr(mmdet3d.models, 'ImVoteNet')
DefaultScope.get_instance('test_imvotenet', scope_name='mmdet3d')
_setup_seed(0)
votenet_net_cfg = _get_detector_cfg(
'imvotenet/imvotenet_stage2_16x8_sunrgbd-3d-10class.py')
model = MODELS.build(votenet_net_cfg)
data = [
_create_detector_inputs(
with_points=True,
with_img=True,
img_size=128,
bboxes_3d_type='depth')
]
if torch.cuda.is_available():
model = model.cuda()
# test simple_test
with torch.no_grad():
batch_inputs, data_samples = model.data_preprocessor(
data, True)
results = model.forward(
batch_inputs, data_samples, mode='predict')
self.assertEqual(len(results), len(data))
self.assertIn('bboxes_3d', results[0].pred_instances_3d)
self.assertIn('scores_3d', results[0].pred_instances_3d)
self.assertIn('labels_3d', results[0].pred_instances_3d)
# to save memory
with torch.no_grad():
losses = model.forward(batch_inputs, data_samples, mode='loss')
self.assertGreater(losses['vote_loss'], 0)
self.assertGreater(losses['objectness_loss'], 0)
self.assertGreater(losses['semantic_loss'], 0)
......@@ -75,6 +75,7 @@ def _get_detector_cfg(fname):
def _create_detector_inputs(seed=0,
with_points=True,
with_img=False,
img_size=10,
num_gt_instance=20,
num_points=10,
points_feat_dim=4,
......@@ -90,23 +91,46 @@ def _create_detector_inputs(seed=0,
'depth': DepthInstance3DBoxes,
'cam': CameraInstance3DBoxes
}
meta_info = dict()
meta_info['depth2img'] = np.array(
[[5.23289349e+02, 3.68831943e+02, 6.10469439e+01],
[1.09560138e+02, 1.97404735e+02, -5.47377738e+02],
[1.25930002e-02, 9.92229998e-01, -1.23769999e-01]])
meta_info['lidar2img'] = np.array(
[[5.23289349e+02, 3.68831943e+02, 6.10469439e+01],
[1.09560138e+02, 1.97404735e+02, -5.47377738e+02],
[1.25930002e-02, 9.92229998e-01, -1.23769999e-01]])
if with_points:
points = torch.rand([num_points, points_feat_dim])
else:
points = None
if with_img:
img = torch.rand(3, 10, 10)
img = torch.rand(3, img_size, img_size)
meta_info['img_shape'] = (img_size, img_size)
meta_info['ori_shape'] = (img_size, img_size)
meta_info['scale_factor'] = np.array([1., 1.])
else:
img = None
inputs_dict = dict(img=img, points=points)
gt_instance_3d = InstanceData()
gt_instance_3d.bboxes_3d = bbox_3d_class[bboxes_3d_type](
torch.rand([num_gt_instance, gt_bboxes_dim]), box_dim=gt_bboxes_dim)
gt_instance_3d.labels_3d = torch.randint(0, num_classes, [num_gt_instance])
data_sample = Det3DDataSample(
metainfo=dict(box_type_3d=bbox_3d_class[bboxes_3d_type]))
data_sample.set_metainfo(meta_info)
data_sample.gt_instances_3d = gt_instance_3d
gt_instance = InstanceData()
gt_instance.labels = torch.randint(0, num_classes, [num_gt_instance])
gt_instance.bboxes = torch.rand(num_gt_instance, 4)
gt_instance.bboxes[:,
2:] = gt_instance.bboxes[:, :2] + gt_instance.bboxes[:,
2:]
data_sample.gt_instances = gt_instance
data_sample.gt_pts_seg = PointData()
if with_pts_instance_mask:
pts_instance_mask = torch.randint(0, num_gt_instance, [num_points])
......
......@@ -652,21 +652,28 @@ def update_sunrgbd_infos(pkl_path, out_dir):
temp_data_info['images']['CAM0']['width'] = w
anns = ori_info_dict['annos']
num_instances = len(anns['name'])
ignore_class_name = set()
instance_list = []
for instance_id in range(num_instances):
empty_instance = get_empty_instance()
empty_instance['bbox_3d'] = anns['gt_boxes_upright_depth'][
instance_id].tolist()
if anns['name'][instance_id] in METAINFO['CLASSES']:
empty_instance['bbox_label_3d'] = METAINFO['CLASSES'].index(
anns['name'][instance_id])
else:
ignore_class_name.add(anns['name'][instance_id])
empty_instance['bbox_label_3d'] = -1
empty_instance = clear_instance_unused_keys(empty_instance)
instance_list.append(empty_instance)
if anns['gt_num'] == 0:
instance_list = []
else:
num_instances = len(anns['name'])
ignore_class_name = set()
instance_list = []
for instance_id in range(num_instances):
empty_instance = get_empty_instance()
empty_instance['bbox_3d'] = anns['gt_boxes_upright_depth'][
instance_id].tolist()
empty_instance['bbox'] = anns['bbox'][instance_id].tolist()
if anns['name'][instance_id] in METAINFO['CLASSES']:
empty_instance['bbox_label_3d'] = METAINFO[
'CLASSES'].index(anns['name'][instance_id])
empty_instance['bbox_label'] = empty_instance[
'bbox_label_3d']
else:
ignore_class_name.add(anns['name'][instance_id])
empty_instance['bbox_label_3d'] = -1
empty_instance['bbox_label'] = -1
empty_instance = clear_instance_unused_keys(empty_instance)
instance_list.append(empty_instance)
temp_data_info['instances'] = instance_list
temp_data_info, _ = clear_data_info_unused_keys(temp_data_info)
converted_list.append(temp_data_info)
......