Commit f63a62b8 authored by zhangshilong's avatar zhangshilong Committed by ChaimZhu

[Refactor] ImVoteNet

parent edb6b369
......@@ -2,6 +2,9 @@ dataset_type = 'SUNRGBDDataset'
data_root = 'data/sunrgbd/'
class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser',
'night_stand', 'bookshelf', 'bathtub')
metainfo = dict(CLASSES=class_names)
train_pipeline = [
dict(
type='LoadPointsFromFile',
......@@ -21,8 +24,9 @@ train_pipeline = [
scale_ratio_range=[0.85, 1.15],
shift_height=True),
dict(type='PointSample', num_points=20000),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
dict(
type='Pack3DDetInputs',
keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
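The two formatting transforms of the 0.x pipeline (`DefaultFormatBundle3D` + `Collect3D`) collapse into the single `Pack3DDetInputs` step shown above; schematically:

# Before (mmdet3d 0.x style), two steps:
#   dict(type='DefaultFormatBundle3D', class_names=class_names),
#   dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
# After this refactor, one step packs inputs and annotations together:
dict(
    type='Pack3DDetInputs',
    keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])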
test_pipeline = [
dict(
......@@ -47,61 +51,52 @@ test_pipeline = [
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
),
dict(type='PointSample', num_points=20000),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
# construct a pipeline for data and gt loading in show function
# please keep its loading function consistent with test_pipeline (e.g. client)
eval_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
shift_height=False,
load_dim=6,
use_dim=[0, 1, 2]),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
dict(type='PointSample', num_points=20000)
]),
dict(type='Pack3DDetInputs', keys=['points'])
]
data = dict(
samples_per_gpu=16,
workers_per_gpu=4,
train=dict(
train_dataloader = dict(
batch_size=16,
num_workers=4,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='RepeatDataset',
times=5,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'sunrgbd_infos_train.pkl',
ann_file='sunrgbd_infos_train.pkl',
pipeline=train_pipeline,
classes=class_names,
filter_empty_gt=False,
metainfo=metainfo,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='Depth')),
val=dict(
box_type_3d='Depth')))
val_dataloader = dict(
batch_size=1,
num_workers=1,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'sunrgbd_infos_val.pkl',
ann_file='sunrgbd_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
metainfo=metainfo,
test_mode=True,
box_type_3d='Depth'),
test=dict(
box_type_3d='Depth'))
test_dataloader = dict(
batch_size=1,
num_workers=1,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'sunrgbd_infos_val.pkl',
ann_file='sunrgbd_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
metainfo=metainfo,
test_mode=True,
box_type_3d='Depth'))
evaluation = dict(pipeline=eval_pipeline)
val_evaluator = dict(type='IndoorMetric')
test_evaluator = val_evaluator
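As a quick sanity check, the refactored `train_dataloader` entry can be built directly with mmengine; a minimal sketch, assuming mmdet3d's datasets and transforms are registered under the default scope (the config path below is a placeholder):

from mmengine.config import Config
from mmengine.runner import Runner

cfg = Config.fromfile('configs/_base_/datasets/sunrgbd-3d-10class.py')  # placeholder path
train_loader = Runner.build_dataloader(cfg.train_dataloader)
batch = next(iter(train_loader))  # dict with 'inputs' and 'data_samples' lists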
model = dict(
type='ImVoteNet',
data_preprocessor=dict(
type='Det3DDataPreprocessor',
# use caffe img_norm
mean=[103.530, 116.280, 123.675],
std=[1.0, 1.0, 1.0],
bgr_to_rgb=False,
pad_size_divisor=32),
img_backbone=dict(
type='ResNet',
type='mmdet.ResNet',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
......@@ -10,11 +17,12 @@ model = dict(
norm_eval=True,
style='caffe'),
img_neck=dict(
type='FPN',
type='mmdet.FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=5),
img_rpn_head=dict(
_scope_='mmdet',
type='RPNHead',
in_channels=256,
feat_channels=256,
......@@ -31,6 +39,7 @@ model = dict(
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
img_roi_head=dict(
_scope_='mmdet',
type='StandardRoIHead',
bbox_roi_extractor=dict(
type='SingleRoIExtractor',
......@@ -56,6 +65,7 @@ model = dict(
train_cfg=dict(
img_rpn=dict(
assigner=dict(
_scope_='mmdet',
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.3,
......@@ -63,7 +73,7 @@ model = dict(
match_low_quality=True,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
type='mmdet.RandomSampler',
num=256,
pos_fraction=0.5,
neg_pos_ub=-1,
......@@ -80,6 +90,7 @@ model = dict(
min_bbox_size=0),
img_rcnn=dict(
assigner=dict(
_scope_='mmdet',
type='MaxIoUAssigner',
pos_iou_thr=0.5,
neg_iou_thr=0.5,
......@@ -87,7 +98,7 @@ model = dict(
match_low_quality=False,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
type='mmdet.RandomSampler',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
......
......@@ -3,56 +3,71 @@ _base_ = [
'../_base_/models/imvotenet_image.py'
]
# use caffe img_norm
img_norm_cfg = dict(
mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True),
dict(
type='Resize',
img_scale=[(1333, 480), (1333, 504), (1333, 528), (1333, 552),
type='LoadAnnotations3D',
with_bbox=True,
with_label=True,
with_bbox_3d=False,
with_label_3d=False),
dict(
type='RandomChoiceResize',
scales=[(1333, 480), (1333, 504), (1333, 528), (1333, 552),
(1333, 576), (1333, 600)],
multiscale_mode='value',
keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
dict(type='RandomFlip', prob=0.5),
dict(
type='Pack3DDetInputs', keys=['img', 'gt_bboxes', 'gt_bboxes_labels']),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
# online evaluation
dict(
type='LoadAnnotations3D',
with_bbox=True,
with_label=True,
with_bbox_3d=False,
with_label_3d=False),
dict(type='Resize', scale=(1333, 600), keep_ratio=True),
dict(
type='MultiScaleFlipAug',
img_scale=(1333, 600),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
type='Pack3DDetInputs',
keys=(['img']),
meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
'scale_factor'))
]
train_dataloader = dict(
batch_size=2,
num_workers=2,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='RepeatDataset', times=1, dataset=dict(pipeline=train_pipeline)))
data = dict(
samples_per_gpu=2,
workers_per_gpu=2,
train=dict(times=1, dataset=dict(pipeline=train_pipeline)),
val=dict(pipeline=test_pipeline),
test=dict(pipeline=test_pipeline))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=8, val_interval=1)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
# learning rate
param_scheduler = [
dict(
type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
dict(
type='MultiStepLR',
begin=0,
end=8,
by_epoch=True,
milestones=[6],
gamma=0.1)
]
val_evaluator = dict(type='Indoor2DMetric')
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=None)
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[6])
runner = dict(type='EpochBasedRunner', max_epochs=8)
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
load_from = 'http://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth' # noqa
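The `param_scheduler` pair above replaces the old `lr_config`; an illustrative pure-Python sketch of the resulting schedule (not mmengine's exact implementation):

def lr_at(epoch, it, iters_per_epoch, base_lr=0.01):
    # LinearLR: warm up from 0.001 * base_lr over the first 500 iterations.
    g = epoch * iters_per_epoch + it
    warmup = 0.001 + (1 - 0.001) * min(g, 500) / 500
    # MultiStepLR: decay by 0.1 from epoch 6 (milestones=[6], gamma=0.1).
    decay = 0.1 if epoch >= 6 else 1.0
    return base_lr * warmup * decay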
......@@ -7,10 +7,6 @@ _base_ = [
class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser',
'night_stand', 'bookshelf', 'bathtub')
# use caffe img_norm
img_norm_cfg = dict(
mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
model = dict(
pts_backbone=dict(
type='PointNet2SASSG',
......@@ -48,10 +44,8 @@ model = dict(
[0.76584, 1.398258, 0.472728]]),
pred_layer_cfg=dict(
in_channels=128, shared_conv_channels=(128, 128), bias=True),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
objectness_loss=dict(
type='CrossEntropyLoss',
type='mmdet.CrossEntropyLoss',
class_weight=[0.2, 0.8],
reduction='sum',
loss_weight=5.0),
......@@ -62,15 +56,23 @@ model = dict(
loss_src_weight=10.0,
loss_dst_weight=10.0),
dir_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
type='mmdet.CrossEntropyLoss',
reduction='sum',
loss_weight=1.0),
dir_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
type='mmdet.SmoothL1Loss', reduction='sum', loss_weight=10.0),
size_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
type='mmdet.CrossEntropyLoss',
reduction='sum',
loss_weight=1.0),
size_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0 / 3.0),
type='mmdet.SmoothL1Loss',
reduction='sum',
loss_weight=10.0 / 3.0),
semantic_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
type='mmdet.CrossEntropyLoss',
reduction='sum',
loss_weight=1.0)),
joint=dict(
vote_module_cfg=dict(
in_channels=512,
......@@ -154,11 +156,11 @@ model = dict(
# model training and testing settings
train_cfg=dict(
pts=dict(
pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote')),
pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mode='vote')),
test_cfg=dict(
img_rcnn=dict(score_thr=0.1),
pts=dict(
sample_mod='seed',
sample_mode='seed',
nms_thr=0.25,
score_thr=0.05,
per_class_proposal=True)))
......@@ -171,12 +173,13 @@ train_pipeline = [
load_dim=6,
use_dim=[0, 1, 2]),
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations3D'),
dict(type='LoadAnnotations', with_bbox=True),
dict(type='Resize', img_scale=(1333, 600), keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.0),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(
type='LoadAnnotations3D',
with_bbox=True,
with_label=True,
with_bbox_3d=True,
with_label_3d=True),
dict(type='Resize', scale=(1333, 600), keep_ratio=True),
dict(
type='RandomFlip3D',
sync_2d=False,
......@@ -188,15 +191,13 @@ train_pipeline = [
scale_ratio_range=[0.85, 1.15],
shift_height=True),
dict(type='PointSample', num_points=20000),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
keys=[
'img', 'gt_bboxes', 'gt_labels', 'points', 'gt_bboxes_3d',
type='Pack3DDetInputs',
keys=([
'img', 'gt_bboxes', 'gt_bboxes_labels', 'points', 'gt_bboxes_3d',
'gt_labels_3d'
])
]))
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
......@@ -205,56 +206,15 @@ test_pipeline = [
shift_height=True,
load_dim=6,
use_dim=[0, 1, 2]),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 600),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.0),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
),
dict(type='Resize', scale=(1333, 600), keep_ratio=True),
dict(type='PointSample', num_points=20000),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['img', 'points'])
]),
]
# construct a pipeline for data and gt loading in show function
# please keep its loading function consistent with test_pipeline (e.g. client)
eval_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
shift_height=False,
load_dim=6,
use_dim=[0, 1, 2]),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['img', 'points'])
dict(type='Pack3DDetInputs', keys=['img', 'points'])
]
data = dict(
train=dict(dataset=dict(pipeline=train_pipeline)),
val=dict(pipeline=test_pipeline),
test=dict(pipeline=test_pipeline))
evaluation = dict(pipeline=eval_pipeline)
train_dataloader = dict(dataset=dict(dataset=dict(pipeline=train_pipeline)))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
# may also use your own pre-trained image branch
load_from = 'https://download.openmmlab.com/mmdetection3d/v0.1.0_models/imvotenet/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class_20210323_173222-cad62aeb.pth' # noqa
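Note the doubly nested `dataset` key in `train_dataloader` above: the base config wraps SUNRGBDDataset in a `RepeatDataset`, so the pipeline override must reach through both levels. The merged result looks schematically like this (keys abbreviated):

train_pipeline = []  # stands in for the full pipeline defined above
train_dataloader = dict(
    batch_size=16,
    dataset=dict(          # RepeatDataset wrapper from the base config
        type='RepeatDataset',
        times=5,
        dataset=dict(      # the inner SUNRGBDDataset
            type='SUNRGBDDataset',
            pipeline=train_pipeline)))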
......@@ -178,10 +178,11 @@ class Det3DDataset(BaseDataset):
dict | None: Processed `ann_info`
"""
# add s or gt prefix for most keys after concat
# we only process 3d annotations here; the corresponding
# 2d annotations are handled by `LoadAnnotations3D`
# in the pipelines
name_mapping = {
'bbox_label': 'gt_labels',
'bbox_label_3d': 'gt_labels_3d',
'bbox': 'gt_bboxes',
'bbox_3d': 'gt_bboxes_3d',
'depth': 'depths',
'center_2d': 'centers_2d',
......@@ -196,6 +197,7 @@ class Det3DDataset(BaseDataset):
keys = list(instances[0].keys())
ann_info = dict()
for ann_name in keys:
if ann_name in name_mapping:
temp_anns = [item[ann_name] for item in instances]
# map the original dataset label to training label
if 'label' in ann_name:
......@@ -203,9 +205,10 @@ class Det3DDataset(BaseDataset):
self.label_mapping[item] for item in temp_anns
]
temp_anns = np.array(temp_anns)
if ann_name in name_mapping:
ann_name = name_mapping[ann_name]
ann_info[ann_name] = temp_anns
ann_info['instances'] = info['instances']
return ann_info
def parse_data_info(self, info: dict) -> dict:
......
......@@ -87,7 +87,8 @@ class KittiDataset(Det3DDataset):
if 'plane' in info:
# convert ground plane to velodyne coordinates
plane = np.array(info['plane'])
lidar2cam = np.array(info['images']['CAM2']['lidar2cam'])
lidar2cam = np.array(
info['images']['CAM2']['lidar2cam'], dtype=np.float32)
reverse = np.linalg.inv(lidar2cam)
(plane_norm_cam, plane_off_cam) = (plane[:3],
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Union
from typing import List, Sequence, Union
import mmcv
import numpy as np
import torch
from mmcv import BaseTransform
from mmcv.transforms import to_tensor
from mmengine import InstanceData
from numpy import dtype
from mmdet3d.core import Det3DDataSample, PointData
from mmdet3d.core.bbox import BaseInstance3DBoxes
......@@ -12,6 +14,38 @@ from mmdet3d.core.points import BasePoints
from mmdet3d.registry import TRANSFORMS
def to_tensor(
data: Union[torch.Tensor, np.ndarray, Sequence, int,
float]) -> torch.Tensor:
"""Convert objects of various python types to :obj:`torch.Tensor`.
Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
:class:`Sequence`, :class:`int` and :class:`float`.
Args:
data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to
be converted.
Returns:
torch.Tensor: the converted data.
"""
if isinstance(data, torch.Tensor):
return data
elif isinstance(data, np.ndarray):
if data.dtype is dtype('float64'):
data = data.astype(np.float32)
return torch.from_numpy(data)
elif isinstance(data, Sequence) and not mmcv.is_str(data):
return torch.tensor(data)
elif isinstance(data, int):
return torch.LongTensor([data])
elif isinstance(data, float):
return torch.FloatTensor([data])
else:
raise TypeError(f'type {type(data)} cannot be converted to tensor.')
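A quick usage sketch of the `to_tensor` helper above; note the implicit float64 → float32 downcast for NumPy arrays:

import numpy as np
import torch

arr = np.zeros((2, 3))                         # NumPy defaults to float64
assert to_tensor(arr).dtype == torch.float32   # downcast on conversion
assert to_tensor(5).dtype == torch.int64       # int -> LongTensor
assert to_tensor(0.5).dtype == torch.float32   # float -> FloatTensor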
@TRANSFORMS.register_module()
class Pack3DDetInputs(BaseTransform):
INPUTS_KEYS = ['points', 'img']
......@@ -20,7 +54,7 @@ class Pack3DDetInputs(BaseTransform):
]
INSTANCEDATA_2D_KEYS = [
'gt_bboxes',
'gt_labels',
'gt_bboxes_labels',
]
SEG_KEYS = [
......@@ -121,8 +155,8 @@ class Pack3DDetInputs(BaseTransform):
for key in [
'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels',
'gt_labels_3d', 'attr_labels', 'pts_instance_mask',
'pts_semantic_mask', 'centers_2d', 'depths'
'gt_bboxes_labels', 'attr_labels', 'pts_instance_mask',
'pts_semantic_mask', 'centers_2d', 'depths', 'gt_labels_3d'
]:
if key not in results:
continue
......@@ -159,6 +193,9 @@ class Pack3DDetInputs(BaseTransform):
elif key in self.INSTANCEDATA_3D_KEYS:
gt_instances_3d[self._remove_prefix(key)] = results[key]
elif key in self.INSTANCEDATA_2D_KEYS:
if key == 'gt_bboxes_labels':
gt_instances['labels'] = results[key]
else:
gt_instances[self._remove_prefix(key)] = results[key]
elif key in self.SEG_KEYS:
gt_pts_seg[self._remove_prefix(key)] = results[key]
......
......@@ -632,34 +632,6 @@ class LoadAnnotations3D(LoadAnnotations):
self.with_seg_3d = with_seg_3d
self.seg_3d_dtype = seg_3d_dtype
def _load_bboxes(self, results: dict) -> None:
"""Private function to load bounding box annotations.
Rewrite `_load_bboxes` since mmdet3d uses `parse_anno_info` in
datasets.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict contains loaded bounding box annotations.
"""
results['gt_bboxes'] = results['ann_info']['gt_bboxes']
def _load_labels(self, results: dict) -> None:
"""Private function to load label annotations.
Rewrite `_load_labels` since mmdet3d uses `parse_anno_info` in
datasets.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict contains loaded label annotations.
"""
results['gt_labels'] = results['ann_info']['gt_labels']
def _load_bboxes_3d(self, results: dict) -> dict:
"""Private function to move the 3D bounding box annotation from
`ann_info` field to the root of `results`.
......@@ -769,6 +741,56 @@ class LoadAnnotations3D(LoadAnnotations):
results['eval_ann_info']['pts_semantic_mask'] = pts_semantic_mask
return results
def _load_bboxes(self, results: dict) -> None:
"""Private function to load bounding box annotations.
The only difference is it remove the proceess for
`ignore_flag`
Args:
results (dict): Result dict from :obj:``mmcv.BaseDataset``.
Returns:
dict: The dict contains loaded bounding box annotations.
"""
gt_bboxes = []
for instance in results['instances']:
gt_bboxes.append(instance['bbox'])
if len(gt_bboxes) == 0:
results['gt_bboxes'] = np.zeros((0, 4), dtype=np.float32)
else:
results['gt_bboxes'] = np.array(
gt_bboxes, dtype=np.float32).reshape((-1, 4))
if self.denorm_bbox:
bbox_num = results['gt_bboxes'].shape[0]
if bbox_num != 0:
h, w = results['img_shape']
results['gt_bboxes'][:, 0::2] *= w
results['gt_bboxes'][:, 1::2] *= h
if 'eval_ann_info' in results:
results['eval_ann_info']['gt_bboxes'] = results['gt_bboxes']
def _load_labels(self, results: dict) -> None:
"""Private function to load label annotations.
Args:
results (dict): Result dict from :obj:`mmcv.BaseDataset`.
Returns:
dict: The dict contains loaded label annotations.
"""
gt_bboxes_labels = []
for instance in results['instances']:
gt_bboxes_labels.append(instance['bbox_label'])
if len(gt_bboxes_labels) == 0:
results['gt_bboxes_labels'] = np.zeros((0, ), dtype=np.int64)
else:
results['gt_bboxes_labels'] = np.array(
gt_bboxes_labels, dtype=np.int64)
if 'eval_ann_info' in results:
results['eval_ann_info']['gt_bboxes_labels'] = results[
'gt_bboxes_labels']
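To make the contract of the two loaders above concrete, an illustrative `results` dict they consume and the keys they produce (values are placeholders):

results = {
    'instances': [
        {'bbox': [10.0, 20.0, 50.0, 80.0], 'bbox_label': 3},
        {'bbox': [0.0, 0.0, 30.0, 40.0], 'bbox_label': 7},
    ],
}
# After _load_bboxes / _load_labels:
#   results['gt_bboxes']        -> float32 ndarray of shape (2, 4)
#   results['gt_bboxes_labels'] -> int64 ndarray of shape (2,)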
def transform(self, results: dict) -> dict:
"""Function to load multiple types annotations.
......
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
from typing import Callable, List, Optional, Union
import numpy as np
......@@ -22,13 +24,15 @@ class SUNRGBDDataset(Det3DDataset):
ann_file (str): Path of annotation file.
metainfo (dict, optional): Meta information for dataset, such as class
information. Defaults to None.
data_prefix (dict, optional): Prefix for data. Defaults to
data_prefix (dict): Prefix for data. Defaults to
`dict(pts='points', img='sunrgbd_trainval/image')`.
pipeline (list[dict], optional): Pipeline used for data processing.
Defaults to None.
modality (dict, optional): Modality to specify the sensor data used
as input. Defaults to `dict(use_camera=True, use_lidar=True)`.
box_type_3d (str, optional): Type of 3D box of this dataset.
default_cam_key (str): The default camera name adopted.
Defaults to "CAM0".
box_type_3d (str): Type of 3D box of this dataset.
Based on the `box_type_3d`, the dataset will encapsulate the box
in its original format and then convert it to `box_type_3d`.
Defaults to 'Depth' in this dataset. Available options include
......@@ -36,9 +40,9 @@ class SUNRGBDDataset(Det3DDataset):
- 'LiDAR': Box in LiDAR coordinates.
- 'Depth': Box in depth coordinates, usually for indoor dataset.
- 'Camera': Box in camera coordinates.
filter_empty_gt (bool, optional): Whether to filter empty GT.
filter_empty_gt (bool): Whether to filter empty GT.
Defaults to True.
test_mode (bool, optional): Whether the dataset is in test mode.
test_mode (bool): Whether the dataset is in test mode.
Defaults to False.
"""
METAINFO = {
......@@ -51,8 +55,9 @@ class SUNRGBDDataset(Det3DDataset):
ann_file: str,
metainfo: Optional[dict] = None,
data_prefix: dict = dict(
pts='points', img='sunrgbd_trainval'),
pts='points', img='sunrgbd_trainval/image'),
pipeline: List[Union[dict, Callable]] = [],
default_cam_key: str = 'CAM0',
modality=dict(use_camera=True, use_lidar=True),
box_type_3d: str = 'Depth',
filter_empty_gt: bool = True,
......@@ -64,6 +69,7 @@ class SUNRGBDDataset(Det3DDataset):
metainfo=metainfo,
data_prefix=data_prefix,
pipeline=pipeline,
default_cam_key=default_cam_key,
modality=modality,
box_type_3d=box_type_3d,
filter_empty_gt=filter_empty_gt,
......@@ -73,6 +79,47 @@ class SUNRGBDDataset(Det3DDataset):
'use_lidar' in self.modality
assert self.modality['use_camera'] or self.modality['use_lidar']
def parse_data_info(self, info: dict) -> dict:
"""Process the raw data info.
Convert all relative path of needed modality data file to
the absolute path. And process
the `instances` field to `ann_info` in training stage.
Args:
info (dict): Raw info dict.
Returns:
dict: Has `ann_info` in training stage. And
all path has been converted to absolute path.
"""
if self.modality['use_lidar']:
info['lidar_points']['lidar_path'] = \
osp.join(
self.data_prefix.get('pts', ''),
info['lidar_points']['lidar_path'])
if self.modality['use_camera']:
for cam_id, img_info in info['images'].items():
if 'img_path' in img_info:
img_info['img_path'] = osp.join(
self.data_prefix.get('img', ''), img_info['img_path'])
if self.default_cam_key is not None:
info['img_path'] = info['images'][
self.default_cam_key]['img_path']
info['depth2img'] = np.array(
info['images'][self.default_cam_key]['depth2img'],
dtype=np.float32)
if not self.test_mode:
# used in training
info['ann_info'] = self.parse_ann_info(info)
if self.test_mode and self.load_eval_anns:
info['eval_ann_info'] = self.parse_ann_info(info)
return info
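Illustrative shape of an `info` entry before and after `parse_data_info` (paths and matrix values are placeholders):

import numpy as np

info = dict(
    lidar_points=dict(lidar_path='000001.bin'),
    images=dict(CAM0=dict(
        img_path='000001.jpg',
        depth2img=np.eye(3).tolist())),
    instances=[])
# After parse_data_info in train mode (with the defaults above):
#   info['lidar_points']['lidar_path'] gains the 'points' prefix
#   info['img_path'] -> 'sunrgbd_trainval/image/000001.jpg'
#   info['depth2img'] becomes a float32 numpy array
#   info['ann_info'] holds the annotations parsed by parse_ann_info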
def parse_ann_info(self, info: dict) -> dict:
"""Process the `instances` in data info to `ann_info`
......@@ -83,12 +130,11 @@ class SUNRGBDDataset(Det3DDataset):
dict: Processed `ann_info`
"""
ann_info = super().parse_ann_info(info)
# empty gt
# process data without any annotations
if ann_info is None:
ann_info = dict()
ann_info['gt_bboxes_3d'] = np.zeros((0, 6), dtype=np.float32)
ann_info['gt_labels_3d'] = np.zeros((0, ), dtype=np.int64)
# to target box structure
ann_info['gt_bboxes_3d'] = DepthInstance3DBoxes(
ann_info['gt_bboxes_3d'],
......
# Copyright (c) OpenMMLab. All rights reserved.
from collections import OrderedDict
from typing import Dict, List, Optional, Sequence
import numpy as np
from mmengine.evaluator import BaseMetric
from mmengine.logging import MMLogger
from mmdet3d.core import get_box_type, indoor_eval
from mmdet3d.registry import METRICS
from mmdet.core import eval_map
@METRICS.register_module()
class IndoorMetric(BaseMetric):
"""Kitti evaluation metric.
"""Indoor scene evaluation metric.
Args:
iou_thr (list[float]): List of IoU thresholds used when calculating the
......@@ -90,3 +93,91 @@ class IndoorMetric(BaseMetric):
box_mode_3d=box_mode_3d)
return ret_dict
@METRICS.register_module()
class Indoor2DMetric(BaseMetric):
"""indoor 2d predictions evaluation metric.
Args:
iou_thr (list[float]): List of iou threshold when calculate the
metric. Defaults to [0.5].
collect_device (str, optional): Device name used for collecting
results from different ranks during distributed training.
Must be 'cpu' or 'gpu'. Defaults to 'cpu'.
prefix (str): The prefix that will be added in the metric
names to disambiguate homonymous metrics of different evaluators.
If prefix is not provided in the argument, self.default_prefix
will be used instead. Default: None
"""
def __init__(self,
iou_thr: List[float] = [0.5],
collect_device: str = 'cpu',
prefix: Optional[str] = None,
**kwargs):
super(Indoor2DMetric, self).__init__(
prefix=prefix, collect_device=collect_device)
self.iou_thr = iou_thr
def process(self, data_batch: Sequence[dict],
predictions: Sequence[dict]) -> None:
"""Process one batch of data samples and predictions.
The processed results should be stored in ``self.results``,
which will be used to compute the metrics when all batches
have been processed.
Args:
data_batch (Sequence[dict]): A batch of data
from the dataloader.
predictions (Sequence[dict]): A batch of outputs from
the model.
"""
batch_eval_anns = [
item['data_sample']['eval_ann_info'] for item in data_batch
]
for eval_ann, pred_dict in zip(batch_eval_anns, predictions):
pred = pred_dict['pred_instances']
ann = dict(
labels=eval_ann['gt_bboxes_labels'],
bboxes=eval_ann['gt_bboxes'])
pred_bboxes = pred['bboxes'].cpu().numpy()
pred_scores = pred['scores'].cpu().numpy()
pred_labels = pred['labels'].cpu().numpy()
dets = []
for label in range(len(self.dataset_meta['CLASSES'])):
index = np.where(pred_labels == label)[0]
pred_bbox_scores = np.hstack(
[pred_bboxes[index], pred_scores[index].reshape((-1, 1))])
dets.append(pred_bbox_scores)
self.results.append((ann, dets))
def compute_metrics(self, results: list) -> Dict[str, float]:
"""Compute the metrics from processed results.
Args:
results (list): The processed results of each batch.
Returns:
Dict[str, float]: The computed metrics. The keys are the names of
the metrics, and the values are corresponding results.
"""
logger: MMLogger = MMLogger.get_current_instance()
annotations, preds = zip(*results)
eval_results = OrderedDict()
iou_thr_2d = (self.iou_thr, ) if isinstance(self.iou_thr,
float) else self.iou_thr
for iou_thr_2d_single in iou_thr_2d:
mean_ap, _ = eval_map(
preds,
annotations,
scale_ranges=None,
iou_thr=iou_thr_2d_single,
dataset=self.dataset_meta['CLASSES'],
logger=logger)
eval_results['mAP_' + str(iou_thr_2d_single)] = mean_ap
return eval_results
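For orientation, each entry of `results` consumed by `compute_metrics` is an `(ann, dets)` pair shaped like this (illustrative values, two classes shown):

import numpy as np

ann = dict(
    labels=np.array([0, 1]),                  # gt class indices
    bboxes=np.array([[0., 0., 10., 10.],
                     [5., 5., 20., 20.]]))
dets = [                                      # one (N, 5) array per class
    np.array([[0., 0., 9., 9., 0.9]]),        # class 0: boxes + score column
    np.zeros((0, 5)),                         # class 1: no detections
]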
......@@ -754,15 +754,16 @@ class VoteHead(BaseModule):
batch_size = bbox3d.shape[0]
results_list = list()
if use_nms:
for b in range(batch_size):
for batch_index in range(batch_size):
temp_results = InstanceData()
bbox_selected, score_selected, labels = \
self.multiclass_nms_single(obj_scores[b],
sem_scores[b],
bbox3d[b],
stack_points[b, ..., :3],
batch_input_metas[b])
bbox = batch_input_metas[b]['box_type_3d'](
self.multiclass_nms_single(
obj_scores[batch_index],
sem_scores[batch_index],
bbox3d[batch_index],
stack_points[batch_index, ..., :3],
batch_input_metas[batch_index])
bbox = batch_input_metas[batch_index]['box_type_3d'](
bbox_selected,
box_dim=bbox_selected.shape[-1],
with_yaw=self.bbox_coder.with_rot)
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Union
from typing import List, Optional, Union
from mmengine import InstanceData
......@@ -91,8 +91,8 @@ class Base3DDetector(BaseDetector):
def convert_to_datasample(
self,
results_list_3d: InstanceList,
results_list_2d: InstanceList = None,
results_list_3d: Optional[InstanceList] = None,
results_list_2d: Optional[InstanceList] = None,
) -> SampleList:
"""Convert results list to `Det3DDataSample`.
......@@ -128,10 +128,18 @@ class Base3DDetector(BaseDetector):
"""
data_sample_list = []
assert (results_list_2d is not None) or \
(results_list_3d is not None),\
'please pass at least one type of results_list'
if results_list_2d is None:
results_list_2d = [
InstanceData() for _ in range(len(results_list_3d))
]
if results_list_3d is None:
results_list_3d = [
InstanceData() for _ in range(len(results_list_2d))
]
for i in range(len(results_list_3d)):
result = Det3DDataSample()
result.pred_instances_3d = results_list_3d[i]
......
......@@ -105,8 +105,8 @@ def extract_2d_info(img_meta, tensor):
"""
img_shape = img_meta['img_shape']
ori_shape = img_meta['ori_shape']
img_h, img_w, _ = img_shape
ori_h, ori_w, _ = ori_shape
img_h, img_w = img_shape
ori_h, ori_w = ori_shape
img_scale_factor = (
tensor.new_tensor(img_meta['scale_factor'][:2])
......
......@@ -45,8 +45,6 @@ class VoteFusion(nn.Module):
seed_num = seed_3d_depth.shape[0]
img_shape = img_meta['img_shape']
img_h, img_w, _ = img_shape
# first reverse the data transformations
xyz_depth = apply_3d_transformation(
seed_3d_depth, 'DEPTH', img_meta, reverse=True)
......
import unittest
import torch
from mmengine import DefaultScope
from mmdet3d.registry import MODELS
from tests.utils.model_utils import (_create_detector_inputs,
_get_detector_cfg, _setup_seed)
class TestH3D(unittest.TestCase):
def test_h3dnet(self):
import mmdet3d.models
assert hasattr(mmdet3d.models, 'H3DNet')
DefaultScope.get_instance('test_H3DNet', scope_name='mmdet3d')
_setup_seed(0)
voxel_net_cfg = _get_detector_cfg(
'h3dnet/h3dnet_3x8_scannet-3d-18class.py')
model = MODELS.build(voxel_net_cfg)
num_gt_instance = 5
data = [
_create_detector_inputs(
num_gt_instance=num_gt_instance,
points_feat_dim=4,
bboxes_3d_type='depth',
with_pts_semantic_mask=True,
with_pts_instance_mask=True)
]
if torch.cuda.is_available():
model = model.cuda()
# test simple_test
with torch.no_grad():
batch_inputs, data_samples = model.data_preprocessor(
data, True)
results = model.forward(
batch_inputs, data_samples, mode='predict')
self.assertEqual(len(results), len(data))
self.assertIn('bboxes_3d', results[0].pred_instances_3d)
self.assertIn('scores_3d', results[0].pred_instances_3d)
self.assertIn('labels_3d', results[0].pred_instances_3d)
# save the memory
with torch.no_grad():
losses = model.forward(batch_inputs, data_samples, mode='loss')
self.assertGreater(losses['vote_loss'], 0)
self.assertGreater(losses['objectness_loss'], 0)
self.assertGreater(losses['center_loss'], 0)
import unittest
import torch
from mmengine import DefaultScope
from mmdet3d.registry import MODELS
from tests.utils.model_utils import (_create_detector_inputs,
_get_detector_cfg, _setup_seed)
class TestImvoteNet(unittest.TestCase):
def test_imvotenet_only_img(self):
import mmdet3d.models
assert hasattr(mmdet3d.models, 'ImVoteNet')
DefaultScope.get_instance('test_imvotenet_img', scope_name='mmdet3d')
_setup_seed(0)
votenet_net_cfg = _get_detector_cfg(
'imvotenet/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class.py'
)
model = MODELS.build(votenet_net_cfg)
data = [
_create_detector_inputs(
with_points=False, with_img=True, img_size=128)
]
if torch.cuda.is_available():
model = model.cuda()
# test simple_test
with torch.no_grad():
batch_inputs, data_samples = model.data_preprocessor(
data, True)
results = model.forward(
batch_inputs, data_samples, mode='predict')
self.assertEqual(len(results), len(data))
self.assertIn('bboxes', results[0].pred_instances)
self.assertIn('scores', results[0].pred_instances)
self.assertIn('labels', results[0].pred_instances)
# save the memory
with torch.no_grad():
losses = model.forward(batch_inputs, data_samples, mode='loss')
self.assertGreater(sum(losses['loss_rpn_cls']), 0)
self.assertGreater(losses['loss_cls'], 0)
self.assertGreater(losses['loss_bbox'], 0)
def test_imvotenet(self):
import mmdet3d.models
assert hasattr(mmdet3d.models, 'ImVoteNet')
DefaultScope.get_instance('test_imvotenet', scope_name='mmdet3d')
_setup_seed(0)
votenet_net_cfg = _get_detector_cfg(
'imvotenet/imvotenet_stage2_16x8_sunrgbd-3d-10class.py')
model = MODELS.build(votenet_net_cfg)
data = [
_create_detector_inputs(
with_points=True,
with_img=True,
img_size=128,
bboxes_3d_type='depth')
]
if torch.cuda.is_available():
model = model.cuda()
# test simple_test
with torch.no_grad():
batch_inputs, data_samples = model.data_preprocessor(
data, True)
results = model.forward(
batch_inputs, data_samples, mode='predict')
self.assertEqual(len(results), len(data))
self.assertIn('bboxes_3d', results[0].pred_instances_3d)
self.assertIn('scores_3d', results[0].pred_instances_3d)
self.assertIn('labels_3d', results[0].pred_instances_3d)
# save the memory
with torch.no_grad():
losses = model.forward(batch_inputs, data_samples, mode='loss')
self.assertGreater(losses['vote_loss'], 0)
self.assertGreater(losses['objectness_loss'], 0)
self.assertGreater(losses['semantic_loss'], 0)
......@@ -75,6 +75,7 @@ def _get_detector_cfg(fname):
def _create_detector_inputs(seed=0,
with_points=True,
with_img=False,
img_size=10,
num_gt_instance=20,
num_points=10,
points_feat_dim=4,
......@@ -90,23 +91,46 @@ def _create_detector_inputs(seed=0,
'depth': DepthInstance3DBoxes,
'cam': CameraInstance3DBoxes
}
meta_info = dict()
meta_info['depth2img'] = np.array(
[[5.23289349e+02, 3.68831943e+02, 6.10469439e+01],
[1.09560138e+02, 1.97404735e+02, -5.47377738e+02],
[1.25930002e-02, 9.92229998e-01, -1.23769999e-01]])
meta_info['lidar2img'] = np.array(
[[5.23289349e+02, 3.68831943e+02, 6.10469439e+01],
[1.09560138e+02, 1.97404735e+02, -5.47377738e+02],
[1.25930002e-02, 9.92229998e-01, -1.23769999e-01]])
if with_points:
points = torch.rand([num_points, points_feat_dim])
else:
points = None
if with_img:
img = torch.rand(3, 10, 10)
img = torch.rand(3, img_size, img_size)
meta_info['img_shape'] = (img_size, img_size)
meta_info['ori_shape'] = (img_size, img_size)
meta_info['scale_factor'] = np.array([1., 1.])
else:
img = None
inputs_dict = dict(img=img, points=points)
gt_instance_3d = InstanceData()
gt_instance_3d.bboxes_3d = bbox_3d_class[bboxes_3d_type](
torch.rand([num_gt_instance, gt_bboxes_dim]), box_dim=gt_bboxes_dim)
gt_instance_3d.labels_3d = torch.randint(0, num_classes, [num_gt_instance])
data_sample = Det3DDataSample(
metainfo=dict(box_type_3d=bbox_3d_class[bboxes_3d_type]))
data_sample.set_metainfo(meta_info)
data_sample.gt_instances_3d = gt_instance_3d
gt_instance = InstanceData()
gt_instance.labels = torch.randint(0, num_classes, [num_gt_instance])
gt_instance.bboxes = torch.rand(num_gt_instance, 4)
gt_instance.bboxes[:,
2:] = gt_instance.bboxes[:, :2] + gt_instance.bboxes[:,
2:]
data_sample.gt_instances = gt_instance
data_sample.gt_pts_seg = PointData()
if with_pts_instance_mask:
pts_instance_mask = torch.randint(0, num_gt_instance, [num_points])
......
......@@ -652,6 +652,9 @@ def update_sunrgbd_infos(pkl_path, out_dir):
temp_data_info['images']['CAM0']['width'] = w
anns = ori_info_dict['annos']
if anns['gt_num'] == 0:
instance_list = []
else:
num_instances = len(anns['name'])
ignore_class_name = set()
instance_list = []
......@@ -659,12 +662,16 @@ def update_sunrgbd_infos(pkl_path, out_dir):
empty_instance = get_empty_instance()
empty_instance['bbox_3d'] = anns['gt_boxes_upright_depth'][
instance_id].tolist()
empty_instance['bbox'] = anns['bbox'][instance_id].tolist()
if anns['name'][instance_id] in METAINFO['CLASSES']:
empty_instance['bbox_label_3d'] = METAINFO['CLASSES'].index(
anns['name'][instance_id])
empty_instance['bbox_label_3d'] = METAINFO[
'CLASSES'].index(anns['name'][instance_id])
empty_instance['bbox_label'] = empty_instance[
'bbox_label_3d']
else:
ignore_class_name.add(anns['name'][instance_id])
empty_instance['bbox_label_3d'] = -1
empty_instance['bbox_label'] = -1
empty_instance = clear_instance_unused_keys(empty_instance)
instance_list.append(empty_instance)
temp_data_info['instances'] = instance_list
......