"git@developer.sourcefind.cn:OpenDAS/fairscale.git" did not exist on "f4531ab73b24af7fbb9a5b114b50c1ce761724d9"
Commit 2d91ef83 authored by zhangwenwei

Encapsulate network output into box structure

parent 26c08c7e
yapf -r -i --style .style.yapf mmdet3d/ configs/ tests/ tools/
isort -rc mmdet3d/ configs/ tests/ tools/
flake8 .
# model settings
voxel_size = [0.25, 0.25, 8]
point_cloud_range = [-50, -50, -5, 50, 50, 3]
class_names = [
    'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
    'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
]
model = dict(
    type='MVXFasterRCNNV2',
    pts_voxel_layer=dict(
        max_num_points=64,  # max_points_per_voxel
        point_cloud_range=point_cloud_range,  # velodyne coordinates, x, y, z
        voxel_size=voxel_size,
        max_voxels=(30000, 40000),  # (training, testing) max_voxels
    ),
    pts_voxel_encoder=dict(
        type='HardVFE',
        in_channels=4,
        feat_channels=[64, 64],
        with_distance=False,
        voxel_size=voxel_size,
        with_cluster_center=True,
        with_voxel_center=True,
        point_cloud_range=point_cloud_range,
        norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)),
    pts_middle_encoder=dict(
        type='PointPillarsScatter',
        in_channels=64,
        output_shape=[400, 400],  # checked from PointCloud3D
    ),
    pts_backbone=dict(
        type='SECOND',
        in_channels=64,
        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
        layer_nums=[3, 5, 5],
        layer_strides=[2, 2, 2],
        out_channels=[64, 128, 256],
    ),
    pts_neck=dict(
        type='FPN',
        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
        act_cfg=dict(type='ReLU'),
        in_channels=[64, 128, 256],
        out_channels=256,
        start_level=0,
        num_outs=3,
    ),
    pts_bbox_head=dict(
        type='Anchor3DHead',
        num_classes=10,
        in_channels=256,
        feat_channels=256,
        use_direction_classifier=True,
        anchor_generator=dict(
            type='AlignedAnchor3DRangeGenerator',
            ranges=[[-50, -50, -1.8, 50, 50, -1.8]],
            scales=[1, 2, 4],
            sizes=[
                [0.8660, 2.5981, 1.],  # 1.5/sqrt(3)
                [0.5774, 1.7321, 1.],  # 1/sqrt(3)
                [1., 1., 1.],
                [0.4, 0.4, 1],
            ],
            custom_values=[0, 0],
            rotations=[0, 1.57],
            reshape_out=True),
        assigner_per_size=False,
        diff_rad_by_sin=True,
        dir_offset=0.7854,  # pi/4
        dir_limit_offset=0,
        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
        loss_dir=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)))
# model training and testing settings
train_cfg = dict(
    pts=dict(
        assigner=dict(  # for Car
            type='MaxIoUAssigner',
            iou_calculator=dict(type='BboxOverlapsNearest3D'),
            pos_iou_thr=0.6,
            neg_iou_thr=0.3,
            min_pos_iou=0.3,
            ignore_iof_thr=-1),
        allowed_border=0,
        code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
        pos_weight=-1,
        debug=False))
test_cfg = dict(
    pts=dict(
        use_rotate_nms=True,
        nms_across_levels=False,
        nms_pre=1000,
        nms_thr=0.2,
        score_thr=0.05,
        min_bbox_size=0,
        max_num=500
        # soft-nms is also supported for rcnn testing
        # e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05)
    ))
# dataset settings
dataset_type = 'NuScenesDataset'
data_root = 'data/nuscenes/'
img_norm_cfg = dict(
    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
input_modality = dict(
    use_lidar=True,
    use_depth=False,
    use_lidar_intensity=True,
    use_camera=False,
)
db_sampler = dict(
    data_root=data_root,
    info_path=data_root + 'nuscenes_dbinfos_train.pkl',
    rate=1.0,
    object_rot_range=[0.0, 0.0],
    prepare=dict(),
    sample_groups=dict(
        bus=4,
        trailer=4,
        truck=4,
    ))
file_client_args = dict(backend='disk')
# file_client_args = dict(
#     backend='petrel',
#     path_mapping=dict({
#         './data/nuscenes/': 's3://nuscenes/nuscenes/',
#         'data/nuscenes/': 's3://nuscenes/nuscenes/'
#     }))
train_pipeline = [
    dict(
        type='LoadPointsFromFile',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(
        type='LoadPointsFromMultiSweeps',
        sweeps_num=10,
        file_client_args=file_client_args),
    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
    dict(
        type='GlobalRotScale',
        rot_uniform_noise=[-0.3925, 0.3925],
        scaling_uniform_noise=[0.95, 1.05],
        trans_normal_noise=[0, 0, 0]),
    dict(type='RandomFlip3D', flip_ratio=0.5),
    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='PointShuffle'),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']),
]
test_pipeline = [
    dict(
        type='LoadPointsFromFile',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(
        type='LoadPointsFromMultiSweeps',
        sweeps_num=10,
        file_client_args=file_client_args),
    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='RandomFlip3D', flip_ratio=0),
    dict(
        type='DefaultFormatBundle3D',
        class_names=class_names,
        with_label=False),
    dict(type='Collect3D', keys=['points']),
]
data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_train.pkl',
        pipeline=train_pipeline,
        modality=input_modality,
        classes=class_names,
        test_mode=False),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_val.pkl',
        pipeline=test_pipeline,
        modality=input_modality,
        classes=class_names,
        test_mode=True),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_val.pkl',
        pipeline=test_pipeline,
        modality=input_modality,
        classes=class_names,
        test_mode=True))
# optimizer
optimizer = dict(type='AdamW', lr=0.001, weight_decay=0.01)
# max_norm=10 is better for SECOND
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=1000,
    warmup_ratio=1.0 / 1000,
    step=[20, 23])
momentum_config = None
checkpoint_config = dict(interval=1)
# yapf:disable
evaluation = dict(interval=24)
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook')
    ])
# yapf:enable
# runtime settings
total_epochs = 24
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d'
load_from = None
resume_from = None
workflow = [('train', 1)]
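
The file above is a standard MMDetection-style Python config: everything is plain module-level assignment, so it can be loaded and inspected with mmcv. A minimal sketch, assuming mmcv is installed; the file path is an assumption based on work_dir:

from mmcv import Config

# Hypothetical path; point this at wherever the config above is saved.
cfg = Config.fromfile('configs/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d.py')
print(cfg.model.type)            # 'MVXFasterRCNNV2'
print(cfg.data.samples_per_gpu)  # 4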
@@ -83,7 +83,9 @@ def bbox3d2result(bboxes, scores, labels):
         dict(Tensor): bbox results in cpu mode
     """
     return dict(
-        boxes_3d=bboxes.cpu(), scores_3d=scores.cpu(), labels_3d=labels.cpu())
+        boxes_3d=bboxes.to('cpu'),
+        scores_3d=scores.cpu(),
+        labels_3d=labels.cpu())
 
 
 def upright_depth_to_lidar_torch(points=None,
......
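
With this change, boxes_3d stays a box structure end to end, so the device transfer goes through the structure's own .to() rather than a raw tensor's .cpu(). A minimal usage sketch with made-up inputs, assuming bbox3d2result and LiDARInstance3DBoxes are importable from mmdet3d.core as in this codebase:

import torch
from mmdet3d.core import bbox3d2result
from mmdet3d.core.bbox import LiDARInstance3DBoxes

boxes = LiDARInstance3DBoxes(torch.rand(5, 7))  # 5 fake 7-dim boxes
scores = torch.rand(5)
labels = torch.randint(0, 10, (5,))

result = bbox3d2result(boxes, scores, labels)
assert result['boxes_3d'].tensor.shape == (5, 7)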
@@ -6,11 +6,39 @@ import numpy as np
 from torch.utils.data import Dataset
 
 from mmdet.datasets import DATASETS
+from ..core.bbox import (Box3DMode, CameraInstance3DBoxes,
+                         DepthInstance3DBoxes, LiDARInstance3DBoxes)
 from .pipelines import Compose
 
 
 @DATASETS.register_module()
 class Custom3DDataset(Dataset):
+    """Customized 3D dataset.
+
+    This is the base dataset of SUNRGB-D, ScanNet, nuScenes, and KITTI
+    dataset.
+
+    Args:
+        data_root (str): Path of dataset root.
+        ann_file (str): Path of annotation file.
+        pipeline (list[dict], optional): Pipeline used for data processing.
+            Defaults to None.
+        classes (tuple[str], optional): Classes used in the dataset.
+            Defaults to None.
+        modality (dict, optional): Modality to specify the sensor data used
+            as input. Defaults to None.
+        box_type_3d (str, optional): Type of 3D box of this dataset.
+            Based on the `box_type_3d`, the dataset will encapsulate the box
+            to its original format then convert them to `box_type_3d`.
+            Defaults to 'LiDAR'. Available options include
+            - 'LiDAR': box in LiDAR coordinates
+            - 'Depth': box in depth coordinates, usually for indoor datasets
+            - 'Camera': box in camera coordinates
+        filter_empty_gt (bool, optional): Whether to filter empty GT.
+            Defaults to True.
+        test_mode (bool, optional): Whether the dataset is in test mode.
+            Defaults to False.
+    """
 
     def __init__(self,
                  data_root,
@@ -18,6 +46,7 @@ class Custom3DDataset(Dataset):
                  pipeline=None,
                  classes=None,
                  modality=None,
+                 box_type_3d='LiDAR',
                  filter_empty_gt=True,
                  test_mode=False):
         super().__init__()
@@ -26,6 +55,7 @@ class Custom3DDataset(Dataset):
         self.test_mode = test_mode
         self.modality = modality
         self.filter_empty_gt = filter_empty_gt
+        self.get_box_type(box_type_3d)
 
         self.CLASSES = self.get_classes(classes)
         self.data_infos = self.load_annotations(self.ann_file)
@@ -40,6 +70,21 @@ class Custom3DDataset(Dataset):
     def load_annotations(self, ann_file):
         return mmcv.load(ann_file)
 
+    def get_box_type(self, box_type):
+        box_type_lower = box_type.lower()
+        if box_type_lower == 'lidar':
+            self.box_type_3d = LiDARInstance3DBoxes
+            self.box_mode_3d = Box3DMode.LIDAR
+        elif box_type_lower == 'camera':
+            self.box_type_3d = CameraInstance3DBoxes
+            self.box_mode_3d = Box3DMode.CAM
+        elif box_type_lower == 'depth':
+            self.box_type_3d = DepthInstance3DBoxes
+            self.box_mode_3d = Box3DMode.DEPTH
+        else:
+            raise ValueError('Only "box_type" of "camera", "lidar", "depth"'
+                             f' are supported, got {box_type}')
+
     def get_data_info(self, index):
         info = self.data_infos[index]
         sample_idx = info['point_cloud']['lidar_idx']
@@ -61,6 +106,8 @@ class Custom3DDataset(Dataset):
         results['bbox3d_fields'] = []
         results['pts_mask_fields'] = []
         results['pts_seg_fields'] = []
+        results['box_type_3d'] = self.box_type_3d
+        results['box_mode_3d'] = self.box_mode_3d
 
     def prepare_train_data(self, index):
         input_dict = self.get_data_info(index)
......
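
get_box_type is a plain string-to-class dispatch: each dataset ends up holding both the box class (used to construct boxes) and the coordinate-mode enum (used for conversions). A standalone mirror of that mapping for illustration; the dataset stores the pair on self rather than in a dict, and _BOX_TYPES is a made-up name:

from mmdet3d.core.bbox import (Box3DMode, CameraInstance3DBoxes,
                               DepthInstance3DBoxes, LiDARInstance3DBoxes)

_BOX_TYPES = {
    'lidar': (LiDARInstance3DBoxes, Box3DMode.LIDAR),
    'camera': (CameraInstance3DBoxes, Box3DMode.CAM),
    'depth': (DepthInstance3DBoxes, Box3DMode.DEPTH),
}

# Equivalent of get_box_type('LiDAR'):
box_type_3d, box_mode_3d = _BOX_TYPES['LiDAR'.lower()]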
@@ -9,7 +9,7 @@ import torch
 from mmcv.utils import print_log
 
 from mmdet.datasets import DATASETS
-from ..core.bbox import Box3DMode, CameraInstance3DBoxes, box_np_ops
+from ..core.bbox import Box3DMode, CameraInstance3DBoxes
 from .custom_3d import Custom3DDataset
 from .utils import remove_dontcare
@@ -27,6 +27,8 @@ class KittiDataset(Custom3DDataset):
                  pipeline=None,
                  classes=None,
                  modality=None,
+                 box_type_3d='LiDAR',
+                 filter_empty_gt=True,
                  test_mode=False):
         super().__init__(
             data_root=data_root,
@@ -34,6 +36,8 @@ class KittiDataset(Custom3DDataset):
             pipeline=pipeline,
             classes=classes,
             modality=modality,
+            box_type_3d=box_type_3d,
+            filter_empty_gt=filter_empty_gt,
             test_mode=test_mode)
         self.root_split = os.path.join(self.data_root, split)
@@ -90,7 +94,7 @@ class KittiDataset(Custom3DDataset):
         # convert gt_bboxes_3d to velodyne coordinates
         gt_bboxes_3d = CameraInstance3DBoxes(gt_bboxes_3d).convert_to(
-            Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c))
+            self.box_mode_3d, np.linalg.inv(rect @ Trv2c))
         gt_bboxes = annos['bbox']
 
         selected = self.drop_arrays_by_name(gt_names, ['DontCare'])
@@ -395,73 +399,66 @@ class KittiDataset(Custom3DDataset):
     def convert_valid_bboxes(self, box_dict, info):
         # TODO: refactor this function
-        final_box_preds = box_dict['boxes_3d']
-        final_scores = box_dict['scores_3d']
-        final_labels = box_dict['labels_3d']
+        box_preds = box_dict['boxes_3d']
+        scores = box_dict['scores_3d']
+        labels = box_dict['labels_3d']
         sample_idx = info['image']['image_idx']
-        final_box_preds[:, -1] = box_np_ops.limit_period(
-            final_box_preds[:, -1] - np.pi, offset=0.5, period=np.pi * 2)
+        # TODO: remove the hack of yaw
+        box_preds.tensor[:, -1] = box_preds.tensor[:, -1] - np.pi
+        box_preds.limit_yaw(offset=0.5, period=np.pi * 2)
 
-        if final_box_preds.shape[0] == 0:
+        if len(box_preds) == 0:
             return dict(
-                bbox=final_box_preds.new_zeros([0, 4]).numpy(),
-                box3d_camera=final_box_preds.new_zeros([0, 7]).numpy(),
-                box3d_lidar=final_box_preds.new_zeros([0, 7]).numpy(),
-                scores=final_box_preds.new_zeros([0]).numpy(),
-                label_preds=final_box_preds.new_zeros([0, 4]).numpy(),
-                sample_idx=sample_idx,
-            )
+                bbox=np.zeros([0, 4]),
+                box3d_camera=np.zeros([0, 7]),
+                box3d_lidar=np.zeros([0, 7]),
+                scores=np.zeros([0]),
+                label_preds=np.zeros([0, 4]),
+                sample_idx=sample_idx)
 
         from mmdet3d.core.bbox import box_torch_ops
         rect = info['calib']['R0_rect'].astype(np.float32)
         Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32)
         P2 = info['calib']['P2'].astype(np.float32)
         img_shape = info['image']['image_shape']
-        rect = final_box_preds.new_tensor(rect)
-        Trv2c = final_box_preds.new_tensor(Trv2c)
-        P2 = final_box_preds.new_tensor(P2)
-        final_box_preds_camera = box_torch_ops.box_lidar_to_camera(
-            final_box_preds, rect, Trv2c)
-        locs = final_box_preds_camera[:, :3]
-        dims = final_box_preds_camera[:, 3:6]
-        angles = final_box_preds_camera[:, 6]
-        camera_box_origin = [0.5, 1.0, 0.5]
-        box_corners = box_torch_ops.center_to_corner_box3d(
-            locs, dims, angles, camera_box_origin, axis=1)
+        P2 = box_preds.tensor.new_tensor(P2)
+
+        box_preds_camera = box_preds.convert_to(Box3DMode.CAM, rect @ Trv2c)
+
+        box_corners = box_preds_camera.corners
         box_corners_in_image = box_torch_ops.project_to_image(box_corners, P2)
         # box_corners_in_image: [N, 8, 2]
         minxy = torch.min(box_corners_in_image, dim=1)[0]
         maxxy = torch.max(box_corners_in_image, dim=1)[0]
         box_2d_preds = torch.cat([minxy, maxxy], dim=1)
         # Post-processing
-        # check final_box_preds_camera
-        image_shape = final_box_preds.new_tensor(img_shape)
-        valid_cam_inds = ((final_box_preds_camera[:, 0] < image_shape[1]) &
-                          (final_box_preds_camera[:, 1] < image_shape[0]) &
-                          (final_box_preds_camera[:, 2] > 0) &
-                          (final_box_preds_camera[:, 3] > 0))
-        # check final_box_preds
-        limit_range = final_box_preds.new_tensor(self.pcd_limit_range)
-        valid_pcd_inds = ((final_box_preds[:, :3] > limit_range[:3]) &
-                          (final_box_preds[:, :3] < limit_range[3:]))
+        # check box_preds_camera
+        image_shape = box_preds.tensor.new_tensor(img_shape)
+        valid_cam_inds = ((box_preds_camera.tensor[:, 0] < image_shape[1]) &
+                          (box_preds_camera.tensor[:, 1] < image_shape[0]) &
+                          (box_preds_camera.tensor[:, 2] > 0) &
+                          (box_preds_camera.tensor[:, 3] > 0))
+        # check box_preds
+        limit_range = box_preds.tensor.new_tensor(self.pcd_limit_range)
+        valid_pcd_inds = ((box_preds.center > limit_range[:3]) &
+                          (box_preds.center < limit_range[3:]))
         valid_inds = valid_cam_inds & valid_pcd_inds.all(-1)
 
         if valid_inds.sum() > 0:
             return dict(
                 bbox=box_2d_preds[valid_inds, :].numpy(),
-                box3d_camera=final_box_preds_camera[valid_inds, :].numpy(),
-                box3d_lidar=final_box_preds[valid_inds, :].numpy(),
-                scores=final_scores[valid_inds].numpy(),
-                label_preds=final_labels[valid_inds].numpy(),
+                box3d_camera=box_preds_camera[valid_inds].tensor.numpy(),
+                box3d_lidar=box_preds[valid_inds].tensor.numpy(),
+                scores=scores[valid_inds].numpy(),
+                label_preds=labels[valid_inds].numpy(),
                 sample_idx=sample_idx,
             )
         else:
             return dict(
-                bbox=final_box_preds.new_zeros([0, 4]).numpy(),
-                box3d_camera=final_box_preds.new_zeros([0, 7]).numpy(),
-                box3d_lidar=final_box_preds.new_zeros([0, 7]).numpy(),
-                scores=final_box_preds.new_zeros([0]).numpy(),
-                label_preds=final_box_preds.new_zeros([0, 4]).numpy(),
+                bbox=np.zeros([0, 4]),
+                box3d_camera=np.zeros([0, 7]),
+                box3d_lidar=np.zeros([0, 7]),
+                scores=np.zeros([0]),
+                label_preds=np.zeros([0, 4]),
                 sample_idx=sample_idx,
             )
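
The 2D boxes above come from projecting the eight corners of each camera-frame box into the image and taking the axis-aligned hull; the min/max reduction is plain tensor arithmetic. A self-contained illustration with fake projected corners, no mmdet3d calls involved:

import torch

corners_2d = torch.rand(4, 8, 2) * 100      # fake projected corners, [N, 8, 2]
minxy = torch.min(corners_2d, dim=1)[0]     # per-box min over the 8 corners
maxxy = torch.max(corners_2d, dim=1)[0]     # per-box max over the 8 corners
bbox_2d = torch.cat([minxy, maxxy], dim=1)  # [N, 4] boxes as (x1, y1, x2, y2)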
@@ -7,7 +7,7 @@ import pyquaternion
 from nuscenes.utils.data_classes import Box as NuScenesBox
 
 from mmdet.datasets import DATASETS
-from ..core.bbox import LiDARInstance3DBoxes, box_np_ops
+from ..core.bbox import LiDARInstance3DBoxes
 from .custom_3d import Custom3DDataset
@@ -72,8 +72,10 @@ class NuScenesDataset(Custom3DDataset):
                  classes=None,
                  load_interval=1,
                  with_velocity=True,
-                 test_mode=False,
                  modality=None,
+                 box_type_3d='LiDAR',
+                 filter_empty_gt=True,
+                 test_mode=False,
                  eval_version='detection_cvpr_2019'):
         self.load_interval = load_interval
         super().__init__(
@@ -82,6 +84,8 @@ class NuScenesDataset(Custom3DDataset):
             pipeline=pipeline,
             classes=classes,
             modality=modality,
+            box_type_3d=box_type_3d,
+            filter_empty_gt=filter_empty_gt,
             test_mode=test_mode)
         self.with_velocity = with_velocity
@@ -172,7 +176,7 @@ class NuScenesDataset(Custom3DDataset):
         gt_bboxes_3d = LiDARInstance3DBoxes(
             gt_bboxes_3d,
             box_dim=gt_bboxes_3d.shape[-1],
-            origin=[0.5, 0.5, 0.5])
+            origin=[0.5, 0.5, 0.5]).convert_to(self.box_mode_3d)
 
         anns_results = dict(
             gt_bboxes_3d=gt_bboxes_3d,
@@ -352,26 +356,28 @@
 def output_to_nusc_box(detection):
-    box3d = detection['boxes_3d'].numpy()
+    box3d = detection['boxes_3d']
     scores = detection['scores_3d'].numpy()
     labels = detection['labels_3d'].numpy()
+    box_gravity_center = box3d.gravity_center.numpy()
+    box_dims = box3d.dims.numpy()
+    box_yaw = box3d.yaw.numpy()
+    # TODO: check whether this is necessary
+    # with dir_offset & dir_limit in the head
-    box3d[:, 6] = -box3d[:, 6] - np.pi / 2
-    # the trained model is in [0.5, 0.5, 0],
-    # change them back to nuscenes [0.5, 0.5, 0.5]
-    box_np_ops.change_box3d_center_(box3d, [0.5, 0.5, 0], [0.5, 0.5, 0.5])
+    box_yaw = -box_yaw - np.pi / 2
     box_list = []
-    for i in range(box3d.shape[0]):
-        quat = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box3d[i, 6])
-        velocity = (*box3d[i, 7:9], 0.0)
+    for i in range(len(box3d)):
+        quat = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw[i])
+        velocity = (*box3d.tensor[i, 7:9], 0.0)
         # velo_val = np.linalg.norm(box3d[i, 7:9])
         # velo_ori = box3d[i, 6]
         # velocity = (
         #     velo_val * np.cos(velo_ori), velo_val * np.sin(velo_ori), 0.0)
         box = NuScenesBox(
-            box3d[i, :3],
-            box3d[i, 3:6],
+            box_gravity_center[i],
+            box_dims[i],
             quat,
             label=labels[i],
             score=scores[i],
......
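
output_to_nusc_box now reads the center, dims, and yaw through the box structure's properties instead of slicing a raw array, then builds one NuScenesBox per detection. A sketch of a single conversion with made-up numbers; NuScenesBox takes center, size, and a pyquaternion orientation, plus optional label/score/velocity:

import pyquaternion
from nuscenes.utils.data_classes import Box as NuScenesBox

yaw = 0.5  # fake yaw, after the -yaw - pi/2 adjustment above
quat = pyquaternion.Quaternion(axis=[0, 0, 1], radians=yaw)
box = NuScenesBox(
    [10.0, 2.0, 1.0],  # gravity center (x, y, z), fake values
    [1.9, 4.5, 1.7],   # dims, fake values
    quat,
    label=0,
    score=0.9,
    velocity=(0.1, 0.0, 0.0))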
@@ -73,14 +73,13 @@ class Collect3D(object):
     def __init__(self,
                  keys,
-                 pcd_shape=[1, 1600, 1408],
                  meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img',
                             'pad_shape', 'scale_factor', 'flip', 'pcd_flip',
-                            'img_norm_cfg', 'rect', 'Trv2c', 'P2', 'pcd_trans',
-                            'sample_idx', 'pcd_scale_factor', 'pcd_rotation')):
+                            'box_mode_3d', 'box_type_3d', 'img_norm_cfg',
+                            'rect', 'Trv2c', 'P2', 'pcd_trans', 'sample_idx',
+                            'pcd_scale_factor', 'pcd_rotation')):
         self.keys = keys
         self.meta_keys = meta_keys
-        self.pcd_shape = pcd_shape
 
     def __call__(self, results):
         data = {}
@@ -88,7 +87,7 @@ class Collect3D(object):
         for key in self.meta_keys:
             if key in results:
                 img_meta[key] = results[key]
-        img_meta.update(pcd_shape=self.pcd_shape, pcd_pad_shape=self.pcd_shape)
         data['img_meta'] = DC(img_meta, cpu_only=True)
         for key in self.keys:
             data[key] = results[key]
......
@@ -20,10 +20,18 @@ class ScanNetDataset(Custom3DDataset):
                  pipeline=None,
                  classes=None,
                  modality=None,
+                 box_type_3d='Depth',
                  filter_empty_gt=True,
                  test_mode=False):
-        super().__init__(data_root, ann_file, pipeline, classes, modality,
-                         filter_empty_gt, test_mode)
+        super().__init__(
+            data_root=data_root,
+            ann_file=ann_file,
+            pipeline=pipeline,
+            classes=classes,
+            modality=modality,
+            box_type_3d=box_type_3d,
+            filter_empty_gt=filter_empty_gt,
+            test_mode=test_mode)
 
     def get_ann_info(self, index):
         # Use index to get the annos, thus the evalhook could also use this api
......
@@ -16,10 +16,18 @@ class SUNRGBDDataset(Custom3DDataset):
                  pipeline=None,
                  classes=None,
                  modality=None,
+                 box_type_3d='Depth',
                  filter_empty_gt=True,
                  test_mode=False):
-        super().__init__(data_root, ann_file, pipeline, classes, modality,
-                         filter_empty_gt, test_mode)
+        super().__init__(
+            data_root=data_root,
+            ann_file=ann_file,
+            pipeline=pipeline,
+            classes=classes,
+            modality=modality,
+            box_type_3d=box_type_3d,
+            filter_empty_gt=filter_empty_gt,
+            test_mode=test_mode)
 
     def get_ann_info(self, index):
         # Use index to get the annos, thus the evalhook could also use this api
......
@@ -381,5 +381,5 @@ class Anchor3DHead(nn.Module, AnchorTrainMixin):
             bboxes[..., 6] = (
                 dir_rot + self.dir_offset +
                 np.pi * dir_scores.to(bboxes.dtype))
+        bboxes = input_meta['box_type_3d'](bboxes, box_dim=self.box_code_size)
         return bboxes, scores, labels
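
This one-line wrap is the pattern the commit applies across the heads: the raw (N, box_code_size) tensor leaves the head only after being encapsulated in the box class carried in the image meta. A minimal sketch of the same wrapping outside a head; box_dim=9 matches the code_size=9 nuScenes setting in the config above, and the meta dict is faked the same way the updated tests do:

import torch
from mmdet3d.core.bbox import LiDARInstance3DBoxes

input_meta = {'box_type_3d': LiDARInstance3DBoxes}
raw = torch.rand(6, 9)  # fake network output, (N, box_code_size)
boxes = input_meta['box_type_3d'](raw, box_dim=9)
assert boxes.tensor.shape == (6, 9)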
@@ -150,13 +150,15 @@ class PartA2RPNHead(Anchor3DHead):
         result = self.class_agnostic_nms(mlvl_bboxes, mlvl_bboxes_for_nms,
                                          mlvl_max_scores, mlvl_label_pred,
                                          mlvl_cls_score, mlvl_dir_scores,
-                                         score_thr, cfg.nms_post, cfg)
+                                         score_thr, cfg.nms_post, cfg,
+                                         input_meta)
         return result
 
     def class_agnostic_nms(self, mlvl_bboxes, mlvl_bboxes_for_nms,
                            mlvl_max_scores, mlvl_label_pred, mlvl_cls_score,
-                           mlvl_dir_scores, score_thr, max_num, cfg):
+                           mlvl_dir_scores, score_thr, max_num, cfg,
+                           input_meta):
         bboxes = []
         scores = []
         labels = []
@@ -202,6 +204,8 @@ class PartA2RPNHead(Anchor3DHead):
             labels = labels[inds]
             scores = scores[inds]
             cls_scores = cls_scores[inds]
+            bboxes = input_meta['box_type_3d'](
+                bboxes, box_dim=self.box_code_size)
             return dict(
                 boxes_3d=bboxes,
                 scores_3d=scores,
@@ -210,7 +214,9 @@ class PartA2RPNHead(Anchor3DHead):
             )
         else:
             return dict(
-                boxes_3d=mlvl_bboxes.new_zeros([0, self.box_code_size]),
+                boxes_3d=input_meta['box_type_3d'](
+                    mlvl_bboxes.new_zeros([0, self.box_code_size]),
+                    box_dim=self.box_code_size),
                 scores_3d=mlvl_bboxes.new_zeros([0]),
                 labels_3d=mlvl_bboxes.new_zeros([0]),
                 cls_preds=mlvl_bboxes.new_zeros([0, mlvl_cls_score.shape[-1]]))
@@ -474,7 +474,9 @@ class PartA2BboxHead(nn.Module):
             selected_scores = cur_cls_score[selected]
             result_list.append(
-                (selected_bboxes, selected_scores, selected_label_preds))
+                (img_meta[batch_id]['box_type_3d'](selected_bboxes,
+                                                   self.bbox_coder.code_size),
+                 selected_scores, selected_label_preds))
         return result_list
 
     def multi_class_nms(self,
......
@@ -112,7 +112,7 @@ class PartAggregationROIHead(Base3DRoIHead):
         semantic_results = self.semantic_head(feats_dict['seg_features'])
 
-        rois = bbox3d2roi([res['boxes_3d'] for res in proposal_list])
+        rois = bbox3d2roi([res['boxes_3d'].tensor for res in proposal_list])
         labels_3d = [res['labels_3d'] for res in proposal_list]
         cls_preds = [res['cls_preds'] for res in proposal_list]
         bbox_results = self._bbox_forward(feats_dict['seg_features'],
......
@@ -4,6 +4,8 @@ from os.path import dirname, exists, join
 import pytest
 import torch
 
+from mmdet3d.core.bbox import Box3DMode, LiDARInstance3DBoxes
+
 
 def _get_config_directory():
     """ Find the predefined detector config directory """
@@ -129,11 +131,16 @@ def test_anchor3d_head_getboxes():
     feats = list()
     feats.append(torch.rand([2, 512, 200, 176], dtype=torch.float32).cuda())
 
+    # fake input_metas
     input_metas = [{
-        'sample_idx': 1234
+        'sample_idx': 1234,
+        'box_type_3d': LiDARInstance3DBoxes,
+        'box_mode_3d': Box3DMode.LIDAR
     }, {
-        'sample_idx': 2345
-    }]  # fake input_metas
+        'sample_idx': 2345,
+        'box_type_3d': LiDARInstance3DBoxes,
+        'box_mode_3d': Box3DMode.LIDAR
+    }]
     (cls_score, bbox_pred, dir_cls_preds) = self.forward(feats)
 
     # test get_boxes
@@ -155,11 +162,16 @@ def test_parta2_rpnhead_getboxes():
     feats = list()
     feats.append(torch.rand([2, 512, 200, 176], dtype=torch.float32).cuda())
 
+    # fake input_metas
     input_metas = [{
-        'sample_idx': 1234
+        'sample_idx': 1234,
+        'box_type_3d': LiDARInstance3DBoxes,
+        'box_mode_3d': Box3DMode.LIDAR
     }, {
-        'sample_idx': 2345
-    }]  # fake input_metas
+        'sample_idx': 2345,
+        'box_type_3d': LiDARInstance3DBoxes,
+        'box_mode_3d': Box3DMode.LIDAR
+    }]
     (cls_score, bbox_pred, dir_cls_preds) = self.forward(feats)
 
     # test get_boxes
@@ -169,7 +181,7 @@ def test_parta2_rpnhead_getboxes():
     assert result_list[0]['scores_3d'].shape == torch.Size([512])
     assert result_list[0]['labels_3d'].shape == torch.Size([512])
     assert result_list[0]['cls_preds'].shape == torch.Size([512, 3])
-    assert result_list[0]['boxes_3d'].shape == torch.Size([512, 7])
+    assert result_list[0]['boxes_3d'].tensor.shape == torch.Size([512, 7])
 
 
 def test_vote_head():
......
@@ -3,6 +3,7 @@ import torch
 from mmcv import Config
 from torch.nn import BatchNorm1d, ReLU
 
+from mmdet3d.core.bbox import Box3DMode, LiDARInstance3DBoxes
 from mmdet3d.core.bbox.samplers import IoUNegPiecewiseSampler
 from mmdet3d.models import PartA2BboxHead
 from mmdet3d.ops import make_sparse_convmodule
@@ -336,8 +337,10 @@ def test_get_bboxes():
             use_raw_score=True,
             nms_thr=0.01,
             score_thr=0.1))
+    input_meta = dict(
+        box_type_3d=LiDARInstance3DBoxes, box_mode_3d=Box3DMode.LIDAR)
     result_list = self.get_bboxes(rois, cls_score, bbox_pred, class_labels,
-                                  class_pred, None, cfg)
+                                  class_pred, [input_meta], cfg)
     selected_bboxes, selected_scores, selected_label_preds = result_list[0]
     expected_selected_bboxes = torch.Tensor(
@@ -347,7 +350,8 @@ def test_get_bboxes():
     expected_selected_scores = torch.Tensor([-2.2061, -2.1121, -0.1761]).cuda()
     expected_selected_label_preds = torch.Tensor([2., 2., 2.]).cuda()
-    assert torch.allclose(selected_bboxes, expected_selected_bboxes, 1e-3)
+    assert torch.allclose(selected_bboxes.tensor, expected_selected_bboxes,
+                          1e-3)
     assert torch.allclose(selected_scores, expected_selected_scores, 1e-3)
     assert torch.allclose(selected_label_preds, expected_selected_label_preds)
......