Commit 36466f83 authored by liyinhao

Merge branch 'master' into process_raw_data

parents 25d39342 f93167c3
yapf -r -i --style .style.yapf mmdet3d/ configs/ tests/ tools/
isort -rc mmdet3d/ configs/ tests/ tools/
flake8 .
# model settings
voxel_size = [0.25, 0.25, 8]
point_cloud_range = [-50, -50, -5, 50, 50, 3]
class_names = [
'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
]
model = dict(
type='MVXFasterRCNNV2',
pts_voxel_layer=dict(
max_num_points=64, # max_points_per_voxel
point_cloud_range=point_cloud_range, # velodyne coordinates, x, y, z
voxel_size=voxel_size,
max_voxels=(30000, 40000), # (training, testing) max_voxels
),
pts_voxel_encoder=dict(
type='HardVFE',
in_channels=4,
feat_channels=[64, 64],
with_distance=False,
voxel_size=voxel_size,
with_cluster_center=True,
with_voxel_center=True,
point_cloud_range=point_cloud_range,
norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)),
pts_middle_encoder=dict(
type='PointPillarsScatter',
in_channels=64,
output_shape=[400, 400], # checked from PointCloud3D
),
pts_backbone=dict(
type='SECOND',
in_channels=64,
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
layer_nums=[3, 5, 5],
layer_strides=[2, 2, 2],
out_channels=[64, 128, 256],
),
pts_neck=dict(
type='FPN',
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
act_cfg=dict(type='ReLU'),
in_channels=[64, 128, 256],
out_channels=256,
start_level=0,
num_outs=3,
),
pts_bbox_head=dict(
type='Anchor3DHead',
num_classes=10,
in_channels=256,
feat_channels=256,
use_direction_classifier=True,
anchor_generator=dict(
type='AlignedAnchor3DRangeGenerator',
ranges=[[-50, -50, -1.8, 50, 50, -1.8]],
scales=[1, 2, 4],
sizes=[
[0.8660, 2.5981, 1.], # 1.5/sqrt(3)
[0.5774, 1.7321, 1.], # 1/sqrt(3)
[1., 1., 1.],
[0.4, 0.4, 1],
],
custom_values=[0, 0],
rotations=[0, 1.57],
reshape_out=True),
assigner_per_size=False,
diff_rad_by_sin=True,
dir_offset=0.7854, # pi/4
dir_limit_offset=0,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)))
# model training and testing settings
train_cfg = dict(
pts=dict(
assigner=dict( # for Car
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.6,
neg_iou_thr=0.3,
min_pos_iou=0.3,
ignore_iof_thr=-1),
allowed_border=0,
code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
pos_weight=-1,
debug=False))
test_cfg = dict(
pts=dict(
use_rotate_nms=True,
nms_across_levels=False,
nms_pre=1000,
nms_thr=0.2,
score_thr=0.05,
min_bbox_size=0,
max_num=500
# soft-nms is also supported for rcnn testing
# e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05)
))
# dataset settings
dataset_type = 'NuScenesDataset'
data_root = 'data/nuscenes/'
img_norm_cfg = dict(
mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
input_modality = dict(
use_lidar=True,
use_depth=False,
use_lidar_intensity=True,
use_camera=False,
)
db_sampler = dict(
data_root=data_root,
info_path=data_root + 'nuscenes_dbinfos_train.pkl',
rate=1.0,
object_rot_range=[0.0, 0.0],
prepare=dict(),
sample_groups=dict(
bus=4,
trailer=4,
truck=4,
))
file_client_args = dict(backend='disk')
# file_client_args = dict(
# backend='petrel',
# path_mapping=dict({
# './data/nuscenes/': 's3://nuscenes/nuscenes/',
# 'data/nuscenes/': 's3://nuscenes/nuscenes/'
# }))
train_pipeline = [
dict(
type='LoadPointsFromFile',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
file_client_args=file_client_args),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(
type='GlobalRotScale',
rot_uniform_noise=[-0.3925, 0.3925],
scaling_uniform_noise=[0.95, 1.05],
trans_normal_noise=[0, 0, 0]),
dict(type='RandomFlip3D', flip_ratio=0.5),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']),
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
file_client_args=file_client_args),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='RandomFlip3D', flip_ratio=0),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points']),
]
data = dict(
samples_per_gpu=4,
workers_per_gpu=4,
train=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_train.pkl',
pipeline=train_pipeline,
modality=input_modality,
classes=class_names,
test_mode=False),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_val.pkl',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_val.pkl',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True))
# optimizer
optimizer = dict(type='AdamW', lr=0.001, weight_decay=0.01)
# max_norm=10 is better for SECOND
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=1.0 / 1000,
step=[20, 23])
momentum_config = None
checkpoint_config = dict(interval=1)
# yapf:disable
evaluation = dict(interval=24)
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
# yapf:enable
# runtime settings
total_epochs = 24
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d'
load_from = None
resume_from = None
workflow = [('train', 1)]
# model settings
voxel_size = [0.25, 0.25, 8]
point_cloud_range = [-50, -50, -5, 50, 50, 3]
class_names = [
'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
]
model = dict(
type='MVXFasterRCNNV2',
pts_voxel_layer=dict(
max_num_points=64,
point_cloud_range=point_cloud_range,
voxel_size=voxel_size,
max_voxels=(30000, 40000)),
pts_voxel_encoder=dict(
type='HardVFE',
in_channels=4,
feat_channels=[64, 64],
with_distance=False,
voxel_size=voxel_size,
with_cluster_center=True,
with_voxel_center=True,
point_cloud_range=point_cloud_range,
norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)),
pts_middle_encoder=dict(
type='PointPillarsScatter', in_channels=64, output_shape=[400, 400]),
pts_backbone=dict(
type='SECOND',
in_channels=64,
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
layer_nums=[3, 5, 5],
layer_strides=[2, 2, 2],
out_channels=[64, 128, 256]),
pts_neck=dict(
type='FPN',
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
act_cfg=dict(type='ReLU'),
in_channels=[64, 128, 256],
out_channels=256,
start_level=0,
num_outs=3),
pts_bbox_head=dict(
type='Anchor3DHead',
num_classes=10,
in_channels=256,
feat_channels=256,
use_direction_classifier=True,
anchor_generator=dict(
type='AlignedAnchor3DRangeGenerator',
ranges=[[-50, -50, -1.8, 50, 50, -1.8]],
scales=[1, 2, 4],
sizes=[
[0.8660, 2.5981, 1.], # 1.5/sqrt(3)
[0.5774, 1.7321, 1.], # 1/sqrt(3)
[1., 1., 1.],
[0.4, 0.4, 1],
],
custom_values=[0, 0],
rotations=[0, 1.57],
reshape_out=True),
assigner_per_size=False,
diff_rad_by_sin=True,
dir_offset=0.7854, # pi/4
dir_limit_offset=0,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)))
# model training and testing settings
train_cfg = dict(
pts=dict(
assigner=dict( # for Car
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.6,
neg_iou_thr=0.3,
min_pos_iou=0.3,
ignore_iof_thr=-1),
allowed_border=0,
code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
pos_weight=-1,
debug=False))
test_cfg = dict(
pts=dict(
use_rotate_nms=True,
nms_across_levels=False,
nms_pre=1000,
nms_thr=0.2,
score_thr=0.05,
min_bbox_size=0,
max_num=500))
# dataset settings
dataset_type = 'NuScenesDataset'
data_root = 'data/nuscenes/'
img_norm_cfg = dict(
mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
input_modality = dict(
use_lidar=True,
use_depth=False,
use_lidar_intensity=True,
use_camera=False)
file_client_args = dict(backend='disk')
# file_client_args = dict(
# backend='petrel',
# path_mapping=dict({
# './data/nuscenes/': 's3://nuscenes/nuscenes/',
# 'data/nuscenes/': 's3://nuscenes/nuscenes/'
# }))
train_pipeline = [
dict(
type='LoadPointsFromFile',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
file_client_args=file_client_args),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(
type='GlobalRotScale',
rot_uniform_noise=[-0.3925, 0.3925],
scaling_uniform_noise=[0.95, 1.05],
trans_normal_noise=[0, 0, 0]),
dict(type='RandomFlip3D', flip_ratio=0.5),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
file_client_args=file_client_args),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='RandomFlip3D', flip_ratio=0),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
]
data = dict(
samples_per_gpu=4,
workers_per_gpu=4,
train=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_train.pkl',
pipeline=train_pipeline,
modality=input_modality,
classes=class_names,
test_mode=False),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_val.pkl',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_val.pkl',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True))
# optimizer
optimizer = dict(type='AdamW', lr=0.001, weight_decay=0.01)
# max_norm=10 is better for SECOND
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=1.0 / 1000,
step=[28, 34])
momentum_config = None
checkpoint_config = dict(interval=1)
# yapf:disable
evaluation = dict(interval=36)
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
# yapf:enable
# runtime settings
total_epochs = 36
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d'
load_from = None
resume_from = None
workflow = [('train', 1)]
# Designing Network Design Spaces
## Introduction
We implement RegNetX and RegNetY models as backbones for 3D detection and provide their first results on PointPillars.
The pre-trained models are converted from the [model zoo of pycls](https://github.com/facebookresearch/pycls/blob/master/MODEL_ZOO.md) and maintained in [mmcv](https://github.com/open-mmlab/mmcv).
```
@article{radosavovic2020designing,
title={Designing Network Design Spaces},
author={Ilija Radosavovic and Raj Prateek Kosaraju and Ross Girshick and Kaiming He and Piotr Dollár},
year={2020},
eprint={2003.13678},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
```
## Usage
To use a RegNet model, there are two steps:
1. Convert the model to a ResNet-style checkpoint supported by MMDetection
2. Modify the backbone and neck in the config accordingly
### Convert model
We already provide models with FLOPs ranging from 800M to 12G in our model zoo.
For more general usage, we also provide the script `regnet2mmdet.py` in the tools directory to convert the keys of models pre-trained by [pycls](https://github.com/facebookresearch/pycls/) to
ResNet-style checkpoints used in MMDetection.
```bash
python -u tools/regnet2mmdet.py ${PRETRAIN_PATH} ${STORE_PATH}
```
This script converts the model from `PRETRAIN_PATH` and stores the converted model in `STORE_PATH`.
### Modify config
Users can modify the `depth` of the backbone and the corresponding keys in `arch` in the config according to the entries in the [pycls model zoo](https://github.com/facebookresearch/pycls/blob/master/MODEL_ZOO.md).
The parameter `in_channels` in the FPN can be found in Figures 15 & 16 of the paper (`wi` in the legend).
This directory already provides some configs with their performance, using RegNetX from the 800MF to 12GF level.
For other pre-trained models or self-implemented RegNet models, users are responsible for checking these parameters themselves; a sketch of the relevant config fields is given below.
**Note**: Although Figures 15 & 16 also provide `w0`, `wa`, `wm`, `group_w`, and `bot_mul` for `arch`, they are quantized and thus inaccurate; using them sometimes produces a backbone that does not match the keys in the pre-trained model.
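As a concrete reference, the RegNetX-400MF configs in this directory override the backbone and neck roughly as follows (a sketch condensed from the full configs below; the neck's `in_channels` must match the backbone's output stage widths):

```python
model = dict(
    pretrained=dict(pts='open-mmlab://regnetx_400mf'),
    pts_backbone=dict(
        type='NoStemRegNet',
        # arch keys follow the pycls model zoo entry for RegNetX-400MF
        arch=dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22, bot_mul=1.0),
        out_indices=(1, 2, 3),
        strides=(1, 2, 2, 2),
        base_channels=64),
    pts_neck=dict(
        type='FPN',
        # stage widths (w_i) of RegNetX-400MF, cf. Figures 15 & 16 of the paper
        in_channels=[64, 160, 384],
        out_channels=256,
        num_outs=3))
```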
## Results
### PointPillars
| Backbone | Lr schd | Mem (GB) | Inf time (fps) | mAP |NDS| Download |
| :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: |
| [SECFPN](../) | 2x ||||||
|[RegNetX-400MF-SECFPN](./hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d.py)| 2x ||||||
| [FPN](../) | 2x ||||||
|[RegNetX-400MF-FPN](./hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d.py)| 2x ||||||
# model settings
voxel_size = [0.25, 0.25, 8]
point_cloud_range = [-50, -50, -5, 50, 50, 3]
class_names = [
'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
]
model = dict(
type='MVXFasterRCNNV2',
pretrained=dict(pts='open-mmlab://regnetx_400mf'),
pts_voxel_layer=dict(
max_num_points=64,
point_cloud_range=point_cloud_range,
voxel_size=voxel_size,
max_voxels=(30000, 40000)),
pts_voxel_encoder=dict(
type='HardVFE',
in_channels=4,
feat_channels=[64, 64],
with_distance=False,
voxel_size=voxel_size,
with_cluster_center=True,
with_voxel_center=True,
point_cloud_range=point_cloud_range,
norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)),
pts_middle_encoder=dict(
type='PointPillarsScatter', in_channels=64, output_shape=[400, 400]),
pts_backbone=dict(
type='NoStemRegNet',
arch=dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22, bot_mul=1.0),
out_indices=(1, 2, 3),
frozen_stages=-1,
strides=(1, 2, 2, 2),
base_channels=64,
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
norm_eval=False,
style='pytorch'),
pts_neck=dict(
type='FPN',
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
act_cfg=dict(type='ReLU'),
in_channels=[64, 160, 384],
out_channels=256,
start_level=0,
num_outs=3),
pts_bbox_head=dict(
type='Anchor3DHead',
num_classes=10,
in_channels=256,
feat_channels=256,
use_direction_classifier=True,
anchor_generator=dict(
type='AlignedAnchor3DRangeGenerator',
ranges=[[-50, -50, -1.8, 50, 50, -1.8]],
scales=[1, 2, 4],
sizes=[
[0.8660, 2.5981, 1.], # 1.5/sqrt(3)
[0.5774, 1.7321, 1.], # 1/sqrt(3)
[1., 1., 1.],
[0.4, 0.4, 1],
],
custom_values=[0, 0],
rotations=[0, 1.57],
reshape_out=True),
assigner_per_size=False,
diff_rad_by_sin=True,
dir_offset=0.7854, # pi/4
dir_limit_offset=0,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)))
# model training and testing settings
train_cfg = dict(
pts=dict(
assigner=dict(
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.6,
neg_iou_thr=0.3,
min_pos_iou=0.3,
ignore_iof_thr=-1),
allowed_border=0,
code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
pos_weight=-1,
debug=False))
test_cfg = dict(
pts=dict(
use_rotate_nms=True,
nms_across_levels=False,
nms_pre=1000,
nms_thr=0.2,
score_thr=0.05,
min_bbox_size=0,
max_num=500))
# dataset settings
dataset_type = 'NuScenesDataset'
data_root = 'data/nuscenes/'
input_modality = dict(
use_lidar=True,
use_depth=False,
use_lidar_intensity=True,
use_camera=False)
file_client_args = dict(backend='disk')
# file_client_args = dict(
# backend='petrel',
# path_mapping=dict({
# './data/nuscenes/': 's3://nuscenes/nuscenes/',
# 'data/nuscenes/': 's3://nuscenes/nuscenes/'
# }))
train_pipeline = [
dict(
type='LoadPointsFromFile',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
file_client_args=file_client_args),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(
type='GlobalRotScale',
rot_uniform_noise=[-0.3925, 0.3925],
scaling_uniform_noise=[0.95, 1.05],
trans_normal_noise=[0, 0, 0]),
dict(type='RandomFlip3D', flip_ratio=0.5),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
file_client_args=file_client_args),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='RandomFlip3D', flip_ratio=0),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
]
data = dict(
samples_per_gpu=4,
workers_per_gpu=4,
train=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_train.pkl',
pipeline=train_pipeline,
modality=input_modality,
classes=class_names,
test_mode=False),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_val.pkl',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_val.pkl',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True))
# optimizer
optimizer = dict(type='AdamW', lr=0.001, weight_decay=0.01)
# max_norm=10 is better for SECOND
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=1.0 / 1000,
step=[20, 23])
momentum_config = None
checkpoint_config = dict(interval=1)
# yapf:disable
evaluation = dict(interval=24)
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
# yapf:enable
# runtime settings
total_epochs = 24
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d'
load_from = None
resume_from = None
workflow = [('train', 1)]
# model settings
voxel_size = [0.25, 0.25, 8]
point_cloud_range = [-50, -50, -5, 50, 50, 3]
class_names = [
'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
]
model = dict(
type='MVXFasterRCNNV2',
pretrained=dict(pts='open-mmlab://regnetx_400mf'),
pts_voxel_layer=dict(
max_num_points=64,
point_cloud_range=point_cloud_range,
voxel_size=voxel_size,
max_voxels=(30000, 40000)),
pts_voxel_encoder=dict(
type='HardVFE',
in_channels=4,
feat_channels=[64, 64],
with_distance=False,
voxel_size=voxel_size,
with_cluster_center=True,
with_voxel_center=True,
point_cloud_range=point_cloud_range,
norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)),
pts_middle_encoder=dict(
type='PointPillarsScatter', in_channels=64, output_shape=[400, 400]),
pts_backbone=dict(
type='NoStemRegNet',
arch=dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22, bot_mul=1.0),
out_indices=(1, 2, 3),
frozen_stages=-1,
strides=(1, 2, 2, 2),
base_channels=64,
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
norm_eval=False,
style='pytorch'),
pts_neck=dict(
type='SECONDFPN',
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
in_channels=[64, 160, 384],
upsample_strides=[1, 2, 4],
out_channels=[128, 128, 128]),
pts_bbox_head=dict(
type='Anchor3DHead',
num_classes=10,
in_channels=384,
feat_channels=384,
use_direction_classifier=True,
anchor_generator=dict(
type='Anchor3DRangeGenerator',
ranges=[
[-49.6, -49.6, -1.80032795, 49.6, 49.6, -1.80032795],
[-49.6, -49.6, -1.74440365, 49.6, 49.6, -1.74440365],
[-49.6, -49.6, -1.68526504, 49.6, 49.6, -1.68526504],
[-49.6, -49.6, -1.67339111, 49.6, 49.6, -1.67339111],
[-49.6, -49.6, -1.61785072, 49.6, 49.6, -1.61785072],
[-49.6, -49.6, -1.80984986, 49.6, 49.6, -1.80984986],
[-49.6, -49.6, -1.763965, 49.6, 49.6, -1.763965],
],
sizes=[
[1.95017717, 4.60718145, 1.72270761], # car
[2.4560939, 6.73778078, 2.73004906], # truck
[2.87427237, 12.01320693, 3.81509561], # trailer
[0.60058911, 1.68452161, 1.27192197], # bicycle
[0.66344886, 0.7256437, 1.75748069], # pedestrian
[0.39694519, 0.40359262, 1.06232151], # traffic_cone
[2.49008838, 0.48578221, 0.98297065], # barrier
],
custom_values=[0, 0],
rotations=[0, 1.57],
reshape_out=True),
assigner_per_size=False,
diff_rad_by_sin=True,
dir_offset=0.7854, # pi/4
dir_limit_offset=0,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)))
# model training and testing settings
train_cfg = dict(
pts=dict(
assigner=dict( # for Car
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.6,
neg_iou_thr=0.3,
min_pos_iou=0.3,
ignore_iof_thr=-1),
allowed_border=0,
code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
pos_weight=-1,
debug=False))
test_cfg = dict(
pts=dict(
use_rotate_nms=True,
nms_across_levels=False,
nms_pre=1000,
nms_thr=0.2,
score_thr=0.05,
min_bbox_size=0,
max_num=500))
# dataset settings
dataset_type = 'NuScenesDataset'
data_root = 'data/nuscenes/'
input_modality = dict(
use_lidar=True,
use_depth=False,
use_lidar_intensity=True,
use_camera=False,
)
file_client_args = dict(backend='disk')
# file_client_args = dict(
# backend='petrel',
# path_mapping=dict({
# './data/nuscenes/': 's3://nuscenes/nuscenes/',
# 'data/nuscenes/': 's3://nuscenes/nuscenes/'
# }))
train_pipeline = [
dict(
type='LoadPointsFromFile',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
file_client_args=file_client_args),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(
type='GlobalRotScale',
rot_uniform_noise=[-0.3925, 0.3925],
scaling_uniform_noise=[0.95, 1.05],
trans_normal_noise=[0, 0, 0]),
dict(type='RandomFlip3D', flip_ratio=0.5),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']),
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
file_client_args=file_client_args),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='RandomFlip3D', flip_ratio=0),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points']),
]
data = dict(
samples_per_gpu=4,
workers_per_gpu=4,
train=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_train.pkl',
pipeline=train_pipeline,
modality=input_modality,
classes=class_names,
test_mode=False),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_val.pkl',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_val.pkl',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True))
# optimizer
optimizer = dict(type='AdamW', lr=0.001, weight_decay=0.01)
# max_norm=10 is better for SECOND
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=1.0 / 1000,
step=[20, 23])
momentum_config = None
checkpoint_config = dict(interval=1)
# yapf:disable
evaluation = dict(interval=24)
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
# yapf:enable
# runtime settings
total_epochs = 24
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d'
load_from = None
resume_from = None
workflow = [('train', 1)]
@@ -9,39 +9,57 @@ It is recommended to symlink the dataset root to `$MMDETECTION/data`.
If your folder structure is different, you may need to change the corresponding paths in config files.
```
mmdetection3d
├── mmdet3d
├── tools
├── configs
├── data
│   ├── nuscenes
│   │   ├── maps
│   │   ├── samples
│   │   ├── sweeps
│   │   ├── v1.0-test
│   │   ├── v1.0-trainval
│   ├── kitti
│   │   ├── ImageSets
│   │   ├── testing
│   │   │   ├── calib
│   │   │   ├── image_2
│   │   │   ├── velodyne
│   │   ├── training
│   │   │   ├── calib
│   │   │   ├── image_2
│   │   │   ├── label_2
│   │   │   ├── velodyne
│   ├── scannet
│   │   ├── meta_data
│   │   ├── scans
│   │   ├── batch_load_scannet_data.py
│   │   ├── load_scannet_data.py
│   │   ├── scannet_utils.py
│   │   ├── README.md
│   ├── sunrgbd
│   │   ├── OFFICIAL_SUNRGBD
│   │   ├── matlab
│   │   ├── sunrgbd_data.py
│   │   ├── sunrgbd_utils.py
│   │   ├── README.md
```

Download nuScenes V1.0 full dataset data [HERE](https://www.nuscenes.org/download). Prepare nuscenes data by running

```bash
python tools/create_data.py nuscenes --root-path ./data/nuscenes --out-dir ./data/nuscenes --extra-tag nuscenes
```

Download KITTI 3D detection data [HERE](http://www.cvlibs.net/datasets/kitti/eval_object.php?obj_benchmark=3d). Prepare kitti data by running

```bash
python tools/create_data.py kitti --root-path ./data/kitti --out-dir ./data/kitti --extra-tag kitti
```

To prepare scannet data, please see [scannet](../data/scannet/README.md).

To prepare sunrgbd data, please see [sunrgbd](../data/sunrgbd/README.md).

For using custom datasets, please refer to [Tutorials 2: Adding New Dataset](tutorials/new_dataset.md).
@@ -148,7 +148,7 @@ def center_to_corner_box3d(centers,
        dims (float array, shape=[N, 3]): dimensions in kitti label file.
        angles (float array, shape=[N]): rotation_y in kitti label file.
        origin (list or array or float): origin point relate to smallest point.
            use (0.5, 1.0, 0.5) in camera and (0.5, 0.5, 0) in lidar.
        axis (int): rotation axis. 1 for camera and 2 for lidar.

    Returns:
        [type]: [description]
@@ -74,7 +74,7 @@ def rotation_3d_in_axis(points, angles, axis=0):
def center_to_corner_box3d(centers,
                           dims,
                           angles,
                           origin=(0.5, 1.0, 0.5),
                           axis=1):
    """convert kitti locations, dimensions and angles to corners
@@ -83,7 +83,7 @@ def center_to_corner_box3d(centers,
        dims (float array, shape=[N, 3]): dimensions in kitti label file.
        angles (float array, shape=[N]): rotation_y in kitti label file.
        origin (list or array or float): origin point relate to smallest point.
            use (0.5, 1.0, 0.5) in camera and (0.5, 0.5, 0) in lidar.
        axis (int): rotation axis. 1 for camera and 2 for lidar.

    Returns:
        [type]: [description]
@@ -28,28 +28,28 @@ class PartialBinBasedBBoxCoder(BaseBBoxCoder):
        """Encode ground truth to prediction targets.

        Args:
            gt_bboxes_3d (BaseInstance3DBoxes): gt bboxes with shape (n, 7).
            gt_labels_3d (Tensor): gt classes.

        Returns:
            tuple: Targets of center, size and direction.
        """
        # generate center target
        center_target = gt_bboxes_3d.gravity_center

        # generate bbox size target
        size_class_target = gt_labels_3d
        size_res_target = gt_bboxes_3d.dims - gt_bboxes_3d.tensor.new_tensor(
            self.mean_sizes)[size_class_target]

        # generate dir target
        box_num = gt_labels_3d.shape[0]
        if self.with_rot:
            (dir_class_target,
             dir_res_target) = self.angle2class(gt_bboxes_3d.yaw)
        else:
            dir_class_target = gt_labels_3d.new_zeros(box_num)
            dir_res_target = gt_bboxes_3d.tensor.new_zeros(box_num)

        return (center_target, size_class_target, size_res_target,
                dir_class_target, dir_res_target)
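The size residual above is simply `dims - mean_sizes[label]`; a minimal pure-PyTorch sketch of that step (toy mean sizes and labels, not the mmdet3d coder class itself):

```python
import torch

# toy per-class mean sizes (w, l, h); values are illustrative only
mean_sizes = torch.tensor([[1.9, 4.6, 1.7],   # car
                           [0.6, 0.8, 1.7]])  # pedestrian
gt_dims = torch.tensor([[2.0, 4.8, 1.8],
                        [0.7, 0.9, 1.8]])
gt_labels = torch.tensor([0, 1])

# same computation as size_res_target in encode():
# residual between each gt size and the mean size of its class
size_res_target = gt_dims - mean_sizes[gt_labels]
print(size_res_target)
```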
@@ -83,8 +83,6 @@ def bbox_overlaps_3d(bboxes1, bboxes2, mode='iou', coordinate='camera'):
    Return:
        iou: (M, N) not support aligned mode currently
    """
    assert bboxes1.size(-1) == bboxes2.size(-1) == 7
    assert coordinate in ['camera', 'lidar']
@@ -12,7 +12,7 @@ class BaseInstance3DBoxes(object):
    Note:
        The box is bottom centered, i.e. the relative position of origin in
        the box is (0.5, 0.5, 0).

    Args:
        tensor (torch.Tensor | np.ndarray | list): a Nxbox_dim matrix.
@@ -23,11 +23,11 @@ class BaseInstance3DBoxes(object):
            If False, the value of yaw will be set to 0 as minmax boxes.
            Default to True.
        origin (tuple): The relative position of origin in the box.
            Default to (0.5, 0.5, 0). This will guide the box be converted to
            (0.5, 0.5, 0) mode.
    """

    def __init__(self, tensor, box_dim=7, with_yaw=True, origin=(0.5, 0.5, 0)):
        if isinstance(tensor, torch.Tensor):
            device = tensor.device
        else:
@@ -40,18 +40,21 @@ class BaseInstance3DBoxes(object):
                dtype=torch.float32, device=device)
        assert tensor.dim() == 2 and tensor.size(-1) == box_dim, tensor.size()

        if tensor.shape[-1] == 6:
            # If the dimension of boxes is 6, we expand box_dim by padding
            # 0 as a fake yaw and set with_yaw to False.
            assert box_dim == 6
            fake_rot = tensor.new_zeros(tensor.shape[0], 1)
            tensor = torch.cat((tensor, fake_rot), dim=-1)
            self.box_dim = box_dim + 1
            self.with_yaw = False
        else:
            self.box_dim = box_dim
            self.with_yaw = with_yaw
        self.tensor = tensor

        if origin != (0.5, 0.5, 0):
            dst = self.tensor.new_tensor((0.5, 0.5, 0))
            src = self.tensor.new_tensor(origin)
            self.tensor[:, :3] += self.tensor[:, 3:6] * (dst - src)
@@ -121,7 +124,7 @@ class BaseInstance3DBoxes(object):
        The relative position of the centers in different kinds of
        boxes are different, e.g., the relative center of a boxes is
        (0.5, 1.0, 0.5) in camera and (0.5, 0.5, 0) in lidar.
        It is recommended to use `bottom_center` or `gravity_center`
        for more clear usage.
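The `origin` handling above only shifts box centers by `dims * (dst - src)`; a small self-contained check of that re-centering in plain PyTorch (not the mmdet3d class itself), assuming the input boxes are given with their gravity center:

```python
import torch

# one box: x, y, z, w, l, h, yaw, with (x, y, z) at the gravity center,
# i.e. relative origin (0.5, 0.5, 0.5)
boxes = torch.tensor([[0.0, 0.0, 1.0, 2.0, 4.0, 2.0, 0.0]])
src = boxes.new_tensor((0.5, 0.5, 0.5))   # origin of the input boxes
dst = boxes.new_tensor((0.5, 0.5, 0.0))   # target origin: bottom center

# same update as in BaseInstance3DBoxes.__init__
boxes[:, :3] += boxes[:, 3:6] * (dst - src)
print(boxes[0, :3])  # z drops from 1.0 to 0.0, i.e. by half the height
```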
@@ -22,7 +22,7 @@ class Box3DMode(IntEnum):
            | /
    left y <------ 0

    The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0),
    and the yaw is around the z axis, thus the rotation axis=2.

    Coordinates in camera:
@@ -49,7 +49,7 @@ class Box3DMode(IntEnum):
          | /
          0 ------> x right

    The relative coordinate of bottom center in a DEPTH box is (0.5, 0.5, 0),
    and the yaw is around the z axis, thus the rotation axis=2.
    """
@@ -20,7 +20,7 @@ class CameraInstance3DBoxes(BaseInstance3DBoxes):
            v
       down y

    The relative coordinate of bottom center in a CAM box is (0.5, 1.0, 0.5),
    and the yaw is around the y axis, thus the rotation axis=1.
    The yaw is 0 at the positive direction of x axis, and increases from
    the positive direction of x to the positive direction of z.
import numpy as np
import torch

from mmdet3d.ops import points_in_boxes_batch
from .base_box3d import BaseInstance3DBoxes
from .utils import limit_period, rotation_3d_in_axis
@@ -17,7 +18,7 @@ class DepthInstance3DBoxes(BaseInstance3DBoxes):
        | /
        0 ------> x right (yaw=0)

    The relative coordinate of bottom center in a Depth box is (0.5, 0.5, 0),
    and the yaw is around the z axis, thus the rotation axis=2.
    The yaw is 0 at the positive direction of x axis, and increases from
    the positive direction of x to the positive direction of y.
@@ -74,7 +75,7 @@ class DepthInstance3DBoxes(BaseInstance3DBoxes):
            device=dims.device, dtype=dims.dtype)
        corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]]

        # use relative origin (0.5, 0.5, 0)
        corners_norm = corners_norm - dims.new_tensor([0.5, 0.5, 0])
        corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3])
@@ -201,3 +202,30 @@ class DepthInstance3DBoxes(BaseInstance3DBoxes):
        from .box_3d_mode import Box3DMode
        return Box3DMode.convert(
            box=self, src=Box3DMode.DEPTH, dst=dst, rt_mat=rt_mat)

    def points_in_boxes(self, points):
        """Find points that are in boxes (CUDA)

        Args:
            points (torch.Tensor): [1, M, 3] or [M, 3], [x, y, z]
                in LiDAR coordinate.

        Returns:
            torch.Tensor: The box index of each point in, shape is (B, M, T).
        """
        from .box_3d_mode import Box3DMode

        # to lidar
        points_lidar = points.clone()
        points_lidar = points_lidar[..., [1, 0, 2]]
        points_lidar[..., 1] *= -1
        if points.dim() == 2:
            points_lidar = points_lidar.unsqueeze(0)
        else:
            assert points.dim() == 3 and points_lidar.shape[0] == 1

        boxes_lidar = self.convert_to(Box3DMode.LIDAR).tensor
        boxes_lidar = boxes_lidar.to(points.device).unsqueeze(0)
        box_idxs_of_pts = points_in_boxes_batch(points_lidar, boxes_lidar)
        return box_idxs_of_pts.squeeze(0)
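The depth-to-LiDAR point conversion in `points_in_boxes` above is just an axis swap plus a sign flip; a tiny standalone illustration in plain PyTorch with a toy point:

```python
import torch

# a point in depth coordinates: x right, y forward, z up
points_depth = torch.tensor([[1.0, 2.0, 0.5]])

# same re-ordering as in DepthInstance3DBoxes.points_in_boxes:
# lidar x <- depth y, lidar y <- -depth x, z unchanged
points_lidar = points_depth[..., [1, 0, 2]].clone()
points_lidar[..., 1] *= -1
print(points_lidar)  # lidar point: [2.0, -1.0, 0.5]
```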
@@ -18,7 +18,7 @@ class LiDARInstance3DBoxes(BaseInstance3DBoxes):
                 | /
    (yaw=pi) left y <------ 0

    The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0),
    and the yaw is around the z axis, thus the rotation axis=2.
    The yaw is 0 at the negative direction of y axis, and increases from
    the negative direction of y to the positive direction of x.
@@ -83,7 +83,9 @@ def bbox3d2result(bboxes, scores, labels):
        dict(Tensor): bbox results in cpu mode
    """
    return dict(
        boxes_3d=bboxes.to('cpu'),
        scores_3d=scores.cpu(),
        labels_3d=labels.cpu())


def upright_depth_to_lidar_torch(points=None,
@@ -3,47 +3,6 @@ import torch
from mmcv.utils import print_log
from terminaltables import AsciiTable


def average_precision(recalls, precisions, mode='area'):
    """Calculate average precision (for single or multiple scales).
@@ -61,7 +20,10 @@ def average_precision(recalls, precisions, mode='area'):
    if recalls.ndim == 1:
        recalls = recalls[np.newaxis, :]
        precisions = precisions[np.newaxis, :]

    assert recalls.shape == precisions.shape
    assert recalls.ndim == 2

    num_scales = recalls.shape[0]
    ap = np.zeros(num_scales, dtype=np.float32)
    if mode == 'area':
@@ -103,40 +65,42 @@ def eval_det_cls(pred, gt, iou_thr=None):
        float: scalar, average precision.
    """
    # {img_id: {'bbox': box structure, 'det': matched list}}
    class_recs = {}
    npos = 0
    for img_id in gt.keys():
        cur_gt_num = len(gt[img_id])
        if cur_gt_num != 0:
            gt_cur = torch.zeros([cur_gt_num, 7], dtype=torch.float32)
            for i in range(cur_gt_num):
                gt_cur[i] = gt[img_id][i].tensor
            bbox = gt[img_id][0].new_box(gt_cur)
        else:
            bbox = gt[img_id]
        det = [[False] * len(bbox) for i in iou_thr]
        npos += len(bbox)
        class_recs[img_id] = {'bbox': bbox, 'det': det}

    # construct dets
    image_ids = []
    confidence = []
    ious = []
    for img_id in pred.keys():
        cur_num = len(pred[img_id])
        if cur_num == 0:
            continue
        pred_cur = torch.zeros((cur_num, 7), dtype=torch.float32)
        box_idx = 0
        for box, score in pred[img_id]:
            image_ids.append(img_id)
            confidence.append(score)
            pred_cur[box_idx] = box.tensor
            box_idx += 1
        pred_cur = box.new_box(pred_cur)
        gt_cur = class_recs[img_id]['bbox']
        if len(gt_cur) > 0:
            # calculate iou in each image
            iou_cur = pred_cur.overlaps(pred_cur, gt_cur)
            for i in range(cur_num):
                ious.append(iou_cur[i])
        else:
@@ -157,12 +121,12 @@ def eval_det_cls(pred, gt, iou_thr=None):
    for d in range(nd):
        R = class_recs[image_ids[d]]
        iou_max = -np.inf
        BBGT = R['bbox']
        cur_iou = ious[d]

        if len(BBGT) > 0:
            # compute overlaps
            for j in range(len(BBGT)):
                # iou = get_iou_main(get_iou_func, (bb, BBGT[j,...]))
                iou = cur_iou[j]
                if iou > iou_max:
@@ -194,61 +158,22 @@ def eval_det_cls(pred, gt, iou_thr=None):
    return ret


def eval_map_recall(pred, gt, ovthresh=None):
    """Evaluate mAP and recall.

    Generic functions to compute precision/recall for object detection
    for multiple classes.

    Args:
        pred (dict): Information of detection results,
            which maps class_id and predictions.
        gt (dict): information of gt results, which maps class_id and gt.
        ovthresh (list[float]): iou threshold.
            Default: None.

    Return:
        tuple[dict]: dict results of recall, AP, and precision for all classes.
    """

    ret_values = []
    for classname in gt.keys():
@@ -272,14 +197,24 @@ def eval_map_recall(det_infos, gt_infos, ovthresh=None):
    return recall, precision, ap


def indoor_eval(gt_annos,
                dt_annos,
                metric,
                label2cat,
                logger=None,
                box_type_3d=None,
                box_mode_3d=None):
    """Scannet Evaluation.

    Evaluate the result of the detection.

    Args:
        gt_annos (list[dict]): GT annotations.
        dt_annos (list[dict]): Detection annotations. the dict
            includes the following keys
            - labels_3d (Tensor): Labels of boxes.
            - boxes_3d (BaseInstance3DBoxes): 3d bboxes in Depth coordinate.
            - scores_3d (Tensor): Scores of boxes.
        metric (list[float]): AP IoU thresholds.
        label2cat (dict): {label: cat}.
        logger (logging.Logger | str | None): The way to print the mAP
@@ -288,24 +223,48 @@ def indoor_eval(gt_annos, dt_annos, metric, label2cat, logger=None):
    Return:
        dict: Dict of results.
    """
    assert len(dt_annos) == len(gt_annos)
    pred = {}  # map {class_id: pred}
    gt = {}  # map {class_id: gt}
    for img_id in range(len(dt_annos)):
        # parse detected annotations
        det_anno = dt_annos[img_id]
        for i in range(len(det_anno['labels_3d'])):
            label = det_anno['labels_3d'].numpy()[i]
            bbox = det_anno['boxes_3d'].convert_to(box_mode_3d)[i]
            score = det_anno['scores_3d'].numpy()[i]
            if label not in pred:
                pred[int(label)] = {}
            if img_id not in pred[label]:
                pred[int(label)][img_id] = []
            if label not in gt:
                gt[int(label)] = {}
            if img_id not in gt[label]:
                gt[int(label)][img_id] = []
            pred[int(label)][img_id].append((bbox, score))

        # parse gt annotations
        gt_anno = gt_annos[img_id]
        if gt_anno['gt_num'] != 0:
            gt_boxes = box_type_3d(
                gt_anno['gt_boxes_upright_depth'],
                box_dim=gt_anno['gt_boxes_upright_depth'].shape[-1],
                origin=(0.5, 0.5, 0.5)).convert_to(box_mode_3d)
            labels_3d = gt_anno['class']
        else:
            gt_boxes = box_type_3d(np.array([], dtype=np.float32))
            labels_3d = np.array([], dtype=np.int64)

        for i in range(len(labels_3d)):
            label = labels_3d[i]
            bbox = gt_boxes[i]
            if label not in gt:
                gt[label] = {}
            if img_id not in gt[label]:
                gt[label][img_id] = []
            gt[label][img_id].append(bbox)

    rec, prec, ap = eval_map_recall(pred, gt, metric)
    ret_dict = dict()
    header = ['classes']
    table_columns = [[label2cat[label]
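The refactored `indoor_eval` above first groups boxes into a `{class_id: {img_id: [...]}}` mapping before calling `eval_map_recall`; a toy standalone sketch of that grouping step (plain Python, made-up labels and boxes, using `setdefault` as a shorthand for the explicit membership checks in the real code):

```python
# toy flat predictions for one image: (label, box, score)
detections = [(0, 'box_a', 0.9), (1, 'box_b', 0.7), (0, 'box_c', 0.4)]
img_id = 0

pred = {}  # {class_id: {img_id: [(box, score), ...]}}
for label, box, score in detections:
    # create the nested dicts lazily, equivalent to indoor_eval's if-not-in checks
    pred.setdefault(int(label), {}).setdefault(img_id, []).append((box, score))

print(pred)  # {0: {0: [('box_a', 0.9), ('box_c', 0.4)]}, 1: {0: [('box_b', 0.7)]}}
```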
@@ -6,11 +6,39 @@ import numpy as np
from torch.utils.data import Dataset

from mmdet.datasets import DATASETS
from ..core.bbox import (Box3DMode, CameraInstance3DBoxes,
                         DepthInstance3DBoxes, LiDARInstance3DBoxes)
from .pipelines import Compose


@DATASETS.register_module()
class Custom3DDataset(Dataset):
    """Customized 3D dataset

    This is the base dataset of SUNRGB-D, ScanNet, nuScenes, and KITTI
    dataset.

    Args:
        data_root (str): Path of dataset root.
        ann_file (str): Path of annotation file.
        pipeline (list[dict], optional): Pipeline used for data processing.
            Defaults to None.
        classes (tuple[str], optional): Classes used in the dataset.
            Defaults to None.
        modality ([dict], optional): Modality to specify the sensor data used
            as input. Defaults to None.
        box_type_3d (str, optional): Type of 3D box of this dataset.
            Based on the `box_type_3d`, the dataset will encapsulate the box
            to its original format then converted them to `box_type_3d`.
            Defaults to 'LiDAR'. Available options includes
            - 'LiDAR': box in LiDAR coordinates
            - 'Depth': box in depth coordinates, usually for indoor dataset
            - 'Camera': box in camera coordinates
        filter_empty_gt (bool, optional): Whether to filter empty GT.
            Defaults to True.
        test_mode (bool, optional): Whether the dataset is in test mode.
            Defaults to False.
    """

    def __init__(self,
                 data_root,
@@ -18,6 +46,7 @@ class Custom3DDataset(Dataset):
                 pipeline=None,
                 classes=None,
                 modality=None,
                 box_type_3d='LiDAR',
                 filter_empty_gt=True,
                 test_mode=False):
        super().__init__()
@@ -26,6 +55,7 @@ class Custom3DDataset(Dataset):
        self.test_mode = test_mode
        self.modality = modality
        self.filter_empty_gt = filter_empty_gt
        self.get_box_type(box_type_3d)

        self.CLASSES = self.get_classes(classes)
        self.data_infos = self.load_annotations(self.ann_file)
@@ -40,6 +70,21 @@ class Custom3DDataset(Dataset):
    def load_annotations(self, ann_file):
        return mmcv.load(ann_file)

    def get_box_type(self, box_type):
        box_type_lower = box_type.lower()
        if box_type_lower == 'lidar':
            self.box_type_3d = LiDARInstance3DBoxes
            self.box_mode_3d = Box3DMode.LIDAR
        elif box_type_lower == 'camera':
            self.box_type_3d = CameraInstance3DBoxes
            self.box_mode_3d = Box3DMode.CAM
        elif box_type_lower == 'depth':
            self.box_type_3d = DepthInstance3DBoxes
            self.box_mode_3d = Box3DMode.DEPTH
        else:
            raise ValueError('Only "box_type" of "camera", "lidar", "depth"'
                             f' are supported, got {box_type}')

    def get_data_info(self, index):
        info = self.data_infos[index]
        sample_idx = info['point_cloud']['lidar_idx']
@@ -61,6 +106,8 @@ class Custom3DDataset(Dataset):
        results['bbox3d_fields'] = []
        results['pts_mask_fields'] = []
        results['pts_seg_fields'] = []
        results['box_type_3d'] = self.box_type_3d
        results['box_mode_3d'] = self.box_mode_3d

    def prepare_train_data(self, index):
        input_dict = self.get_data_info(index)
@@ -139,7 +186,13 @@ class Custom3DDataset(Dataset):
        gt_annos = [info['annos'] for info in self.data_infos]
        label2cat = {i: cat_id for i, cat_id in enumerate(self.CLASSES)}
        ret_dict = indoor_eval(
            gt_annos,
            results,
            iou_thr,
            label2cat,
            logger=logger,
            box_type_3d=self.box_type_3d,
            box_mode_3d=self.box_mode_3d)
        return ret_dict
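With the new `box_type_3d` argument above, a dataset config can pick the box representation per dataset; a minimal hypothetical snippet (the dataset type, paths, and info file name are illustrative placeholders):

```python
# hypothetical dataset entry selecting depth-coordinate boxes,
# e.g. for an indoor dataset such as ScanNet
data = dict(
    train=dict(
        type='ScanNetDataset',
        data_root='data/scannet/',
        ann_file='data/scannet/scannet_infos_train.pkl',
        box_type_3d='Depth',  # -> DepthInstance3DBoxes / Box3DMode.DEPTH
        filter_empty_gt=True,
        test_mode=False))
```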
@@ -9,7 +9,7 @@ import torch
from mmcv.utils import print_log

from mmdet.datasets import DATASETS
from ..core.bbox import Box3DMode, CameraInstance3DBoxes
from .custom_3d import Custom3DDataset
from .utils import remove_dontcare
@@ -27,6 +27,8 @@ class KittiDataset(Custom3DDataset):
                 pipeline=None,
                 classes=None,
                 modality=None,
                 box_type_3d='LiDAR',
                 filter_empty_gt=True,
                 test_mode=False):
        super().__init__(
            data_root=data_root,
@@ -34,6 +36,8 @@ class KittiDataset(Custom3DDataset):
            pipeline=pipeline,
            classes=classes,
            modality=modality,
            box_type_3d=box_type_3d,
            filter_empty_gt=filter_empty_gt,
            test_mode=test_mode)

        self.root_split = os.path.join(self.data_root, split)
@@ -90,7 +94,7 @@ class KittiDataset(Custom3DDataset):
        # convert gt_bboxes_3d to velodyne coordinates
        gt_bboxes_3d = CameraInstance3DBoxes(gt_bboxes_3d).convert_to(
            self.box_mode_3d, np.linalg.inv(rect @ Trv2c))
        gt_bboxes = annos['bbox']

        selected = self.drop_arrays_by_name(gt_names, ['DontCare'])
@@ -395,73 +399,66 @@ class KittiDataset(Custom3DDataset):
    def convert_valid_bboxes(self, box_dict, info):
        # TODO: refactor this function
        box_preds = box_dict['boxes_3d']
        scores = box_dict['scores_3d']
        labels = box_dict['labels_3d']
        sample_idx = info['image']['image_idx']
        # TODO: remove the hack of yaw
        box_preds.tensor[:, -1] = box_preds.tensor[:, -1] - np.pi
        box_preds.limit_yaw(offset=0.5, period=np.pi * 2)

        if len(box_preds) == 0:
            return dict(
                bbox=np.zeros([0, 4]),
                box3d_camera=np.zeros([0, 7]),
                box3d_lidar=np.zeros([0, 7]),
                scores=np.zeros([0]),
                label_preds=np.zeros([0, 4]),
                sample_idx=sample_idx)

        from mmdet3d.core.bbox import box_torch_ops
        rect = info['calib']['R0_rect'].astype(np.float32)
        Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32)
        P2 = info['calib']['P2'].astype(np.float32)
        img_shape = info['image']['image_shape']
        P2 = box_preds.tensor.new_tensor(P2)

        box_preds_camera = box_preds.convert_to(Box3DMode.CAM, rect @ Trv2c)

        box_corners = box_preds_camera.corners
        box_corners_in_image = box_torch_ops.project_to_image(box_corners, P2)
        # box_corners_in_image: [N, 8, 2]
        minxy = torch.min(box_corners_in_image, dim=1)[0]
        maxxy = torch.max(box_corners_in_image, dim=1)[0]
        box_2d_preds = torch.cat([minxy, maxxy], dim=1)
        # Post-processing
        # check box_preds_camera
        image_shape = box_preds.tensor.new_tensor(img_shape)
        valid_cam_inds = ((box_preds_camera.tensor[:, 0] < image_shape[1]) &
                          (box_preds_camera.tensor[:, 1] < image_shape[0]) &
                          (box_preds_camera.tensor[:, 2] > 0) &
                          (box_preds_camera.tensor[:, 3] > 0))
        # check box_preds
        limit_range = box_preds.tensor.new_tensor(self.pcd_limit_range)
        valid_pcd_inds = ((box_preds.center > limit_range[:3]) &
                          (box_preds.center < limit_range[3:]))
        valid_inds = valid_cam_inds & valid_pcd_inds.all(-1)

        if valid_inds.sum() > 0:
            return dict(
                bbox=box_2d_preds[valid_inds, :].numpy(),
                box3d_camera=box_preds_camera[valid_inds].tensor.numpy(),
                box3d_lidar=box_preds[valid_inds].tensor.numpy(),
                scores=scores[valid_inds].numpy(),
                label_preds=labels[valid_inds].numpy(),
                sample_idx=sample_idx,
            )
        else:
            return dict(
                bbox=np.zeros([0, 4]),
                box3d_camera=np.zeros([0, 7]),
                box3d_lidar=np.zeros([0, 7]),
                scores=np.zeros([0]),
                label_preds=np.zeros([0, 4]),
                sample_idx=sample_idx,
            )
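The 2D boxes returned by `convert_valid_bboxes` are just the axis-aligned extent of the projected 3D corners; a minimal pure-PyTorch illustration of that reduction with toy corner values:

```python
import torch

# projected box corners in the image plane: [N, 8, 2] (toy values)
box_corners_in_image = torch.tensor([[[10., 20.], [30., 22.], [28., 40.], [12., 38.],
                                      [11., 21.], [31., 23.], [29., 41.], [13., 39.]]])

# same reduction as in convert_valid_bboxes: axis-aligned 2D box per 3D box
minxy = torch.min(box_corners_in_image, dim=1)[0]
maxxy = torch.max(box_corners_in_image, dim=1)[0]
box_2d_preds = torch.cat([minxy, maxxy], dim=1)
print(box_2d_preds)  # [[10., 20., 31., 41.]]
```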