Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
raojy
mmdetection3d_rjy
Commits
7aa442d5
Commit
7aa442d5
authored
Apr 01, 2026
by
raojy
Browse files
raw_mmdetection
parent
9c03eaa8
Changes
465
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
2417 additions
and
0 deletions
+2417
-0
mmdetection3d/mmdet3d/configs/_base_/schedules/mmdet_schedule_1x.py
...n3d/mmdet3d/configs/_base_/schedules/mmdet_schedule_1x.py
+33
-0
mmdetection3d/mmdet3d/configs/_base_/schedules/schedule_2x.py
...tection3d/mmdet3d/configs/_base_/schedules/schedule_2x.py
+42
-0
mmdetection3d/mmdet3d/configs/_base_/schedules/schedule_3x.py
...tection3d/mmdet3d/configs/_base_/schedules/schedule_3x.py
+37
-0
mmdetection3d/mmdet3d/configs/_base_/schedules/seg_cosine_100e.py
...ion3d/mmdet3d/configs/_base_/schedules/seg_cosine_100e.py
+32
-0
mmdetection3d/mmdet3d/configs/_base_/schedules/seg_cosine_150e.py
...ion3d/mmdet3d/configs/_base_/schedules/seg_cosine_150e.py
+32
-0
mmdetection3d/mmdet3d/configs/_base_/schedules/seg_cosine_200e.py
...ion3d/mmdet3d/configs/_base_/schedules/seg_cosine_200e.py
+32
-0
mmdetection3d/mmdet3d/configs/_base_/schedules/seg_cosine_50e.py
...tion3d/mmdet3d/configs/_base_/schedules/seg_cosine_50e.py
+32
-0
mmdetection3d/mmdet3d/configs/centerpoint/centerpoint_pillar02_second_secfpn_8xb4_cyclic_20e_nus_3d.py
...terpoint_pillar02_second_secfpn_8xb4_cyclic_20e_nus_3d.py
+181
-0
mmdetection3d/mmdet3d/configs/centerpoint/centerpoint_voxel01_second_secfpn_8xb4_cyclic_20e_nus_3d.py
...nterpoint_voxel01_second_secfpn_8xb4_cyclic_20e_nus_3d.py
+181
-0
mmdetection3d/mmdet3d/configs/cylinder3d/cylinder3d_4xb4-3x_semantickitti.py
...3d/configs/cylinder3d/cylinder3d_4xb4-3x_semantickitti.py
+43
-0
mmdetection3d/mmdet3d/configs/cylinder3d/cylinder3d_8xb2-laser-polar-mix-3x_semantickitti.py
...der3d/cylinder3d_8xb2-laser-polar-mix-3x_semantickitti.py
+88
-0
mmdetection3d/mmdet3d/configs/minkunet/minkunet34_w32_torchsparse_8xb2_laser_polar_mix_3x_semantickitti.py
..._w32_torchsparse_8xb2_laser_polar_mix_3x_semantickitti.py
+99
-0
mmdetection3d/mmdet3d/configs/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2_80e_kitti_3d_3class.py
...t/mvxnet_fpn_dv_second_secfpn_8xb2_80e_kitti_3d_3class.py
+312
-0
mmdetection3d/mmdet3d/configs/pgd/pgd_r101_caffe_fpn_head_gn_4xb3_4x_kitti_mono3d.py
...gs/pgd/pgd_r101_caffe_fpn_head_gn_4xb3_4x_kitti_mono3d.py
+152
-0
mmdetection3d/mmdet3d/configs/votenet/__init__.py
mmdetection3d/mmdet3d/configs/votenet/__init__.py
+1
-0
mmdetection3d/mmdet3d/configs/votenet/votenet_8xb8_scannet_3d.py
...tion3d/mmdet3d/configs/votenet/votenet_8xb8_scannet_3d.py
+49
-0
mmdetection3d/mmdet3d/datasets/__init__.py
mmdetection3d/mmdet3d/datasets/__init__.py
+41
-0
mmdetection3d/mmdet3d/datasets/convert_utils.py
mmdetection3d/mmdet3d/datasets/convert_utils.py
+425
-0
mmdetection3d/mmdet3d/datasets/dataset_wrappers.py
mmdetection3d/mmdet3d/datasets/dataset_wrappers.py
+182
-0
mmdetection3d/mmdet3d/datasets/det3d_dataset.py
mmdetection3d/mmdet3d/datasets/det3d_dataset.py
+423
-0
No files found.
Too many changes to show.
To preserve performance only
465 of 465+
files are displayed.
Plain diff
Email patch
mmdetection3d/mmdet3d/configs/_base_/schedules/mmdet_schedule_1x.py
0 → 100644
View file @
7aa442d5
# Copyright (c) OpenMMLab. All rights reserved.
from
mmengine.optim.optimizer.optimizer_wrapper
import
OptimWrapper
from
mmengine.optim.scheduler.lr_scheduler
import
LinearLR
,
MultiStepLR
from
mmengine.runner.loops
import
EpochBasedTrainLoop
,
TestLoop
,
ValLoop
from
torch.optim.sgd
import
SGD
# 1x training schedule (12 epochs), validated once per epoch.
train_cfg = dict(type=EpochBasedTrainLoop, max_epochs=12, val_interval=1)
val_cfg = dict(type=ValLoop)
test_cfg = dict(type=TestLoop)

# Learning-rate schedule: a short iteration-based linear warmup followed by
# step decay (x0.1) at epochs 8 and 11.
param_scheduler = [
    dict(type=LinearLR, start_factor=0.001, by_epoch=False, begin=0, end=500),
    dict(
        type=MultiStepLR,
        begin=0,
        end=12,
        by_epoch=True,
        milestones=[8, 11],
        gamma=0.1)
]

# Optimizer: plain SGD with momentum.
optim_wrapper = dict(
    type=OptimWrapper,
    optimizer=dict(type=SGD, lr=0.02, momentum=0.9, weight_decay=0.0001))

# Default setting for scaling LR automatically
# - `enable` means enable scaling LR automatically
#   or not by default.
# - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
auto_scale_lr = dict(enable=False, base_batch_size=16)
mmdetection3d/mmdet3d/configs/_base_/schedules/schedule_2x.py
0 → 100644
View file @
7aa442d5
# Copyright (c) OpenMMLab. All rights reserved.
from
mmengine.optim.optimizer.optimizer_wrapper
import
OptimWrapper
from
mmengine.optim.scheduler.lr_scheduler
import
LinearLR
,
MultiStepLR
from
mmengine.runner.loops
import
EpochBasedTrainLoop
,
TestLoop
,
ValLoop
from
torch.optim.adamw
import
AdamW
# Optimizer settings.
# This schedule is mainly used by models on the nuScenes dataset.
lr = 0.001
optim_wrapper = dict(
    type=OptimWrapper,
    optimizer=dict(type=AdamW, lr=lr, weight_decay=0.01),
    # max_norm=10 is better for SECOND
    clip_grad=dict(max_norm=35, norm_type=2))

# 2x training schedule (24 epochs); validation only at the end.
train_cfg = dict(type=EpochBasedTrainLoop, max_epochs=24, val_interval=24)
val_cfg = dict(type=ValLoop)
test_cfg = dict(type=TestLoop)

# Learning rate: linear warmup over the first 1000 iterations, then step
# decay (x0.1) at epochs 20 and 23.
param_scheduler = [
    dict(
        type=LinearLR,
        start_factor=1.0 / 1000,
        by_epoch=False,
        begin=0,
        end=1000),
    dict(
        type=MultiStepLR,
        begin=0,
        end=24,
        by_epoch=True,
        milestones=[20, 23],
        gamma=0.1)
]

# Default setting for scaling LR automatically
# - `enable` means enable scaling LR automatically
#   or not by default.
# - `base_batch_size` = (8 GPUs) x (4 samples per GPU).
auto_scale_lr = dict(enable=False, base_batch_size=32)
mmdetection3d/mmdet3d/configs/_base_/schedules/schedule_3x.py
0 → 100644
View file @
7aa442d5
# Copyright (c) OpenMMLab. All rights reserved.
from
mmengine.optim.optimizer.optimizer_wrapper
import
OptimWrapper
from
mmengine.optim.scheduler.lr_scheduler
import
MultiStepLR
from
mmengine.runner.loops
import
EpochBasedTrainLoop
,
TestLoop
,
ValLoop
from
torch.optim.adamw
import
AdamW
# Optimizer settings.
# This schedule is mainly used by models on indoor datasets,
# e.g., VoteNet on SUNRGBD and ScanNet.
lr = 0.008  # max learning rate
optim_wrapper = dict(
    type=OptimWrapper,
    optimizer=dict(type=AdamW, lr=lr, weight_decay=0.01),
    clip_grad=dict(max_norm=10, norm_type=2),
)

# 3x training schedule (36 epochs), validated once per epoch.
train_cfg = dict(type=EpochBasedTrainLoop, max_epochs=36, val_interval=1)
val_cfg = dict(type=ValLoop)
test_cfg = dict(type=TestLoop)

# Learning rate: step decay (x0.1) at epochs 24 and 32.
param_scheduler = [
    dict(
        type=MultiStepLR,
        begin=0,
        end=36,
        by_epoch=True,
        milestones=[24, 32],
        gamma=0.1)
]

# Default setting for scaling LR automatically
# - `enable` means enable scaling LR automatically
#   or not by default.
# - `base_batch_size` = (4 GPUs) x (8 samples per GPU).
auto_scale_lr = dict(enable=False, base_batch_size=32)
mmdetection3d/mmdet3d/configs/_base_/schedules/seg_cosine_100e.py
0 → 100644
View file @
7aa442d5
# Copyright (c) OpenMMLab. All rights reserved.
from
mmengine.optim.optimizer.optimizer_wrapper
import
OptimWrapper
from
mmengine.optim.scheduler.lr_scheduler
import
CosineAnnealingLR
from
torch.optim.sgd
import
SGD
# Optimizer settings.
# This schedule is mainly used on the S3DIS dataset in the segmentation task.
optim_wrapper = dict(
    type=OptimWrapper,
    optimizer=dict(type=SGD, lr=0.1, momentum=0.9, weight_decay=0.001),
    clip_grad=None)

# Learning rate: a single cosine-annealing cycle over all 100 epochs.
param_scheduler = [
    dict(
        type=CosineAnnealingLR,
        T_max=100,
        eta_min=1e-5,
        by_epoch=True,
        begin=0,
        end=100)
]

# Runtime settings: 100 epochs, validated once per epoch.
train_cfg = dict(by_epoch=True, max_epochs=100, val_interval=1)
val_cfg = dict()
test_cfg = dict()

# Default setting for scaling LR automatically
# - `enable` means enable scaling LR automatically
#   or not by default.
# - `base_batch_size` = (4 GPUs) x (32 samples per GPU).
auto_scale_lr = dict(enable=False, base_batch_size=128)
mmdetection3d/mmdet3d/configs/_base_/schedules/seg_cosine_150e.py
0 → 100644
View file @
7aa442d5
# Copyright (c) OpenMMLab. All rights reserved.
from
mmengine.optim.optimizer.optimizer_wrapper
import
OptimWrapper
from
mmengine.optim.scheduler.lr_scheduler
import
CosineAnnealingLR
from
torch.optim.sgd
import
SGD
# Optimizer settings.
# This schedule is mainly used on the S3DIS dataset in the segmentation task.
optim_wrapper = dict(
    type=OptimWrapper,
    optimizer=dict(type=SGD, lr=0.2, momentum=0.9, weight_decay=0.0001),
    clip_grad=None)

# Learning rate: a single cosine-annealing cycle over all 150 epochs.
param_scheduler = [
    dict(
        type=CosineAnnealingLR,
        T_max=150,
        eta_min=0.002,
        by_epoch=True,
        begin=0,
        end=150)
]

# Runtime settings: 150 epochs, validated once per epoch.
train_cfg = dict(by_epoch=True, max_epochs=150, val_interval=1)
val_cfg = dict()
test_cfg = dict()

# Default setting for scaling LR automatically
# - `enable` means enable scaling LR automatically
#   or not by default.
# - `base_batch_size` = (8 GPUs) x (8 samples per GPU).
auto_scale_lr = dict(enable=False, base_batch_size=64)
mmdetection3d/mmdet3d/configs/_base_/schedules/seg_cosine_200e.py
0 → 100644
View file @
7aa442d5
# Copyright (c) OpenMMLab. All rights reserved.
from
mmengine.optim.optimizer.optimizer_wrapper
import
OptimWrapper
from
mmengine.optim.scheduler.lr_scheduler
import
CosineAnnealingLR
from
torch.optim.adam
import
Adam
# Optimizer settings.
# This schedule is mainly used on the S3DIS dataset in the segmentation task.
optim_wrapper = dict(
    type=OptimWrapper,
    optimizer=dict(type=Adam, lr=0.001, weight_decay=0.01),
    clip_grad=None)

# Learning rate: a single cosine-annealing cycle over all 200 epochs.
param_scheduler = [
    dict(
        type=CosineAnnealingLR,
        T_max=200,
        eta_min=1e-5,
        by_epoch=True,
        begin=0,
        end=200)
]

# Runtime settings: 200 epochs, validated once per epoch.
train_cfg = dict(by_epoch=True, max_epochs=200, val_interval=1)
val_cfg = dict()
test_cfg = dict()

# Default setting for scaling LR automatically
# - `enable` means enable scaling LR automatically
#   or not by default.
# - `base_batch_size` = (2 GPUs) x (16 samples per GPU).
auto_scale_lr = dict(enable=False, base_batch_size=32)
mmdetection3d/mmdet3d/configs/_base_/schedules/seg_cosine_50e.py
0 → 100644
View file @
7aa442d5
# Copyright (c) OpenMMLab. All rights reserved.
from
mmengine.optim.optimizer.optimizer_wrapper
import
OptimWrapper
from
mmengine.optim.scheduler.lr_scheduler
import
CosineAnnealingLR
from
torch.optim.adam
import
Adam
# Optimizer settings.
# This schedule is mainly used on the S3DIS dataset in the segmentation task.
optim_wrapper = dict(
    type=OptimWrapper,
    optimizer=dict(type=Adam, lr=0.001, weight_decay=0.001),
    clip_grad=None)

# Learning rate: a single cosine-annealing cycle over all 50 epochs.
param_scheduler = [
    dict(
        type=CosineAnnealingLR,
        T_max=50,
        eta_min=1e-5,
        by_epoch=True,
        begin=0,
        end=50)
]

# Runtime settings: 50 epochs, validated once per epoch.
train_cfg = dict(by_epoch=True, max_epochs=50, val_interval=1)
val_cfg = dict()
test_cfg = dict()

# Default setting for scaling LR automatically
# - `enable` means enable scaling LR automatically
#   or not by default.
# - `base_batch_size` = (2 GPUs) x (16 samples per GPU).
auto_scale_lr = dict(enable=False, base_batch_size=32)
mmdetection3d/mmdet3d/configs/centerpoint/centerpoint_pillar02_second_secfpn_8xb4_cyclic_20e_nus_3d.py
0 → 100644
View file @
7aa442d5
# Copyright (c) OpenMMLab. All rights reserved.
from
mmengine.config
import
read_base
with
read_base
():
from
.._base_.datasets.nus_3d
import
*
from
.._base_.models.centerpoint_pillar02_second_secfpn_nus
import
*
from
.._base_.schedules.cyclic_20e
import
*
from
.._base_.default_runtime
import
*
from
mmengine.dataset.sampler
import
DefaultSampler
from
mmdet3d.datasets.dataset_wrappers
import
CBGSDataset
from
mmdet3d.datasets.nuscenes_dataset
import
NuScenesDataset
from
mmdet3d.datasets.transforms.formating
import
Pack3DDetInputs
from
mmdet3d.datasets.transforms.loading
import
(
LoadAnnotations3D
,
LoadPointsFromFile
,
LoadPointsFromMultiSweeps
)
from
mmdet3d.datasets.transforms.test_time_aug
import
MultiScaleFlipAug3D
from
mmdet3d.datasets.transforms.transforms_3d
import
(
# noqa
GlobalRotScaleTrans
,
ObjectNameFilter
,
ObjectRangeFilter
,
ObjectSample
,
PointShuffle
,
PointsRangeFilter
,
RandomFlip3D
)
# If the point cloud range is changed, the models should also change their
# point cloud range accordingly.
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
# Using calibration info to convert the Lidar-coordinate point cloud range to
# the ego-coordinate point cloud range could bring a little promotion in
# nuScenes.
# point_cloud_range = [-51.2, -52, -5.0, 51.2, 50.4, 3.0]
# For nuScenes we usually do 10-class detection.
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
data_prefix.update(
    dict(pts='samples/LIDAR_TOP', img='', sweeps='sweeps/LIDAR_TOP'))

# Propagate the point cloud range into every model component that needs it.
model.update(
    dict(
        data_preprocessor=dict(
            voxel_layer=dict(point_cloud_range=point_cloud_range)),
        pts_voxel_encoder=dict(point_cloud_range=point_cloud_range),
        pts_bbox_head=dict(
            bbox_coder=dict(pc_range=point_cloud_range[:2])),
        # model training and testing settings
        train_cfg=dict(pts=dict(point_cloud_range=point_cloud_range)),
        test_cfg=dict(pts=dict(pc_range=point_cloud_range[:2]))))

dataset_type = 'NuScenesDataset'
data_root = 'data/nuscenes/'
backend_args = None

# GT-database sampler used by ObjectSample for copy-paste augmentation.
db_sampler = dict(
    data_root=data_root,
    info_path=data_root + 'nuscenes_dbinfos_train.pkl',
    rate=1.0,
    prepare=dict(
        filter_by_difficulty=[-1],
        filter_by_min_points=dict(
            car=5,
            truck=5,
            bus=5,
            trailer=5,
            construction_vehicle=5,
            traffic_cone=5,
            barrier=5,
            motorcycle=5,
            bicycle=5,
            pedestrian=5)),
    classes=class_names,
    sample_groups=dict(
        car=2,
        truck=3,
        construction_vehicle=7,
        bus=4,
        trailer=6,
        barrier=2,
        motorcycle=6,
        bicycle=6,
        pedestrian=2,
        traffic_cone=2),
    points_loader=dict(
        type=LoadPointsFromFile,
        coord_type='LIDAR',
        load_dim=5,
        use_dim=[0, 1, 2, 3, 4],
        backend_args=backend_args),
    backend_args=backend_args)

# Training pipeline: load points + sweeps, sample GT objects, then apply
# geometric augmentation and range/name filtering.
train_pipeline = [
    dict(
        type=LoadPointsFromFile,
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        backend_args=backend_args),
    dict(
        type=LoadPointsFromMultiSweeps,
        sweeps_num=9,
        use_dim=[0, 1, 2, 3, 4],
        pad_empty_sweeps=True,
        remove_close=True,
        backend_args=backend_args),
    dict(type=LoadAnnotations3D, with_bbox_3d=True, with_label_3d=True),
    dict(type=ObjectSample, db_sampler=db_sampler),
    dict(
        type=GlobalRotScaleTrans,
        rot_range=[-0.3925, 0.3925],
        scale_ratio_range=[0.95, 1.05],
        translation_std=[0, 0, 0]),
    dict(
        type=RandomFlip3D,
        sync_2d=False,
        flip_ratio_bev_horizontal=0.5,
        flip_ratio_bev_vertical=0.5),
    dict(type=PointsRangeFilter, point_cloud_range=point_cloud_range),
    dict(type=ObjectRangeFilter, point_cloud_range=point_cloud_range),
    dict(type=ObjectNameFilter, classes=class_names),
    dict(type=PointShuffle),
    dict(
        type=Pack3DDetInputs,
        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]

# Test pipeline: no-op TTA wrapper (single scale, no flip).
test_pipeline = [
    dict(
        type=LoadPointsFromFile,
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        backend_args=backend_args),
    dict(
        type=LoadPointsFromMultiSweeps,
        sweeps_num=9,
        use_dim=[0, 1, 2, 3, 4],
        pad_empty_sweeps=True,
        remove_close=True,
        backend_args=backend_args),
    dict(
        type=MultiScaleFlipAug3D,
        img_scale=(1333, 800),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type=GlobalRotScaleTrans,
                rot_range=[0, 0],
                scale_ratio_range=[1., 1.],
                translation_std=[0, 0, 0]),
            dict(type=RandomFlip3D)
        ]),
    dict(type=Pack3DDetInputs, keys=['points'])
]

# Replace the inherited train dataloader entirely with a CBGS-wrapped
# nuScenes dataset.
train_dataloader.merge(
    dict(
        _delete_=True,
        batch_size=4,
        num_workers=4,
        persistent_workers=True,
        sampler=dict(type=DefaultSampler, shuffle=True),
        dataset=dict(
            type=CBGSDataset,
            dataset=dict(
                type=NuScenesDataset,
                data_root=data_root,
                ann_file='nuscenes_infos_train.pkl',
                pipeline=train_pipeline,
                metainfo=dict(classes=class_names),
                test_mode=False,
                data_prefix=data_prefix,
                use_valid_flag=True,
                # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
                # and box_type_3d='Depth' in sunrgbd and scannet dataset.
                box_type_3d='LiDAR',
                backend_args=backend_args))))
test_dataloader.update(
    dict(
        dataset=dict(
            pipeline=test_pipeline, metainfo=dict(classes=class_names))))
val_dataloader.update(
    dict(
        dataset=dict(
            pipeline=test_pipeline, metainfo=dict(classes=class_names))))

train_cfg.update(dict(val_interval=20))
mmdetection3d/mmdet3d/configs/centerpoint/centerpoint_voxel01_second_secfpn_8xb4_cyclic_20e_nus_3d.py
0 → 100644
View file @
7aa442d5
# Copyright (c) OpenMMLab. All rights reserved.
from
mmengine
import
read_base
with
read_base
():
from
.._base_.datasets.nus_3d
import
*
from
.._base_.models.centerpoint_voxel01_second_secfpn_nus
import
*
from
.._base_.schedules.cyclic_20e
import
*
from
.._base_.default_runtime
import
*
from
mmengine.dataset.sampler
import
DefaultSampler
from
mmdet3d.datasets.dataset_wrappers
import
CBGSDataset
from
mmdet3d.datasets.nuscenes_dataset
import
NuScenesDataset
from
mmdet3d.datasets.transforms.formating
import
Pack3DDetInputs
from
mmdet3d.datasets.transforms.loading
import
(
LoadAnnotations3D
,
LoadPointsFromFile
,
LoadPointsFromMultiSweeps
)
from
mmdet3d.datasets.transforms.test_time_aug
import
MultiScaleFlipAug3D
from
mmdet3d.datasets.transforms.transforms_3d
import
(
# noqa
GlobalRotScaleTrans
,
ObjectNameFilter
,
ObjectRangeFilter
,
ObjectSample
,
PointShuffle
,
PointsRangeFilter
,
RandomFlip3D
)
# If the point cloud range is changed, the models should also change their
# point cloud range accordingly.
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
# Using calibration info to convert the Lidar-coordinate point cloud range to
# the ego-coordinate point cloud range could bring a little promotion in
# nuScenes.
# point_cloud_range = [-51.2, -52, -5.0, 51.2, 50.4, 3.0]
# For nuScenes we usually do 10-class detection.
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
data_prefix.update(
    dict(pts='samples/LIDAR_TOP', img='', sweeps='sweeps/LIDAR_TOP'))

# Propagate the point cloud range into every model component that needs it.
model.update(
    dict(
        data_preprocessor=dict(
            voxel_layer=dict(point_cloud_range=point_cloud_range)),
        pts_bbox_head=dict(
            bbox_coder=dict(pc_range=point_cloud_range[:2])),
        # model training and testing settings
        train_cfg=dict(pts=dict(point_cloud_range=point_cloud_range)),
        test_cfg=dict(pts=dict(pc_range=point_cloud_range[:2]))))

dataset_type = 'NuScenesDataset'
data_root = 'data/nuscenes/'
backend_args = None

# GT-database sampler used by ObjectSample for copy-paste augmentation.
db_sampler = dict(
    data_root=data_root,
    info_path=data_root + 'nuscenes_dbinfos_train.pkl',
    rate=1.0,
    prepare=dict(
        filter_by_difficulty=[-1],
        filter_by_min_points=dict(
            car=5,
            truck=5,
            bus=5,
            trailer=5,
            construction_vehicle=5,
            traffic_cone=5,
            barrier=5,
            motorcycle=5,
            bicycle=5,
            pedestrian=5)),
    classes=class_names,
    sample_groups=dict(
        car=2,
        truck=3,
        construction_vehicle=7,
        bus=4,
        trailer=6,
        barrier=2,
        motorcycle=6,
        bicycle=6,
        pedestrian=2,
        traffic_cone=2),
    points_loader=dict(
        type=LoadPointsFromFile,
        coord_type='LIDAR',
        load_dim=5,
        use_dim=[0, 1, 2, 3, 4],
        backend_args=backend_args),
    backend_args=backend_args)

# Training pipeline: load points + sweeps, sample GT objects, then apply
# geometric augmentation and range/name filtering.
train_pipeline = [
    dict(
        type=LoadPointsFromFile,
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        backend_args=backend_args),
    dict(
        type=LoadPointsFromMultiSweeps,
        sweeps_num=9,
        use_dim=[0, 1, 2, 3, 4],
        pad_empty_sweeps=True,
        remove_close=True,
        backend_args=backend_args),
    dict(type=LoadAnnotations3D, with_bbox_3d=True, with_label_3d=True),
    dict(type=ObjectSample, db_sampler=db_sampler),
    dict(
        type=GlobalRotScaleTrans,
        rot_range=[-0.3925, 0.3925],
        scale_ratio_range=[0.95, 1.05],
        translation_std=[0, 0, 0]),
    dict(
        type=RandomFlip3D,
        sync_2d=False,
        flip_ratio_bev_horizontal=0.5,
        flip_ratio_bev_vertical=0.5),
    dict(type=PointsRangeFilter, point_cloud_range=point_cloud_range),
    dict(type=ObjectRangeFilter, point_cloud_range=point_cloud_range),
    dict(type=ObjectNameFilter, classes=class_names),
    dict(type=PointShuffle),
    dict(
        type=Pack3DDetInputs,
        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]

# Test pipeline: no-op TTA wrapper (single scale, no flip) plus range filter.
test_pipeline = [
    dict(
        type=LoadPointsFromFile,
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        backend_args=backend_args),
    dict(
        type=LoadPointsFromMultiSweeps,
        sweeps_num=9,
        use_dim=[0, 1, 2, 3, 4],
        pad_empty_sweeps=True,
        remove_close=True,
        backend_args=backend_args),
    dict(
        type=MultiScaleFlipAug3D,
        img_scale=(1333, 800),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type=GlobalRotScaleTrans,
                rot_range=[0, 0],
                scale_ratio_range=[1., 1.],
                translation_std=[0, 0, 0]),
            dict(type=RandomFlip3D),
            dict(type=PointsRangeFilter, point_cloud_range=point_cloud_range)
        ]),
    dict(type=Pack3DDetInputs, keys=['points'])
]

# Replace the inherited train dataloader entirely with a CBGS-wrapped
# nuScenes dataset.
train_dataloader.merge(
    dict(
        _delete_=True,
        batch_size=4,
        num_workers=4,
        persistent_workers=True,
        sampler=dict(type=DefaultSampler, shuffle=True),
        dataset=dict(
            type=CBGSDataset,
            dataset=dict(
                type=NuScenesDataset,
                data_root=data_root,
                ann_file='nuscenes_infos_train.pkl',
                pipeline=train_pipeline,
                metainfo=dict(classes=class_names),
                test_mode=False,
                data_prefix=data_prefix,
                use_valid_flag=True,
                # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
                # and box_type_3d='Depth' in sunrgbd and scannet dataset.
                box_type_3d='LiDAR',
                backend_args=backend_args))))
test_dataloader.update(
    dict(
        dataset=dict(
            pipeline=test_pipeline, metainfo=dict(classes=class_names))))
val_dataloader.update(
    dict(
        dataset=dict(
            pipeline=test_pipeline, metainfo=dict(classes=class_names))))

train_cfg.update(dict(val_interval=20))
mmdetection3d/mmdet3d/configs/cylinder3d/cylinder3d_4xb4-3x_semantickitti.py
0 → 100644
View file @
7aa442d5
# Copyright (c) OpenMMLab. All rights reserved.
from
mmengine
import
read_base
with
read_base
():
from
.._base_.datasets.semantickitti
import
*
from
.._base_.models.cylinder3d
import
*
from
.._base_.default_runtime
import
*
from
mmengine.optim.optimizer.optimizer_wrapper
import
OptimWrapper
from
mmengine.optim.scheduler.lr_scheduler
import
LinearLR
,
MultiStepLR
from
mmengine.runner.loops
import
EpochBasedTrainLoop
,
TestLoop
,
ValLoop
from
torch.optim
import
AdamW
# Optimizer: AdamW without gradient clipping.
lr = 0.001
optim_wrapper = dict(
    type=OptimWrapper,
    optimizer=dict(type=AdamW, lr=lr, weight_decay=0.01))

# 36-epoch schedule, validated once per epoch.
train_cfg = dict(type=EpochBasedTrainLoop, max_epochs=36, val_interval=1)
val_cfg = dict(type=ValLoop)
test_cfg = dict(type=TestLoop)

# Learning rate: iteration-based linear warmup for 1000 iterations, then a
# single step decay (x0.1) at epoch 30.
param_scheduler = [
    dict(type=LinearLR, start_factor=0.001, by_epoch=False, begin=0, end=1000),
    dict(
        type=MultiStepLR,
        begin=0,
        end=36,
        by_epoch=True,
        milestones=[30],
        gamma=0.1)
]

train_dataloader.update(dict(batch_size=4, ))

# Default setting for scaling LR automatically
# - `enable` means enable scaling LR automatically
#   or not by default.
# - `base_batch_size` = (8 GPUs) x (4 samples per GPU).
# auto_scale_lr = dict(enable=False, base_batch_size=32)

# Keep a checkpoint every 5 epochs.
default_hooks.update(
    dict(checkpoint=dict(type=CheckpointHook, interval=5)))
mmdetection3d/mmdet3d/configs/cylinder3d/cylinder3d_8xb2-laser-polar-mix-3x_semantickitti.py
0 → 100644
View file @
7aa442d5
# Copyright (c) OpenMMLab. All rights reserved.
from
mmengine
import
read_base
with
read_base
():
from
.._base_.datasets.semantickitti
import
*
from
.._base_.default_runtime
import
*
from
.._base_.models.cylinder3d
import
*
from
.._base_.schedules.schedule_3x
import
*
from
mmcv.transforms.wrappers
import
RandomChoice
from
mmdet3d.datasets.transforms.transforms_3d
import
LaserMix
,
PolarMix
# Training pipeline for semantic segmentation on SemanticKITTI: load points
# and per-point semantic labels, randomly apply either LaserMix or PolarMix
# (each with its own pre-transform to load the mixed-in scan), then a global
# rotation/scale augmentation.
train_pipeline = [
    dict(type=LoadPointsFromFile, coord_type='LIDAR', load_dim=4, use_dim=4),
    dict(
        type=LoadAnnotations3D,
        with_bbox_3d=False,
        with_label_3d=False,
        with_seg_3d=True,
        seg_3d_dtype='np.int32',
        seg_offset=2**16,
        dataset_type='semantickitti'),
    dict(type=PointSegClassMapping),
    dict(
        type=RandomChoice,
        transforms=[
            [
                dict(
                    type=LaserMix,
                    num_areas=[3, 4, 5, 6],
                    pitch_angles=[-25, 3],
                    pre_transform=[
                        dict(
                            type=LoadPointsFromFile,
                            coord_type='LIDAR',
                            load_dim=4,
                            use_dim=4),
                        dict(
                            type=LoadAnnotations3D,
                            with_bbox_3d=False,
                            with_label_3d=False,
                            with_seg_3d=True,
                            seg_3d_dtype='np.int32',
                            seg_offset=2**16,
                            dataset_type='semantickitti'),
                        dict(type=PointSegClassMapping)
                    ],
                    prob=1)
            ],
            [
                dict(
                    type=PolarMix,
                    instance_classes=[0, 1, 2, 3, 4, 5, 6, 7],
                    swap_ratio=0.5,
                    rotate_paste_ratio=1.0,
                    pre_transform=[
                        dict(
                            type=LoadPointsFromFile,
                            coord_type='LIDAR',
                            load_dim=4,
                            use_dim=4),
                        dict(
                            type=LoadAnnotations3D,
                            with_bbox_3d=False,
                            with_label_3d=False,
                            with_seg_3d=True,
                            seg_3d_dtype='np.int32',
                            seg_offset=2**16,
                            dataset_type='semantickitti'),
                        dict(type=PointSegClassMapping)
                    ],
                    prob=1)
            ],
        ],
        # Pick LaserMix or PolarMix with equal probability.
        prob=[0.5, 0.5]),
    dict(
        type=GlobalRotScaleTrans,
        rot_range=[0., 6.28318531],
        scale_ratio_range=[0.95, 1.05],
        translation_std=[0, 0, 0],
    ),
    dict(type=Pack3DDetInputs, keys=['points', 'pts_semantic_mask'])
]

train_dataloader.update(dict(dataset=dict(pipeline=train_pipeline)))

# Keep a checkpoint every epoch.
default_hooks.update(
    dict(checkpoint=dict(type=CheckpointHook, interval=1)))
mmdetection3d/mmdet3d/configs/minkunet/minkunet34_w32_torchsparse_8xb2_laser_polar_mix_3x_semantickitti.py
0 → 100644
View file @
7aa442d5
# Copyright (c) OpenMMLab. All rights reserved.
from
mmengine
import
read_base
with
read_base
():
from
.._base_.datasets.semantickitti
import
*
from
.._base_.models.minkunet
import
*
from
.._base_.schedules.schedule_3x
import
*
from
.._base_.default_runtime
import
*
from
mmcv.transforms.wrappers
import
RandomChoice
from
mmengine.hooks.checkpoint_hook
import
CheckpointHook
from
mmdet3d.datasets.transforms.formating
import
Pack3DDetInputs
from
mmdet3d.datasets.transforms.loading
import
(
LoadAnnotations3D
,
LoadPointsFromFile
,
PointSegClassMapping
)
from
mmdet3d.datasets.transforms.transforms_3d
import
(
GlobalRotScaleTrans
,
LaserMix
,
PolarMix
)
# MinkUNet34 tweaks on top of the base model: unlimited voxels and the
# 34-layer encoder block layout.
model.update(
    dict(
        data_preprocessor=dict(max_voxels=None),
        backbone=dict(encoder_blocks=[2, 3, 4, 6])))

# Training pipeline for semantic segmentation on SemanticKITTI: load points
# and per-point semantic labels, randomly apply either LaserMix or PolarMix
# (each with its own pre-transform to load the mixed-in scan), then a global
# rotation/scale augmentation.
train_pipeline = [
    dict(type=LoadPointsFromFile, coord_type='LIDAR', load_dim=4, use_dim=4),
    dict(
        type=LoadAnnotations3D,
        with_bbox_3d=False,
        with_label_3d=False,
        with_seg_3d=True,
        seg_3d_dtype='np.int32',
        seg_offset=2**16,
        dataset_type='semantickitti'),
    dict(type=PointSegClassMapping),
    dict(
        type=RandomChoice,
        transforms=[
            [
                dict(
                    type=LaserMix,
                    num_areas=[3, 4, 5, 6],
                    pitch_angles=[-25, 3],
                    pre_transform=[
                        dict(
                            type=LoadPointsFromFile,
                            coord_type='LIDAR',
                            load_dim=4,
                            use_dim=4),
                        dict(
                            type=LoadAnnotations3D,
                            with_bbox_3d=False,
                            with_label_3d=False,
                            with_seg_3d=True,
                            seg_3d_dtype='np.int32',
                            seg_offset=2**16,
                            dataset_type='semantickitti'),
                        dict(type=PointSegClassMapping)
                    ],
                    prob=1)
            ],
            [
                dict(
                    type=PolarMix,
                    instance_classes=[0, 1, 2, 3, 4, 5, 6, 7],
                    swap_ratio=0.5,
                    rotate_paste_ratio=1.0,
                    pre_transform=[
                        dict(
                            type=LoadPointsFromFile,
                            coord_type='LIDAR',
                            load_dim=4,
                            use_dim=4),
                        dict(
                            type=LoadAnnotations3D,
                            with_bbox_3d=False,
                            with_label_3d=False,
                            with_seg_3d=True,
                            seg_3d_dtype='np.int32',
                            seg_offset=2**16,
                            dataset_type='semantickitti'),
                        dict(type=PointSegClassMapping)
                    ],
                    prob=1)
            ],
        ],
        # Pick LaserMix or PolarMix with equal probability.
        prob=[0.5, 0.5]),
    dict(
        type=GlobalRotScaleTrans,
        rot_range=[0., 6.28318531],
        scale_ratio_range=[0.95, 1.05],
        translation_std=[0, 0, 0],
    ),
    dict(type=Pack3DDetInputs, keys=['points', 'pts_semantic_mask'])
]

train_dataloader.update(dict(dataset=dict(pipeline=train_pipeline)))

# Keep a checkpoint every epoch.
default_hooks.update(
    dict(checkpoint=dict(type=CheckpointHook, interval=1)))
mmdetection3d/mmdet3d/configs/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2_80e_kitti_3d_3class.py
0 → 100644
View file @
7aa442d5
# Copyright (c) OpenMMLab. All rights reserved.
from
mmengine
import
read_base
with
read_base
():
from
.._base_.schedules.cosine
import
*
from
.._base_.default_runtime
import
*
from
mmcv.transforms.loading
import
LoadImageFromFile
from
mmcv.transforms.processing
import
RandomResize
,
Resize
from
mmengine.dataset.dataset_wrapper
import
RepeatDataset
from
mmengine.dataset.sampler
import
DefaultSampler
from
mmengine.visualization.vis_backend
import
LocalVisBackend
from
mmdet3d.datasets.kitti_dataset
import
KittiDataset
from
mmdet3d.datasets.transforms.formating
import
Pack3DDetInputs
from
mmdet3d.datasets.transforms.loading
import
(
LoadAnnotations3D
,
LoadPointsFromFile
)
from
mmdet3d.datasets.transforms.test_time_aug
import
MultiScaleFlipAug3D
from
mmdet3d.datasets.transforms.transforms_3d
import
(
GlobalRotScaleTrans
,
ObjectRangeFilter
,
PointShuffle
,
PointsRangeFilter
,
RandomFlip3D
)
from
mmdet3d.evaluation.metrics.kitti_metric
import
KittiMetric
from
mmdet3d.models.backbones.second
import
SECOND
from
mmdet3d.models.data_preprocessors.data_preprocessor
import
\
Det3DDataPreprocessor
from
mmdet3d.models.dense_heads.anchor3d_head
import
Anchor3DHead
from
mmdet3d.models.detectors.mvx_faster_rcnn
import
DynamicMVXFasterRCNN
from
mmdet3d.models.layers.fusion_layers.point_fusion
import
PointFusion
from
mmdet3d.models.middle_encoders.sparse_encoder
import
SparseEncoder
from
mmdet3d.models.necks.second_fpn
import
SECONDFPN
from
mmdet3d.models.task_modules.anchor.anchor_3d_generator
import
\
Anchor3DRangeGenerator
from
mmdet3d.models.task_modules.assigners.max_3d_iou_assigner
import
\
Max3DIoUAssigner
from
mmdet3d.models.task_modules.coders.delta_xyzwhlr_bbox_coder
import
\
DeltaXYZWLHRBBoxCoder
from
mmdet3d.models.voxel_encoders.voxel_encoder
import
DynamicVFE
from
mmdet3d.structures.ops.iou3d_calculator
import
BboxOverlapsNearest3D
from
mmdet3d.visualization.local_visualizer
import
Det3DLocalVisualizer
# model settings
# Voxelization resolution (x, y, z) in meters and the LiDAR range
# [x_min, y_min, z_min, x_max, y_max, z_max] used throughout this config.
voxel_size = [0.05, 0.05, 0.1]
point_cloud_range = [0, -40, -3, 70.4, 40, 1]
model = dict(
    type=DynamicMVXFasterRCNN,
    data_preprocessor=dict(
        type=Det3DDataPreprocessor,
        voxel=True,
        # Dynamic voxelization: no hard cap on points per voxel or voxel count.
        voxel_type='dynamic',
        voxel_layer=dict(
            max_num_points=-1,
            point_cloud_range=point_cloud_range,
            voxel_size=voxel_size,
            max_voxels=(-1, -1)),
        # Caffe-style image normalization (BGR mean, unit std).
        mean=[102.9801, 115.9465, 122.7717],
        std=[1.0, 1.0, 1.0],
        bgr_to_rgb=False,
        pad_size_divisor=32),
    img_backbone=dict(
        type='mmdet.ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=False),
        norm_eval=True,
        style='caffe'),
    img_neck=dict(
        type='mmdet.FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        # make the image features more stable numerically to avoid loss nan
        norm_cfg=dict(type='BN', requires_grad=False),
        num_outs=5),
    pts_voxel_encoder=dict(
        type=DynamicVFE,
        in_channels=4,
        feat_channels=[64, 64],
        with_distance=False,
        voxel_size=voxel_size,
        with_cluster_center=True,
        with_voxel_center=True,
        point_cloud_range=point_cloud_range,
        # Fuses multi-level image features into the point features.
        fusion_layer=dict(
            type=PointFusion,
            img_channels=256,
            pts_channels=64,
            mid_channels=128,
            out_channels=128,
            img_levels=[0, 1, 2, 3, 4],
            align_corners=False,
            activate_out=True,
            fuse_out=False)),
    pts_middle_encoder=dict(
        type=SparseEncoder,
        in_channels=128,
        sparse_shape=[41, 1600, 1408],
        order=('conv', 'norm', 'act')),
    pts_backbone=dict(
        type=SECOND,
        in_channels=256,
        layer_nums=[5, 5],
        layer_strides=[1, 2],
        out_channels=[128, 256]),
    pts_neck=dict(
        type=SECONDFPN,
        in_channels=[128, 256],
        upsample_strides=[1, 2],
        out_channels=[256, 256]),
    pts_bbox_head=dict(
        type=Anchor3DHead,
        num_classes=3,
        in_channels=512,
        feat_channels=512,
        use_direction_classifier=True,
        anchor_generator=dict(
            type=Anchor3DRangeGenerator,
            # One anchor range per class: Pedestrian, Cyclist, Car.
            ranges=[
                [0, -40.0, -0.6, 70.4, 40.0, -0.6],
                [0, -40.0, -0.6, 70.4, 40.0, -0.6],
                [0, -40.0, -1.78, 70.4, 40.0, -1.78],
            ],
            sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]],
            rotations=[0, 1.57],
            reshape_out=False),
        assigner_per_size=True,
        diff_rad_by_sin=True,
        assign_per_class=True,
        bbox_coder=dict(type=DeltaXYZWLHRBBoxCoder),
        loss_cls=dict(
            type='mmdet.FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
        loss_bbox=dict(
            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
        loss_dir=dict(
            type='mmdet.CrossEntropyLoss', use_sigmoid=False,
            loss_weight=0.2)),
    # model training and testing settings
    train_cfg=dict(
        pts=dict(
            assigner=[
                dict(  # for Pedestrian
                    type=Max3DIoUAssigner,
                    iou_calculator=dict(type=BboxOverlapsNearest3D),
                    pos_iou_thr=0.35,
                    neg_iou_thr=0.2,
                    min_pos_iou=0.2,
                    ignore_iof_thr=-1),
                dict(  # for Cyclist
                    type=Max3DIoUAssigner,
                    iou_calculator=dict(type=BboxOverlapsNearest3D),
                    pos_iou_thr=0.35,
                    neg_iou_thr=0.2,
                    min_pos_iou=0.2,
                    ignore_iof_thr=-1),
                dict(  # for Car
                    type=Max3DIoUAssigner,
                    iou_calculator=dict(type=BboxOverlapsNearest3D),
                    pos_iou_thr=0.6,
                    neg_iou_thr=0.45,
                    min_pos_iou=0.45,
                    ignore_iof_thr=-1),
            ],
            allowed_border=0,
            pos_weight=-1,
            debug=False)),
    test_cfg=dict(
        pts=dict(
            use_rotate_nms=True,
            nms_across_levels=False,
            nms_thr=0.01,
            score_thr=0.1,
            min_bbox_size=0,
            nms_pre=100,
            max_num=50)))

# dataset settings
dataset_type = 'KittiDataset'
data_root = 'data/kitti/'
class_names = ['Pedestrian', 'Cyclist', 'Car']
metainfo = dict(classes=class_names)
input_modality = dict(use_lidar=True, use_camera=True)
backend_args = None

train_pipeline = [
    dict(
        type=LoadPointsFromFile,
        coord_type='LIDAR',
        load_dim=4,
        use_dim=4,
        backend_args=backend_args),
    dict(type=LoadImageFromFile, backend_args=backend_args),
    dict(type=LoadAnnotations3D, with_bbox_3d=True, with_label_3d=True),
    # Multi-scale image resize within [(640, 192), (2560, 768)].
    dict(
        type=RandomResize,
        scale=[(640, 192), (2560, 768)],
        keep_ratio=True),
    dict(
        type=GlobalRotScaleTrans,
        rot_range=[-0.78539816, 0.78539816],
        scale_ratio_range=[0.95, 1.05],
        translation_std=[0.2, 0.2, 0.2]),
    dict(type=RandomFlip3D, flip_ratio_bev_horizontal=0.5),
    dict(type=PointsRangeFilter, point_cloud_range=point_cloud_range),
    dict(type=ObjectRangeFilter, point_cloud_range=point_cloud_range),
    dict(type=PointShuffle),
    dict(
        type=Pack3DDetInputs,
        keys=[
            'points', 'img', 'gt_bboxes_3d', 'gt_labels_3d', 'gt_bboxes',
            'gt_labels'
        ])
]
test_pipeline = [
    dict(
        type=LoadPointsFromFile,
        coord_type='LIDAR',
        load_dim=4,
        use_dim=4,
        backend_args=backend_args),
    dict(type=LoadImageFromFile, backend_args=backend_args),
    dict(
        type=MultiScaleFlipAug3D,
        img_scale=(1280, 384),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            # Temporary solution, fix this after refactor the augtest
            dict(type=Resize, scale=0, keep_ratio=True),
            dict(
                type=GlobalRotScaleTrans,
                rot_range=[0, 0],
                scale_ratio_range=[1., 1.],
                translation_std=[0, 0, 0]),
            dict(type=RandomFlip3D),
            dict(type=PointsRangeFilter, point_cloud_range=point_cloud_range),
        ]),
    dict(type=Pack3DDetInputs, keys=['points', 'img'])
]
modality = dict(use_lidar=True, use_camera=True)
train_dataloader = dict(
    batch_size=2,
    num_workers=2,
    sampler=dict(type=DefaultSampler, shuffle=True),
    dataset=dict(
        type=RepeatDataset,
        times=2,
        dataset=dict(
            type=KittiDataset,
            data_root=data_root,
            modality=modality,
            ann_file='kitti_infos_train.pkl',
            data_prefix=dict(
                pts='training/velodyne_reduced', img='training/image_2'),
            pipeline=train_pipeline,
            filter_empty_gt=False,
            metainfo=metainfo,
            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
            box_type_3d='LiDAR',
            backend_args=backend_args)))
val_dataloader = dict(
    batch_size=1,
    num_workers=1,
    sampler=dict(type=DefaultSampler, shuffle=False),
    dataset=dict(
        type=KittiDataset,
        data_root=data_root,
        modality=modality,
        ann_file='kitti_infos_val.pkl',
        data_prefix=dict(
            pts='training/velodyne_reduced', img='training/image_2'),
        pipeline=test_pipeline,
        metainfo=metainfo,
        test_mode=True,
        box_type_3d='LiDAR',
        backend_args=backend_args))
test_dataloader = dict(
    batch_size=1,
    num_workers=1,
    sampler=dict(type=DefaultSampler, shuffle=False),
    dataset=dict(
        type=KittiDataset,
        data_root=data_root,
        ann_file='kitti_infos_val.pkl',
        modality=modality,
        data_prefix=dict(
            pts='training/velodyne_reduced', img='training/image_2'),
        pipeline=test_pipeline,
        metainfo=metainfo,
        test_mode=True,
        box_type_3d='LiDAR',
        backend_args=backend_args))
# `optim_wrapper` comes from the inherited schedule config.
optim_wrapper.update(
    dict(
        optimizer=dict(weight_decay=0.01),
        clip_grad=dict(max_norm=35, norm_type=2),
    ))
val_evaluator = dict(
    type=KittiMetric, ann_file='data/kitti/kitti_infos_val.pkl')
test_evaluator = val_evaluator
vis_backends = [dict(type=LocalVisBackend)]
visualizer = dict(
    type=Det3DLocalVisualizer, vis_backends=vis_backends, name='visualizer')

# You may need to download the model first if the network is unstable
load_from = 'https://download.openmmlab.com/mmdetection3d/pretrain_models/mvx_faster_rcnn_detectron2-caffe_20e_coco-pretrain_gt-sample_kitti-3-class_moderate-79.3_20200207-a4a6a3c7.pth'  # noqa
mmdetection3d/mmdet3d/configs/pgd/pgd_r101_caffe_fpn_head_gn_4xb3_4x_kitti_mono3d.py
0 → 100644
View file @
7aa442d5
# Copyright (c) OpenMMLab. All rights reserved.
from
mmengine
import
read_base
with read_base():
    from .._base_.datasets.kitti_mono3d import *
    from .._base_.models.pgd import *
    from .._base_.schedules.mmdet_schedule_1x import *
    from .._base_.default_runtime import *

from mmcv.transforms.processing import Resize
from mmengine.optim.scheduler.lr_scheduler import LinearLR, MultiStepLR

from mmdet3d.datasets.transforms.formating import Pack3DDetInputs
from mmdet3d.datasets.transforms.loading import (LoadAnnotations3D,
                                                 LoadImageFromFileMono3D)
from mmdet3d.datasets.transforms.transforms_3d import RandomFlip3D
from mmdet3d.models.data_preprocessors.data_preprocessor import Det3DDataPreprocessor
from mmdet3d.models.losses.uncertain_smooth_l1_loss import UncertainSmoothL1Loss
from mmdet3d.models.task_modules.coders.pgd_bbox_coder import PGDBBoxCoder

# model settings
# Overrides on top of the inherited PGD base model for KITTI mono3d.
model.update(
    dict(
        data_preprocessor=dict(
            type=Det3DDataPreprocessor,
            # Caffe-style BGR normalization with unit std.
            mean=[103.530, 116.280, 123.675],
            std=[1.0, 1.0, 1.0],
            bgr_to_rgb=False,
            pad_size_divisor=32),
        backbone=dict(frozen_stages=0),
        neck=dict(start_level=0, num_outs=4),
        bbox_head=dict(
            num_classes=3,
            bbox_code_size=7,
            pred_attrs=False,
            pred_velo=False,
            pred_bbox2d=True,
            use_onlyreg_proj=True,
            strides=(4, 8, 16, 32),
            regress_ranges=((-1, 64), (64, 128), (128, 256), (256, 1e8)),
            group_reg_dims=(2, 1, 3, 1, 16,
                            4),  # offset, depth, size, rot, kpts, bbox2d
            reg_branch=(
                (256, ),  # offset
                (256, ),  # depth
                (256, ),  # size
                (256, ),  # rot
                (256, ),  # kpts
                (256, )  # bbox2d
            ),
            centerness_branch=(256, ),
            loss_cls=dict(
                type='mmdet.FocalLoss',
                use_sigmoid=True,
                gamma=2.0,
                alpha=0.25,
                loss_weight=1.0),
            loss_bbox=dict(
                type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
            loss_dir=dict(
                type='mmdet.CrossEntropyLoss',
                use_sigmoid=False,
                loss_weight=1.0),
            loss_centerness=dict(
                type='mmdet.CrossEntropyLoss',
                use_sigmoid=True,
                loss_weight=1.0),
            use_depth_classifier=True,
            depth_branch=(256, ),
            depth_range=(0, 70),
            depth_unit=10,
            division='uniform',
            depth_bins=8,
            pred_keypoints=True,
            weight_dim=1,
            loss_depth=dict(
                type=UncertainSmoothL1Loss, alpha=1.0, beta=3.0,
                loss_weight=1.0),
            bbox_coder=dict(
                type=PGDBBoxCoder,
                base_depths=((28.01, 16.32), ),
                base_dims=((0.8, 1.73, 0.6), (1.76, 1.73, 0.6),
                           (3.9, 1.56, 1.6)),
                code_size=7)),
        # set weight 1.0 for base 7 dims (offset, depth, size, rot)
        # 0.2 for 16-dim keypoint offsets and 1.0 for 4-dim 2D distance targets
        train_cfg=dict(code_weight=[
            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
            0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 1.0, 1.0, 1.0,
            1.0
        ]),
        test_cfg=dict(
            nms_pre=100, nms_thr=0.05, score_thr=0.001, max_per_img=20)))

backend_args = None

train_pipeline = [
    dict(type=LoadImageFromFileMono3D, backend_args=backend_args),
    dict(
        type=LoadAnnotations3D,
        with_bbox=True,
        with_label=True,
        with_attr_label=False,
        with_bbox_3d=True,
        with_label_3d=True,
        with_bbox_depth=True),
    dict(type=Resize, scale=(1242, 375), keep_ratio=True),
    dict(type=RandomFlip3D, flip_ratio_bev_horizontal=0.5),
    dict(
        type=Pack3DDetInputs,
        keys=[
            'img', 'gt_bboxes', 'gt_bboxes_labels', 'gt_bboxes_3d',
            'gt_labels_3d', 'centers_2d', 'depths'
        ]),
]
test_pipeline = [
    dict(type=LoadImageFromFileMono3D, backend_args=backend_args),
    dict(type=Resize, scale_factor=1.0),
    dict(type=Pack3DDetInputs, keys=['img'])
]
train_dataloader.update(
    dict(batch_size=3, num_workers=3, dataset=dict(pipeline=train_pipeline)))
test_dataloader.update(dict(dataset=dict(pipeline=test_pipeline)))
val_dataloader.update(dict(dataset=dict(pipeline=test_pipeline)))

# optimizer
optim_wrapper.update(
    dict(
        optimizer=dict(lr=0.001),
        paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.),
        clip_grad=dict(max_norm=35, norm_type=2)))

# learning rate
param_scheduler = [
    # Linear warmup over the first 500 iterations.
    dict(
        type=LinearLR,
        start_factor=1.0 / 3,
        by_epoch=False,
        begin=0,
        end=500),
    # Step decay at epochs 32 and 44 over a 48-epoch run.
    dict(
        type=MultiStepLR,
        begin=0,
        end=48,
        by_epoch=True,
        milestones=[32, 44],
        gamma=0.1)
]

train_cfg.update(dict(max_epochs=48, val_interval=2))
auto_scale_lr.update(dict(base_batch_size=12))
mmdetection3d/mmdet3d/configs/votenet/__init__.py
0 → 100644
View file @
7aa442d5
# Copyright (c) OpenMMLab. All rights reserved.
mmdetection3d/mmdet3d/configs/votenet/votenet_8xb8_scannet_3d.py
0 → 100644
View file @
7aa442d5
# Copyright (c) OpenMMLab. All rights reserved.
from
mmengine
import
read_base
with read_base():
    from .._base_.datasets.scannet_3d import *
    from .._base_.models.votenet import *
    from .._base_.schedules.schedule_3x import *
    from .._base_.default_runtime import *

from mmengine.hooks.logger_hook import LoggerHook

from mmdet3d.models.task_modules.coders.partial_bin_based_bbox_coder import \
    PartialBinBasedBBoxCoder

# model settings
# Override the base VoteNet head for ScanNet's 18 classes; mean_sizes are
# the per-class average box dimensions used by the partial-bin coder.
model.update(
    dict(
        bbox_head=dict(
            num_classes=18,
            bbox_coder=dict(
                type=PartialBinBasedBBoxCoder,
                num_sizes=18,
                num_dir_bins=1,
                with_rot=False,
                mean_sizes=[[0.76966727, 0.8116021, 0.92573744],
                            [1.876858, 1.8425595, 1.1931566],
                            [0.61328, 0.6148609, 0.7182701],
                            [1.3955007, 1.5121545, 0.83443564],
                            [0.97949594, 1.0675149, 0.6329687],
                            [0.531663, 0.5955577, 1.7500148],
                            [0.9624706, 0.72462326, 1.1481868],
                            [0.83221924, 1.0490936, 1.6875663],
                            [0.21132214, 0.4206159, 0.5372846],
                            [1.4440073, 1.8970833, 0.26985747],
                            [1.0294262, 1.4040797, 0.87554324],
                            [1.3766412, 0.65521795, 1.6813129],
                            [0.6650819, 0.71111923, 1.298853],
                            [0.41999173, 0.37906948, 1.7513971],
                            [0.59359556, 0.5912492, 0.73919016],
                            [0.50867593, 0.50656086, 0.30136237],
                            [1.1511526, 1.0546296, 0.49706793],
                            [0.47535285, 0.49249494, 0.5802117]]))))

default_hooks.update(dict(logger=dict(type=LoggerHook, interval=30)))

# Default setting for scaling LR automatically
# - `enable` means enable scaling LR automatically
#   or not by default.
# - `base_batch_size` = (8 GPUs) x (8 samples per GPU).
auto_scale_lr.update(dict(enable=False, base_batch_size=64))
mmdetection3d/mmdet3d/datasets/__init__.py
0 → 100644
View file @
7aa442d5
# Copyright (c) OpenMMLab. All rights reserved.
from
.dataset_wrappers
import
CBGSDataset
from
.det3d_dataset
import
Det3DDataset
from
.kitti_dataset
import
KittiDataset
from
.lyft_dataset
import
LyftDataset
from
.nuscenes_dataset
import
NuScenesDataset
# yapf: enable
from
.s3dis_dataset
import
S3DISDataset
,
S3DISSegDataset
from
.scannet_dataset
import
(
ScanNetDataset
,
ScanNetInstanceSegDataset
,
ScanNetSegDataset
)
from
.seg3d_dataset
import
Seg3DDataset
from
.semantickitti_dataset
import
SemanticKittiDataset
from
.sunrgbd_dataset
import
SUNRGBDDataset
# yapf: disable
from
.transforms
import
(
AffineResize
,
BackgroundPointsFilter
,
GlobalAlignment
,
GlobalRotScaleTrans
,
IndoorPatchPointSample
,
IndoorPointSample
,
LoadAnnotations3D
,
LoadPointsFromDict
,
LoadPointsFromFile
,
LoadPointsFromMultiSweeps
,
NormalizePointsColor
,
ObjectNameFilter
,
ObjectNoise
,
ObjectRangeFilter
,
ObjectSample
,
PointSample
,
PointShuffle
,
PointsRangeFilter
,
RandomDropPointsColor
,
RandomFlip3D
,
RandomJitterPoints
,
RandomResize3D
,
RandomShiftScale
,
Resize3D
,
VoxelBasedPointSampler
)
from
.utils
import
get_loading_pipeline
from
.waymo_dataset
import
WaymoDataset
# Public API of mmdet3d.datasets: dataset classes, transforms and helpers
# re-exported above.
__all__ = [
    'KittiDataset', 'CBGSDataset', 'NuScenesDataset', 'LyftDataset',
    'ObjectSample', 'RandomFlip3D', 'ObjectNoise', 'GlobalRotScaleTrans',
    'PointShuffle', 'ObjectRangeFilter', 'PointsRangeFilter',
    'LoadPointsFromFile', 'S3DISSegDataset', 'S3DISDataset',
    'NormalizePointsColor', 'IndoorPatchPointSample', 'IndoorPointSample',
    'PointSample', 'LoadAnnotations3D', 'GlobalAlignment', 'SUNRGBDDataset',
    'ScanNetDataset', 'ScanNetSegDataset', 'ScanNetInstanceSegDataset',
    'SemanticKittiDataset', 'Det3DDataset', 'Seg3DDataset',
    'LoadPointsFromMultiSweeps', 'WaymoDataset', 'BackgroundPointsFilter',
    'VoxelBasedPointSampler', 'get_loading_pipeline', 'RandomDropPointsColor',
    'RandomJitterPoints', 'ObjectNameFilter', 'AffineResize',
    'RandomShiftScale', 'LoadPointsFromDict', 'Resize3D', 'RandomResize3D',
]
mmdetection3d/mmdet3d/datasets/convert_utils.py
0 → 100644
View file @
7aa442d5
# Copyright (c) OpenMMLab. All rights reserved.
import
copy
import
warnings
from
typing
import
List
,
Optional
,
Tuple
,
Union
import
numpy
as
np
from
nuscenes
import
NuScenes
from
nuscenes.utils.geometry_utils
import
view_points
from
pyquaternion
import
Quaternion
from
shapely.geometry
import
MultiPoint
,
box
from
shapely.geometry.polygon
import
Polygon
from
mmdet3d.structures
import
Box3DMode
,
CameraInstance3DBoxes
,
points_cam2img
from
mmdet3d.structures.ops
import
box_np_ops
# Category names for each supported dataset, in label-id order (the index of
# a name in the tuple is its integer label).
kitti_categories = ('Pedestrian', 'Cyclist', 'Car', 'Van', 'Truck',
                    'Person_sitting', 'Tram', 'Misc')
waymo_categories = ('Car', 'Pedestrian', 'Cyclist')
nus_categories = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle',
                  'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone',
                  'barrier')
# nuScenes attribute names; index in this tuple is the attribute label id.
nus_attributes = ('cycle.with_rider', 'cycle.without_rider',
                  'pedestrian.moving', 'pedestrian.standing',
                  'pedestrian.sitting_lying_down', 'vehicle.moving',
                  'vehicle.parked', 'vehicle.stopped', 'None')
# Maps raw nuScenes category names to the 10 detection classes above.
NuScenesNameMapping = {
    'movable_object.barrier': 'barrier',
    'vehicle.bicycle': 'bicycle',
    'vehicle.bus.bendy': 'bus',
    'vehicle.bus.rigid': 'bus',
    'vehicle.car': 'car',
    'vehicle.construction': 'construction_vehicle',
    'vehicle.motorcycle': 'motorcycle',
    'human.pedestrian.adult': 'pedestrian',
    'human.pedestrian.child': 'pedestrian',
    'human.pedestrian.construction_worker': 'pedestrian',
    'human.pedestrian.police_officer': 'pedestrian',
    'movable_object.trafficcone': 'traffic_cone',
    'vehicle.trailer': 'trailer',
    'vehicle.truck': 'truck'
}
# Lyft categories map to themselves (identity mapping kept for a uniform
# interface with NuScenesNameMapping).
LyftNameMapping = {
    'bicycle': 'bicycle',
    'bus': 'bus',
    'car': 'car',
    'emergency_vehicle': 'emergency_vehicle',
    'motorcycle': 'motorcycle',
    'other_vehicle': 'other_vehicle',
    'pedestrian': 'pedestrian',
    'truck': 'truck',
    'animal': 'animal'
}
def get_nuscenes_2d_boxes(nusc: NuScenes, sample_data_token: str,
                          visibilities: List[str]) -> List[dict]:
    """Get the 2d / mono3d annotation records for a given `sample_data_token`
    of nuscenes dataset.

    Args:
        nusc (:obj:`NuScenes`): NuScenes class.
        sample_data_token (str): Sample data token belonging to a camera
            keyframe.
        visibilities (List[str]): Visibility filter.

    Return:
        List[dict]: List of 2d annotation record that belongs to the input
            `sample_data_token`.
    """
    # Get the sample data and the sample corresponding to that sample data.
    sd_rec = nusc.get('sample_data', sample_data_token)

    assert sd_rec['sensor_modality'] == 'camera', 'Error: get_2d_boxes only works' \
        ' for camera sample_data!'
    if not sd_rec['is_key_frame']:
        raise ValueError('The 2D re-projections are available only for keyframes.')

    s_rec = nusc.get('sample', sd_rec['sample_token'])

    # Get the calibrated sensor and ego pose
    # record to get the transformation matrices.
    cs_rec = nusc.get('calibrated_sensor', sd_rec['calibrated_sensor_token'])
    pose_rec = nusc.get('ego_pose', sd_rec['ego_pose_token'])
    camera_intrinsic = np.array(cs_rec['camera_intrinsic'])

    # Get all the annotation with the specified visibilties.
    ann_recs = [nusc.get('sample_annotation', token) for token in s_rec['anns']]
    ann_recs = [
        ann_rec for ann_rec in ann_recs
        if (ann_rec['visibility_token'] in visibilities)
    ]

    repro_recs = []

    for ann_rec in ann_recs:
        # Augment sample_annotation with token information.
        ann_rec['sample_annotation_token'] = ann_rec['token']
        ann_rec['sample_data_token'] = sample_data_token

        # Get the box in global coordinates.
        box = nusc.get_box(ann_rec['token'])

        # Move them to the ego-pose frame.
        box.translate(-np.array(pose_rec['translation']))
        box.rotate(Quaternion(pose_rec['rotation']).inverse)

        # Move them to the calibrated sensor frame.
        box.translate(-np.array(cs_rec['translation']))
        box.rotate(Quaternion(cs_rec['rotation']).inverse)

        # Filter out the corners that are not in front of the calibrated
        # sensor.
        corners_3d = box.corners()
        in_front = np.argwhere(corners_3d[2, :] > 0).flatten()
        corners_3d = corners_3d[:, in_front]

        # Project 3d box to 2d.
        corner_coords = view_points(corners_3d, camera_intrinsic,
                                    True).T[:, :2].tolist()

        # Keep only corners that fall within the image.
        final_coords = post_process_coords(corner_coords)

        # Skip if the convex hull of the re-projected corners
        # does not intersect the image canvas.
        if final_coords is None:
            continue
        else:
            min_x, min_y, max_x, max_y = final_coords

        # Generate dictionary record to be included in the .json file.
        repro_rec = generate_record(ann_rec, min_x, min_y, max_x, max_y,
                                    'nuscenes')

        # if repro_rec is None, we do not append it into repre_recs
        if repro_rec is not None:
            loc = box.center.tolist()

            dim = box.wlh
            dim[[0, 1, 2]] = dim[[1, 2, 0]]  # convert wlh to our lhw
            dim = dim.tolist()

            rot = box.orientation.yaw_pitch_roll[0]
            rot = [-rot]  # convert the rot to our cam coordinate

            global_velo2d = nusc.box_velocity(box.token)[:2]
            global_velo3d = np.array([*global_velo2d, 0.0])
            # Rotate the global-frame velocity into the camera frame via the
            # inverse ego-to-global and camera-to-ego rotations.
            e2g_r_mat = Quaternion(pose_rec['rotation']).rotation_matrix
            c2e_r_mat = Quaternion(cs_rec['rotation']).rotation_matrix
            cam_velo3d = global_velo3d @ np.linalg.inv(
                e2g_r_mat).T @ np.linalg.inv(c2e_r_mat).T
            # Keep only the (x, z) velocity components in camera coordinates.
            velo = cam_velo3d[0::2].tolist()

            repro_rec['bbox_3d'] = loc + dim + rot
            repro_rec['velocity'] = velo

            center_3d = np.array(loc).reshape([1, 3])
            center_2d_with_depth = points_cam2img(
                center_3d, camera_intrinsic, with_depth=True)
            center_2d_with_depth = center_2d_with_depth.squeeze().tolist()

            repro_rec['center_2d'] = center_2d_with_depth[:2]
            repro_rec['depth'] = center_2d_with_depth[2]
            # normalized center2D + depth
            # if samples with depth < 0 will be removed
            if repro_rec['depth'] <= 0:
                continue

            ann_token = nusc.get('sample_annotation',
                                 box.token)['attribute_tokens']
            if len(ann_token) == 0:
                attr_name = 'None'
            else:
                attr_name = nusc.get('attribute', ann_token[0])['name']
            attr_id = nus_attributes.index(attr_name)
            # repro_rec['attribute_name'] = attr_name
            repro_rec['attr_label'] = attr_id

            repro_recs.append(repro_rec)

    return repro_recs
def get_kitti_style_2d_boxes(info: dict,
                             cam_idx: int = 2,
                             occluded: Tuple[int] = (0, 1, 2, 3),
                             annos: Optional[dict] = None,
                             mono3d: bool = True,
                             dataset: str = 'kitti') -> List[dict]:
    """Get the 2d / mono3d annotation records for a given info.

    This function is used to get 2D/Mono3D annotations when loading annotations
    from a kitti-style dataset class, such as KITTI and Waymo dataset.

    Args:
        info (dict): Information of the given sample data.
        cam_idx (int): Camera id which the 2d / mono3d annotations to obtain
            belong to. In KITTI, typically only CAM 2 will be used,
            and in Waymo, multi cameras could be used.
            Defaults to 2.
        occluded (Tuple[int]): Integer (0, 1, 2, 3) indicating occlusion state:
            0 = fully visible, 1 = partly occluded, 2 = largely occluded,
            3 = unknown, -1 = DontCare.
            Defaults to (0, 1, 2, 3).
        annos (dict, optional): Original annotations. Defaults to None.
        mono3d (bool): Whether to get boxes with mono3d annotation.
            Defaults to True.
        dataset (str): Dataset name of getting 2d bboxes.
            Defaults to 'kitti'.

    Return:
        List[dict]: List of 2d / mono3d annotation record that
            belongs to the input camera id.
    """
    # Get calibration information
    camera_intrinsic = info['calib'][f'P{cam_idx}']

    repro_recs = []
    # if no annotations in info (test dataset), then return
    if annos is None:
        return repro_recs

    # Get all the annotation with the specified visibilties.
    # filter the annotation bboxes by occluded attributes
    ann_dicts = annos
    mask = [(ocld in occluded) for ocld in ann_dicts['occluded']]
    for k in ann_dicts.keys():
        ann_dicts[k] = ann_dicts[k][mask]

    # convert dict of list to list of dict
    ann_recs = []
    for i in range(len(ann_dicts['occluded'])):
        ann_rec = {}
        for k in ann_dicts.keys():
            ann_rec[k] = ann_dicts[k][i]
        ann_recs.append(ann_rec)

    for ann_idx, ann_rec in enumerate(ann_recs):
        # Augment sample_annotation with token information.
        ann_rec['sample_annotation_token'] = \
            f"{info['image']['image_idx']}.{ann_idx}"
        ann_rec['sample_data_token'] = info['image']['image_idx']

        loc = ann_rec['location'][np.newaxis, :]
        dim = ann_rec['dimensions'][np.newaxis, :]
        rot = ann_rec['rotation_y'][np.newaxis, np.newaxis]

        # transform the center from [0.5, 1.0, 0.5] to [0.5, 0.5, 0.5]
        dst = np.array([0.5, 0.5, 0.5])
        src = np.array([0.5, 1.0, 0.5])
        # gravity center
        loc_center = loc + dim * (dst - src)
        gt_bbox_3d = np.concatenate([loc_center, dim, rot],
                                    axis=1).astype(np.float32)

        # Filter out the corners that are not in front of the calibrated
        # sensor.
        corners_3d = box_np_ops.center_to_corner_box3d(
            gt_bbox_3d[:, :3],
            gt_bbox_3d[:, 3:6],
            gt_bbox_3d[:, 6], (0.5, 0.5, 0.5),
            axis=1)
        corners_3d = corners_3d[0].T  # (1, 8, 3) -> (3, 8)
        in_front = np.argwhere(corners_3d[2, :] > 0).flatten()
        corners_3d = corners_3d[:, in_front]

        # Project 3d box to 2d.
        corner_coords = view_points(corners_3d, camera_intrinsic,
                                    True).T[:, :2].tolist()

        # Keep only corners that fall within the image.
        # image_shape is stored (height, width); post_process_coords expects
        # (width, height).
        final_coords = post_process_coords(
            corner_coords,
            imsize=(info['image']['image_shape'][1],
                    info['image']['image_shape'][0]))

        # Skip if the convex hull of the re-projected corners
        # does not intersect the image canvas.
        if final_coords is None:
            continue
        else:
            min_x, min_y, max_x, max_y = final_coords

        # Generate dictionary record to be included in the .json file.
        repro_rec = generate_record(ann_rec, min_x, min_y, max_x, max_y,
                                    dataset)

        # If mono3d=True, add 3D annotations in camera coordinates
        if mono3d and (repro_rec is not None):
            # use bottom center to represent the bbox_3d
            repro_rec['bbox_3d'] = np.concatenate(
                [loc, dim, rot], axis=1).astype(np.float32).squeeze().tolist()
            repro_rec['velocity'] = -1  # no velocity in KITTI

            center_3d = np.array(loc_center).reshape([1, 3])
            center_2d_with_depth = points_cam2img(
                center_3d, camera_intrinsic, with_depth=True)
            center_2d_with_depth = center_2d_with_depth.squeeze().tolist()

            repro_rec['center_2d'] = center_2d_with_depth[:2]
            repro_rec['depth'] = center_2d_with_depth[2]
            # normalized center2D + depth
            # samples with depth < 0 will be removed
            if repro_rec['depth'] <= 0:
                continue
            repro_recs.append(repro_rec)

    return repro_recs
def convert_annos(info: dict, cam_idx: int) -> dict:
    """Convert front-cam anns to i-th camera (KITTI-style info).

    Boxes are round-tripped camera-0 -> LiDAR -> camera-i using the
    calibration matrices stored in ``info['calib']``.

    Args:
        info (dict): KITTI-style info dict with 'calib' and 'annos' keys.
        cam_idx (int): Index of the target camera.

    Returns:
        dict: Deep copy of ``info['annos']`` with 'location', 'dimensions'
            and 'rotation_y' expressed in camera ``cam_idx`` coordinates.
    """
    rect = info['calib']['R0_rect'].astype(np.float32)
    lidar2cam0 = info['calib']['Tr_velo_to_cam'].astype(np.float32)
    lidar2cami = info['calib'][f'Tr_velo_to_cam{cam_idx}'].astype(np.float32)
    annos = info['annos']
    converted_annos = copy.deepcopy(annos)
    loc = annos['location']
    dims = annos['dimensions']
    rots = annos['rotation_y']
    gt_bboxes_3d = np.concatenate([loc, dims, rots[..., np.newaxis]],
                                  axis=1).astype(np.float32)
    # convert gt_bboxes_3d to velodyne coordinates
    gt_bboxes_3d = CameraInstance3DBoxes(gt_bboxes_3d).convert_to(
        Box3DMode.LIDAR, np.linalg.inv(rect @ lidar2cam0), correct_yaw=True)
    # convert gt_bboxes_3d to cam coordinates
    gt_bboxes_3d = gt_bboxes_3d.convert_to(
        Box3DMode.CAM, rect @ lidar2cami, correct_yaw=True).numpy()
    converted_annos['location'] = gt_bboxes_3d[:, :3]
    converted_annos['dimensions'] = gt_bboxes_3d[:, 3:6]
    converted_annos['rotation_y'] = gt_bboxes_3d[:, 6]
    return converted_annos
def post_process_coords(
    corner_coords: List[int],
    imsize: Tuple[int] = (1600, 900)
) -> Union[Tuple[float], None]:
    """Clip the convex hull of reprojected box corners to the image canvas.

    Args:
        corner_coords (List[int]): Corner coordinates of reprojected
            bounding box.
        imsize (Tuple[int]): Size of the image canvas.
            Defaults to (1600, 900).

    Return:
        Tuple[float] or None: (min_x, min_y, max_x, max_y) of the clipped
            hull, or None if the hull does not intersect the canvas or the
            intersection is degenerate (not a polygon).
    """
    hull = MultiPoint(corner_coords).convex_hull
    canvas = box(0, 0, imsize[0], imsize[1])

    # No overlap with the image at all -> nothing to keep.
    if not hull.intersects(canvas):
        return None

    clipped = hull.intersection(canvas)
    # A degenerate intersection (point/line) cannot yield a 2D bbox.
    if not isinstance(clipped, Polygon):
        warnings.warn('img_intersection is not an object of Polygon.')
        return None

    pts = np.array([coord for coord in clipped.exterior.coords])
    xs = pts[:, 0]
    ys = pts[:, 1]
    return min(xs), min(ys), max(xs), max(ys)
def generate_record(ann_rec: dict, x1: float, y1: float, x2: float, y2: float,
                    dataset: str) -> Union[dict, None]:
    """Build one 2D annotation record from a 3D record and a 2D box.

    Args:
        ann_rec (dict): Original 3d annotation record.
        x1 (float): Minimum value of the x coordinate.
        y1 (float): Minimum value of the y coordinate.
        x2 (float): Maximum value of the x coordinate.
        y2 (float): Maximum value of the y coordinate.
        dataset (str): Name of dataset ('nuscenes', 'kitti' or 'waymo').

    Returns:
        dict or None: A sample 2d annotation record, or None when the
            category is not part of the target dataset's label set.

        - bbox_label (int): 2d box label id
        - bbox_label_3d (int): 3d box label id
        - bbox (List[float]): left x, top y, right x, bottom y of 2d box
        - bbox_3d_isvalid (bool): whether the box is valid
    """
    if dataset == 'nuscenes':
        # Raw nuScenes names must first be collapsed to detection classes.
        raw_name = ann_rec['category_name']
        if raw_name not in NuScenesNameMapping:
            return None
        cat_name = NuScenesNameMapping[raw_name]
        categories = nus_categories
    else:
        if dataset == 'kitti':
            categories = kitti_categories
        elif dataset == 'waymo':
            categories = waymo_categories
        else:
            raise NotImplementedError('Unsupported dataset!')
        cat_name = ann_rec['name']
        if cat_name not in categories:
            return None

    label = categories.index(cat_name)
    rec = dict()
    rec['bbox_label'] = label
    rec['bbox_label_3d'] = label
    rec['bbox'] = [x1, y1, x2, y2]
    rec['bbox_3d_isvalid'] = True
    return rec
mmdetection3d/mmdet3d/datasets/dataset_wrappers.py
0 → 100644
View file @
7aa442d5
# Copyright (c) OpenMMLab. All rights reserved.
import
copy
import
warnings
from
typing
import
List
,
Set
,
Union
import
numpy
as
np
from
mmengine.dataset
import
BaseDataset
,
force_full_init
from
mmdet3d.registry
import
DATASETS
@
DATASETS
.
register_module
()
class
CBGSDataset
:
"""A wrapper of class sampled dataset with ann_file path. Implementation of
paper `Class-balanced Grouping and Sampling for Point Cloud 3D Object
Detection <https://arxiv.org/abs/1908.09492>`_.
Balance the number of scenes under different classes.
Args:
dataset (:obj:`BaseDataset` or dict): The dataset to be class sampled.
lazy_init (bool): Whether to load annotation during instantiation.
Defaults to False.
"""
def __init__(self,
             dataset: Union[BaseDataset, dict],
             lazy_init: bool = False) -> None:
    # The wrapped dataset: built from a config dict via the registry, or
    # used directly when an instance is supplied.
    self.dataset: BaseDataset
    if isinstance(dataset, dict):
        self.dataset = DATASETS.build(dataset)
    elif isinstance(dataset, BaseDataset):
        self.dataset = dataset
    else:
        raise TypeError(
            'elements in datasets sequence should be config or '
            f'`BaseDataset` instance, but got {type(dataset)}')
    # Snapshot the wrapped dataset's meta information for `metainfo`.
    self._metainfo = self.dataset.metainfo

    self._fully_initialized = False
    if not lazy_init:
        self.full_init()
@property
def metainfo(self) -> dict:
    """Meta information of the wrapped dataset.

    Returns:
        dict: A deep copy, so callers cannot mutate the stored metainfo.
    """
    meta = self._metainfo
    return copy.deepcopy(meta)
def full_init(self) -> None:
    """Fully initialize the wrapped dataset and build sampling indices."""
    if self._fully_initialized:
        return

    self.dataset.full_init()
    # Class-balanced resampling over the now-initialized dataset.
    self.sample_indices = self._get_sample_indices(self.dataset)
    self._fully_initialized = True
def _get_sample_indices(self, dataset: BaseDataset) -> List[int]:
    """Build class-balanced sample indices over the wrapped dataset.

    Each sample is bucketed under every category it contains; buckets are
    then randomly subsampled so every class contributes roughly an equal
    share (1 / num_classes) of the final index list.

    Args:
        dataset (:obj:`BaseDataset`): The dataset.

    Returns:
        List[dict]: List of indices after class sampling.
    """
    classes = self.metainfo['classes']
    cat2id = {name: i for i, name in enumerate(classes)}
    class_sample_idxs = {cat_id: [] for cat_id in cat2id.values()}
    for idx in range(len(dataset)):
        sample_cat_ids = dataset.get_cat_ids(idx)
        for cat_id in sample_cat_ids:
            if cat_id != -1:
                # Filter categories that do not need to be cared.
                # -1 indicates dontcare in MMDet3D.
                class_sample_idxs[cat_id].append(idx)
    # Total bucket size (a sample is counted once per category it holds).
    duplicated_samples = sum(
        [len(v) for _, v in class_sample_idxs.items()])
    class_distribution = {
        k: len(v) / duplicated_samples
        for k, v in class_sample_idxs.items()
    }

    # Resample each class bucket so its share approaches 1 / num_classes.
    # NOTE(review): np.random.choice draws WITH replacement here, so indices
    # of rare classes can repeat — this appears intentional for balancing.
    sample_indices = []

    frac = 1.0 / len(classes)
    ratios = [frac / v for v in class_distribution.values()]
    for cls_inds, ratio in zip(list(class_sample_idxs.values()), ratios):
        sample_indices += np.random.choice(cls_inds,
                                           int(len(cls_inds) *
                                               ratio)).tolist()
    return sample_indices
@force_full_init
def _get_ori_dataset_idx(self, idx: int) -> int:
    """Convert global index to local index.

    Args:
        idx (int): Global index of ``CBGSDataset``.

    Returns:
        int: Local index of data.
    """
    # The balanced index table maps directly onto the wrapped dataset.
    local_idx = self.sample_indices[idx]
    return local_idx
@force_full_init
def get_cat_ids(self, idx: int) -> Set[int]:
    """Get category ids of class balanced dataset by index.

    Args:
        idx (int): Index of data.

    Returns:
        Set[int]: All categories in the sample of specified index.
    """
    # Translate the balanced index back to the wrapped dataset first.
    wrapped_idx = self._get_ori_dataset_idx(idx)
    return self.dataset.get_cat_ids(wrapped_idx)
@force_full_init
def get_data_info(self, idx: int) -> dict:
    """Get annotation by index.

    Args:
        idx (int): Global index of ``CBGSDataset``.

    Returns:
        dict: The idx-th annotation of the dataset.
    """
    # Delegate to the wrapped dataset after index translation.
    wrapped_idx = self._get_ori_dataset_idx(idx)
    return self.dataset.get_data_info(wrapped_idx)
def
__getitem__
(
self
,
idx
:
int
)
->
dict
:
"""Get item from infos according to the given index.
Args:
idx (int): The index of self.sample_indices.
Returns:
dict: Data dictionary of the corresponding index.
"""
if
not
self
.
_fully_initialized
:
warnings
.
warn
(
'Please call `full_init` method manually to '
'accelerate the speed.'
)
self
.
full_init
()
ori_index
=
self
.
_get_ori_dataset_idx
(
idx
)
return
self
.
dataset
[
ori_index
]
@force_full_init
def __len__(self) -> int:
    """Return the length of data infos.

    Returns:
        int: Length of data infos.
    """
    total = len(self.sample_indices)
    return total
def get_subset_(self, indices: Union[List[int], int]) -> None:
    """Unsupported: sub-dataset semantics are ambiguous for ``CBGSDataset``."""
    message = (
        '`CBGSDataset` does not support `get_subset` and '
        '`get_subset_` interfaces because this will lead to ambiguous '
        'implementation of some methods. If you want to use `get_subset` '
        'or `get_subset_` interfaces, please use them in the wrapped '
        'dataset first and then use `CBGSDataset`.')
    raise NotImplementedError(message)
def get_subset(self, indices: Union[List[int], int]) -> BaseDataset:
    """Unsupported: sub-dataset semantics are ambiguous for ``CBGSDataset``."""
    message = (
        '`CBGSDataset` does not support `get_subset` and '
        '`get_subset_` interfaces because this will lead to ambiguous '
        'implementation of some methods. If you want to use `get_subset` '
        'or `get_subset_` interfaces, please use them in the wrapped '
        'dataset first and then use `CBGSDataset`.')
    raise NotImplementedError(message)
mmdetection3d/mmdet3d/datasets/det3d_dataset.py
0 → 100644
View file @
7aa442d5
# Copyright (c) OpenMMLab. All rights reserved.
import
copy
import
os
from
os
import
path
as
osp
from
typing
import
Callable
,
List
,
Optional
,
Set
,
Union
import
numpy
as
np
import
torch
from
mmengine.dataset
import
BaseDataset
from
mmengine.logging
import
print_log
from
terminaltables
import
AsciiTable
from
mmdet3d.registry
import
DATASETS
from
mmdet3d.structures
import
get_box_type
@DATASETS.register_module()
class Det3DDataset(BaseDataset):
    """Base Class of 3D dataset.

    This is the base dataset of SUNRGB-D, ScanNet, nuScenes, and KITTI
    dataset.

    # TODO: doc link here for the standard data format

    Args:
        data_root (str, optional): The root directory for ``data_prefix`` and
            ``ann_file``. Defaults to None.
        ann_file (str): Annotation file path. Defaults to ''.
        metainfo (dict, optional): Meta information for dataset, such as class
            information. Defaults to None.
        data_prefix (dict): Prefix for training data. Defaults to
            dict(pts='velodyne', img='').
        pipeline (List[dict]): Pipeline used for data processing.
            Defaults to [].
        modality (dict): Modality to specify the sensor data used as input,
            it usually has following keys:

            - use_camera: bool
            - use_lidar: bool

            Defaults to dict(use_lidar=True, use_camera=False).
        default_cam_key (str, optional): The default camera name adopted.
            Defaults to None.
        box_type_3d (str): Type of 3D box of this dataset.
            Based on the `box_type_3d`, the dataset will encapsulate the box
            to its original format then converted them to `box_type_3d`.
            Defaults to 'LiDAR' in this dataset. Available options includes:

            - 'LiDAR': Box in LiDAR coordinates, usually for
              outdoor point cloud 3d detection.
            - 'Depth': Box in depth coordinates, usually for
              indoor point cloud 3d detection.
            - 'Camera': Box in camera coordinates, usually
              for vision-based 3d detection.
        filter_empty_gt (bool): Whether to filter the data with empty GT.
            If it's set to be True, the example with empty annotations after
            data pipeline will be dropped and a random example will be chosen
            in `__getitem__`. Defaults to True.
        test_mode (bool): Whether the dataset is in test mode.
            Defaults to False.
        load_eval_anns (bool): Whether to load annotations in test_mode,
            the annotation will be save in `eval_ann_infos`, which can be
            used in Evaluator. Defaults to True.
        backend_args (dict, optional): Arguments to instantiate the
            corresponding backend. Defaults to None.
        show_ins_var (bool): For debug purpose. Whether to show variation
            of the number of instances before and after through pipeline.
            Defaults to False.
    """

    def __init__(self,
                 data_root: Optional[str] = None,
                 ann_file: str = '',
                 metainfo: Optional[dict] = None,
                 data_prefix: dict = dict(pts='velodyne', img=''),
                 pipeline: List[Union[dict, Callable]] = [],
                 modality: dict = dict(use_lidar=True, use_camera=False),
                 default_cam_key: Optional[str] = None,
                 box_type_3d: str = 'LiDAR',
                 filter_empty_gt: bool = True,
                 test_mode: bool = False,
                 load_eval_anns: bool = True,
                 backend_args: Optional[dict] = None,
                 show_ins_var: bool = False,
                 **kwargs) -> None:
        self.backend_args = backend_args
        self.filter_empty_gt = filter_empty_gt
        self.load_eval_anns = load_eval_anns
        _default_modality_keys = ('use_lidar', 'use_camera')
        if modality is None:
            modality = dict()

        # Defaults to False if not specify
        for key in _default_modality_keys:
            if key not in modality:
                modality[key] = False
        self.modality = modality
        self.default_cam_key = default_cam_key
        # at least one modality must be enabled, otherwise no input exists
        assert self.modality['use_lidar'] or self.modality['use_camera'], (
            'Please specify the `modality` (`use_lidar` '
            f', `use_camera`) for {self.__class__.__name__}')

        # resolve the box class and mode enum from the string name
        self.box_type_3d, self.box_mode_3d = get_box_type(box_type_3d)

        if metainfo is not None and 'classes' in metainfo:
            # we allow to train on subset of self.METAINFO['classes']
            # map unselected labels to -1
            self.label_mapping = {
                i: -1
                for i in range(len(self.METAINFO['classes']))
            }
            self.label_mapping[-1] = -1
            for label_idx, name in enumerate(metainfo['classes']):
                ori_label = self.METAINFO['classes'].index(name)
                self.label_mapping[ori_label] = label_idx

            # per-category instance counters, indexed by training label
            self.num_ins_per_cat = [0] * len(metainfo['classes'])
        else:
            # identity mapping when training on the full class list
            self.label_mapping = {
                i: i
                for i in range(len(self.METAINFO['classes']))
            }
            self.label_mapping[-1] = -1

            self.num_ins_per_cat = [0] * len(self.METAINFO['classes'])

        super().__init__(
            ann_file=ann_file,
            metainfo=metainfo,
            data_root=data_root,
            data_prefix=data_prefix,
            pipeline=pipeline,
            test_mode=test_mode,
            **kwargs)

        # can be accessed by other component in runner
        self.metainfo['box_type_3d'] = box_type_3d
        self.metainfo['label_mapping'] = self.label_mapping

        if not kwargs.get('lazy_init', False):
            # used for showing variation of the number of instances before and
            # after through the pipeline
            self.show_ins_var = show_ins_var

            # show statistics of this dataset
            print_log('-' * 30, 'current')
            print_log(
                f'The length of {"test" if self.test_mode else "training"} dataset: {len(self)}',  # noqa: E501
                'current')
            content_show = [['category', 'number']]
            for label, num in enumerate(self.num_ins_per_cat):
                cat_name = self.metainfo['classes'][label]
                content_show.append([cat_name, num])
            table = AsciiTable(content_show)
            print_log(
                f'The number of instances per category in the dataset:\n{table.table}',  # noqa: E501
                'current')
def
_remove_dontcare
(
self
,
ann_info
:
dict
)
->
dict
:
"""Remove annotations that do not need to be cared.
-1 indicates dontcare in MMDet3d.
Args:
ann_info (dict): Dict of annotation infos. The
instance with label `-1` will be removed.
Returns:
dict: Annotations after filtering.
"""
img_filtered_annotations
=
{}
filter_mask
=
ann_info
[
'gt_labels_3d'
]
>
-
1
for
key
in
ann_info
.
keys
():
if
key
!=
'instances'
:
img_filtered_annotations
[
key
]
=
(
ann_info
[
key
][
filter_mask
])
else
:
img_filtered_annotations
[
key
]
=
ann_info
[
key
]
return
img_filtered_annotations
def get_ann_info(self, index: int) -> dict:
    """Get annotation info according to the given index.

    Use index to get the corresponding annotations, thus the
    evalhook could use this api.

    Args:
        index (int): Index of the annotation data to get.

    Returns:
        dict: Annotation information.
    """
    data_info = self.get_data_info(index)
    # In test mode the raw info carries no parsed `ann_info` yet, so it
    # is parsed on demand; otherwise the cached annotations are reused.
    if 'ann_info' in data_info:
        return data_info['ann_info']
    return self.parse_ann_info(data_info)
def parse_ann_info(self, info: dict) -> Union[dict, None]:
    """Process the `instances` in data info to `ann_info`.

    In `Custom3DDataset`, we simply concatenate all the field
    in `instances` to `np.ndarray`, you can do the specific
    process in subclass. You have to convert `gt_bboxes_3d`
    to different coordinates according to the task.

    Args:
        info (dict): Info dict.

    Returns:
        dict or None: Processed `ann_info`.
    """
    # add s or gt prefix for most keys after concat
    # we only process 3d annotations here, the corresponding
    # 2d annotation process is in the `LoadAnnotations3D`
    # in `transforms`
    name_mapping = {
        'bbox_label_3d': 'gt_labels_3d',
        'bbox_label': 'gt_bboxes_labels',
        'bbox': 'gt_bboxes',
        'bbox_3d': 'gt_bboxes_3d',
        'depth': 'depths',
        'center_2d': 'centers_2d',
        'attr_label': 'attr_labels',
        'velocity': 'velocities',
    }
    instances = info['instances']
    # empty gt
    if not instances:
        return None

    ann_info = dict()
    for ann_name in instances[0].keys():
        values = [inst[ann_name] for inst in instances]
        is_label = 'label' in ann_name
        # map the original dataset label to training label
        if is_label and ann_name != 'attr_label':
            values = [self.label_mapping[v] for v in values]
        target_name = name_mapping.get(ann_name, ann_name)
        # labels become int64, mapped geometry fields become float32,
        # everything else keeps numpy's inferred dtype
        if is_label:
            values = np.array(values).astype(np.int64)
        elif ann_name in name_mapping:
            values = np.array(values).astype(np.float32)
        else:
            values = np.array(values)
        ann_info[target_name] = values
    ann_info['instances'] = info['instances']

    # accumulate per-category statistics for the dataset summary table
    for label in ann_info['gt_labels_3d']:
        if label != -1:
            self.num_ins_per_cat[label] += 1

    return ann_info
def parse_data_info(self, info: dict) -> dict:
    """Process the raw data info.

    Convert all relative path of needed modality data file to
    the absolute path. And process the `instances` field to
    `ann_info` in training stage.

    Args:
        info (dict): Raw info dict.

    Returns:
        dict: Has `ann_info` in training stage. And
        all path has been converted to absolute path.
    """
    if self.modality['use_lidar']:
        # prepend the point-cloud prefix to the relative lidar path
        info['lidar_points']['lidar_path'] = \
            osp.join(
                self.data_prefix.get('pts', ''),
                info['lidar_points']['lidar_path'])

        info['num_pts_feats'] = info['lidar_points']['num_pts_feats']
        info['lidar_path'] = info['lidar_points']['lidar_path']
        if 'lidar_sweeps' in info:
            for sweep in info['lidar_sweeps']:
                # keep only the file name of the sweep and re-root it
                # under the configured prefix
                file_suffix = sweep['lidar_points']['lidar_path'].split(
                    os.sep)[-1]
                if 'samples' in sweep['lidar_points']['lidar_path']:
                    sweep['lidar_points']['lidar_path'] = osp.join(
                        self.data_prefix['pts'], file_suffix)
                else:
                    # non-keyframe sweeps live under a separate prefix
                    sweep['lidar_points']['lidar_path'] = osp.join(
                        self.data_prefix['sweeps'], file_suffix)

    if self.modality['use_camera']:
        for cam_id, img_info in info['images'].items():
            if 'img_path' in img_info:
                # a per-camera prefix (if configured) takes precedence
                # over the generic 'img' prefix
                if cam_id in self.data_prefix:
                    cam_prefix = self.data_prefix[cam_id]
                else:
                    cam_prefix = self.data_prefix.get('img', '')
                img_info['img_path'] = osp.join(cam_prefix,
                                                img_info['img_path'])
        if self.default_cam_key is not None:
            # expose the default camera's path and transform matrices at
            # the top level of the info dict
            info['img_path'] = info['images'][
                self.default_cam_key]['img_path']
            if 'lidar2cam' in info['images'][self.default_cam_key]:
                info['lidar2cam'] = np.array(
                    info['images'][self.default_cam_key]['lidar2cam'])
            if 'cam2img' in info['images'][self.default_cam_key]:
                info['cam2img'] = np.array(
                    info['images'][self.default_cam_key]['cam2img'])
            if 'lidar2img' in info['images'][self.default_cam_key]:
                info['lidar2img'] = np.array(
                    info['images'][self.default_cam_key]['lidar2img'])
            else:
                # derive the lidar->image projection by composing the
                # two known transforms
                info['lidar2img'] = info['cam2img'] @ info['lidar2cam']

    if not self.test_mode:
        # used in training
        info['ann_info'] = self.parse_ann_info(info)
    if self.test_mode and self.load_eval_anns:
        # keep annotations available for the evaluator in test mode
        info['eval_ann_info'] = self.parse_ann_info(info)

    return info
def _show_ins_var(self, old_labels: np.ndarray,
                  new_labels: torch.Tensor) -> None:
    """Show variation of the number of instances before and after through
    the pipeline.

    Args:
        old_labels (np.ndarray): The labels before through the pipeline.
        new_labels (torch.Tensor): The labels after through the pipeline.
    """

    def _count_per_cat(labels):
        # Tally instances per category name, skipping dontcare (-1).
        counts = dict()
        for label in labels:
            if label != -1:
                cat_name = self.metainfo['classes'][label]
                counts[cat_name] = counts.get(cat_name, 0) + 1
        return counts

    ori_num_per_cat = _count_per_cat(old_labels)
    new_num_per_cat = _count_per_cat(new_labels)

    content_show = [['category', 'new number', 'ori number']]
    for cat_name, num in ori_num_per_cat.items():
        content_show.append(
            [cat_name, new_num_per_cat.get(cat_name, 0), num])
    table = AsciiTable(content_show)
    print_log(
        'The number of instances per category after and before '
        f'through pipeline:\n{table.table}', 'current')
def prepare_data(self, index: int) -> Union[dict, None]:
    """Data preparation for both training and testing stage.

    Called by `__getitem__` of dataset.

    Args:
        index (int): Index for accessing the target data.

    Returns:
        dict or None: Data dict of the corresponding index.
    """
    ori_input_dict = self.get_data_info(index)

    # deepcopy here to avoid inplace modification in pipeline.
    input_dict = copy.deepcopy(ori_input_dict)

    # box_type_3d (str): 3D box type.
    input_dict['box_type_3d'] = self.box_type_3d
    # box_mode_3d (str): 3D box mode.
    input_dict['box_mode_3d'] = self.box_mode_3d

    drop_empty = (not self.test_mode) and self.filter_empty_gt

    # pre-pipline return None to random another in `__getitem__`
    if drop_empty and len(input_dict['ann_info']['gt_labels_3d']) == 0:
        return None

    example = self.pipeline(input_dict)

    # after pipeline drop the example with empty annotations
    # return None to random another in `__getitem__`
    if drop_empty:
        if example is None or len(
                example['data_samples'].gt_instances_3d.labels_3d) == 0:
            return None

    if self.show_ins_var:
        if 'ann_info' in ori_input_dict:
            self._show_ins_var(
                ori_input_dict['ann_info']['gt_labels_3d'],
                example['data_samples'].gt_instances_3d.labels_3d)
        else:
            print_log(
                "'ann_info' is not in the input dict. It's probably that "
                'the data is not in training mode',
                'current',
                level=30)

    return example
def get_cat_ids(self, idx: int) -> Set[int]:
    """Get category ids by index. Dataset wrapped by ClassBalancedDataset
    must implement this method.

    The ``CBGSDataset`` or ``ClassBalancedDataset``requires a subclass
    which implements this method.

    Args:
        idx (int): The index of data.

    Returns:
        set[int]: All categories in the sample of specified index.
    """
    labels = self.get_data_info(idx)['ann_info']['gt_labels_3d']
    return set(labels.tolist())
Prev
1
…
19
20
21
22
23
24
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment