Commit f2b01720 authored by liyinhao

Merge branch 'master' into process_raw_data

parents 08c8adb6 47850641
# model settings
voxel_size = [0.25, 0.25, 8]
point_cloud_range = [-50, -50, -5, 50, 50, 3]
class_names = [
'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
]
model = dict(
type='MVXFasterRCNNV2',
pts_voxel_layer=dict(
max_num_points=64, # max_points_per_voxel
point_cloud_range=point_cloud_range, # velodyne coordinates, x, y, z
voxel_size=voxel_size,
max_voxels=(30000, 40000), # (training, testing) max_voxels
),
pts_voxel_encoder=dict(
type='HardVFE',
num_input_features=4,
num_filters=[64, 64],
with_distance=False,
voxel_size=voxel_size,
with_cluster_center=True,
with_voxel_center=True,
point_cloud_range=point_cloud_range,
norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)),
pts_middle_encoder=dict(
type='PointPillarsScatter',
in_channels=64,
output_shape=[400, 400], # checked from PointCloud3D
),
pts_backbone=dict(
type='SECOND',
in_channels=64,
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
layer_nums=[3, 5, 5],
layer_strides=[2, 2, 2],
out_channels=[64, 128, 256],
),
pts_neck=dict(
type='SECONDFPN',
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
in_channels=[64, 128, 256],
upsample_strides=[1, 2, 4],
out_channels=[128, 128, 128],
),
pts_bbox_head=dict(
type='Anchor3DHead',
num_classes=10,
in_channels=384,
feat_channels=384,
use_direction_classifier=True,
anchor_generator=dict(
type='Anchor3DRangeGenerator',
ranges=[
[-49.6, -49.6, -1.80032795, 49.6, 49.6, -1.80032795],
[-49.6, -49.6, -1.74440365, 49.6, 49.6, -1.74440365],
[-49.6, -49.6, -1.68526504, 49.6, 49.6, -1.68526504],
[-49.6, -49.6, -1.67339111, 49.6, 49.6, -1.67339111],
[-49.6, -49.6, -1.61785072, 49.6, 49.6, -1.61785072],
[-49.6, -49.6, -1.80984986, 49.6, 49.6, -1.80984986],
[-49.6, -49.6, -1.763965, 49.6, 49.6, -1.763965],
],
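# Note (assumption): each z value above is the per-class mean bottom-center
# height of ground-truth boxes on nuScenes, pairing one anchor range with
# one anchor size below.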
sizes=[
[1.95017717, 4.60718145, 1.72270761], # car
[2.4560939, 6.73778078, 2.73004906], # truck
[2.87427237, 12.01320693, 3.81509561], # trailer
[0.60058911, 1.68452161, 1.27192197], # bicycle
[0.66344886, 0.7256437, 1.75748069], # pedestrian
[0.39694519, 0.40359262, 1.06232151], # traffic_cone
[2.49008838, 0.48578221, 0.98297065], # barrier
],
custom_values=[0, 0],
rotations=[0, 1.57],
reshape_out=True),
assigner_per_size=False,
diff_rad_by_sin=True,
dir_offset=0.7854, # pi/4
dir_limit_offset=0,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)))
# model training and testing settings
train_cfg = dict(
pts=dict(
assigner=dict( # for Car
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.6,
neg_iou_thr=0.3,
min_pos_iou=0.3,
ignore_iof_thr=-1),
allowed_border=0,
code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
pos_weight=-1,
debug=False))
test_cfg = dict(
pts=dict(
use_rotate_nms=True,
nms_across_levels=False,
nms_pre=1000,
nms_thr=0.2,
score_thr=0.05,
min_bbox_size=0,
max_num=500
# soft-nms is also supported for rcnn testing
# e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05)
))
# dataset settings
dataset_type = 'NuScenesDataset'
data_root = 'data/nuscenes/'
img_norm_cfg = dict(
mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
input_modality = dict(
use_lidar=True,
use_depth=False,
use_lidar_intensity=True,
use_camera=False,
)
db_sampler = dict(
data_root=data_root,
info_path=data_root + 'nuscenes_dbinfos_train.pkl',
rate=1.0,
object_rot_range=[0.0, 0.0],
prepare=dict(),
classes=class_names,
sample_groups=dict(
bus=4,
trailer=4,
truck=4,
))
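# Note (assumption on sampler semantics): sample_groups asks the GT-database
# sampler to top up each training scene to roughly N instances per class.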
file_client_args = dict(
backend='petrel',
path_mapping=dict({
'./data/nuscenes/': 's3://nuscenes/nuscenes/',
'data/nuscenes/': 's3://nuscenes/nuscenes/'
}))
train_pipeline = [
dict(
type='LoadPointsFromFile',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
file_client_args=file_client_args),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(
type='GlobalRotScale',
rot_uniform_noise=[-0.3925, 0.3925],
scaling_uniform_noise=[0.95, 1.05],
trans_normal_noise=[0, 0, 0]),
dict(type='RandomFlip3D', flip_ratio=0.5),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']),
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
file_client_args=file_client_args),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='RandomFlip3D', flip_ratio=0),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points']),
]
data = dict(
samples_per_gpu=4,
workers_per_gpu=4,
train=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_train.pkl',
pipeline=train_pipeline,
modality=input_modality,
classes=class_names,
test_mode=False),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_val.pkl',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_val.pkl',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True))
# optimizer
optimizer = dict(type='AdamW', lr=0.001, weight_decay=0.01)
# max_norm=10 is better for SECOND
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=1.0 / 1000,
step=[20, 23])
momentum_config = None
checkpoint_config = dict(interval=1)
# yapf:disable
evaluation = dict(interval=24)
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
# yapf:enable
# runtime settings
total_epochs = 24
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d'
load_from = None
resume_from = None
workflow = [('train', 1)]
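# Usage sketch (illustrative, not part of this commit): a config like the one
# above is typically consumed via mmcv's Config and the mmdet3d builders;
# exact entry points may differ across versions.
#   from mmcv import Config
#   from mmdet3d.models import build_detector
#   cfg = Config.fromfile('configs/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d.py')
#   model = build_detector(cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)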
# model settings
voxel_size = [0.05, 0.05, 0.1]
point_cloud_range = [0, -40, -3, 70.4, 40, 1] # velodyne coordinates, x, y, z
model = dict(
type='VoxelNet',
voxel_layer=dict(
max_num_points=5, # max_points_per_voxel
point_cloud_range=point_cloud_range,
voxel_size=voxel_size,
max_voxels=(16000, 40000), # (training, testing) max_voxels
),
voxel_encoder=dict(
type='VoxelFeatureExtractorV3',
num_input_features=4,
num_filters=[4],
with_distance=False),
middle_encoder=dict(
type='SparseEncoder',
in_channels=4,
sparse_shape=[41, 1600, 1408],
order=('conv', 'norm', 'act')),
backbone=dict(
type='SECOND',
in_channels=256,
layer_nums=[5, 5],
layer_strides=[1, 2],
out_channels=[128, 256],
),
neck=dict(
type='SECONDFPN',
in_channels=[128, 256],
upsample_strides=[1, 2],
out_channels=[256, 256],
),
bbox_head=dict(
type='Anchor3DHead',
num_classes=1,
in_channels=512,
feat_channels=512,
use_direction_classifier=True,
anchor_generator=dict(
type='Anchor3DRangeGenerator',
ranges=[[0, -40.0, -1.78, 70.4, 40.0, -1.78]],
sizes=[[1.6, 3.9, 1.56]],
rotations=[0, 1.57],
reshape_out=True),
diff_rad_by_sin=True,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2),
),
)
# model training and testing settings
train_cfg = dict(
assigner=dict(
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.6,
neg_iou_thr=0.45,
min_pos_iou=0.45,
ignore_iof_thr=-1),
allowed_border=0,
pos_weight=-1,
debug=False)
test_cfg = dict(
use_rotate_nms=True,
nms_across_levels=False,
nms_thr=0.01,
score_thr=0.3,
min_bbox_size=0,
nms_pre=100,
max_num=50)
# dataset settings
dataset_type = 'KittiDataset'
data_root = 'data/kitti/'
class_names = ['Car']
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
input_modality = dict(
use_lidar=False,
use_lidar_reduced=True,
use_depth=False,
use_lidar_intensity=True,
use_camera=False,
)
db_sampler = dict(
data_root=data_root,
info_path=data_root + 'kitti_dbinfos_train.pkl',
rate=1.0,
object_rot_range=[0.0, 0.0],
prepare=dict(
filter_by_difficulty=[-1],
filter_by_min_points=dict(Car=5),
),
classes=class_names,
sample_groups=dict(Car=15),
)
file_client_args = dict(
backend='petrel', path_mapping=dict(data='s3://kitti_data/'))
train_pipeline = [
dict(
type='LoadPointsFromFile',
load_dim=4,
use_dim=4,
file_client_args=file_client_args),
dict(
type='LoadAnnotations3D',
with_bbox_3d=True,
with_label_3d=True,
file_client_args=file_client_args),
dict(type='ObjectSample', db_sampler=db_sampler),
dict(
type='ObjectNoise',
num_try=100,
loc_noise_std=[1.0, 1.0, 0.5],
global_rot_range=[0.0, 0.0],
rot_uniform_noise=[-0.78539816, 0.78539816]),
dict(type='RandomFlip3D', flip_ratio=0.5),
dict(
type='GlobalRotScale',
rot_uniform_noise=[-0.78539816, 0.78539816],
scaling_uniform_noise=[0.95, 1.05]),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']),
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
load_dim=4,
use_dim=4,
file_client_args=file_client_args),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points']),
]
data = dict(
samples_per_gpu=6,
workers_per_gpu=4,
train=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_train.pkl',
split='training',
pts_prefix='velodyne_reduced',
pipeline=train_pipeline,
modality=input_modality,
classes=class_names,
test_mode=False),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_val.pkl',
split='training',
pts_prefix='velodyne_reduced',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_val.pkl',
split='training',
pts_prefix='velodyne_reduced',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True))
# optimizer
lr = 0.0018 # max learning rate
optimizer = dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01)
optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2))
lr_config = dict(
policy='cyclic',
target_ratio=(10, 1e-4),
cyclic_times=1,
step_ratio_up=0.4,
)
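# Worked example (derived from the values above): with the cyclic policy,
# target_ratio=(10, 1e-4) means the LR ramps from 0.0018 up to
# 0.0018 * 10 = 0.018 over the first 40% of iterations (step_ratio_up=0.4),
# then anneals down to 0.0018 * 1e-4 = 1.8e-7 over the remaining 60%.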
momentum_config = dict(
policy='cyclic',
target_ratio=(0.85 / 0.95, 1),
cyclic_times=1,
step_ratio_up=0.4,
)
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
# yapf:enable
# runtime settings
total_epochs = 80
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/sec_secfpn_80e'
load_from = None
resume_from = None
workflow = [('train', 1)]
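# Launch sketch (illustrative, config path assumed): distributed training via
# the launcher shipped with mmdetection-style repos, e.g.
#   ./tools/dist_train.sh configs/sec_secfpn_80e.py 8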
......@@ -28,8 +28,8 @@ model = dict(
),
pts_voxel_encoder=dict(
type='DynamicVFE',
num_input_features=4,
num_filters=[64, 64],
in_channels=4,
feat_channels=[64, 64],
with_distance=False,
voxel_size=voxel_size,
with_cluster_center=True,
......
......@@ -12,8 +12,8 @@ model = dict(
),
voxel_encoder=dict(
type='DynamicPillarFeatureNet',
num_input_features=4,
num_filters=[64],
in_channels=4,
feat_channels=[64],
with_distance=False,
voxel_size=voxel_size,
point_cloud_range=point_cloud_range,
......@@ -189,6 +189,7 @@ momentum_config = dict(
step_ratio_up=0.4,
)
checkpoint_config = dict(interval=1)
evaluation = dict(interval=2)
# yapf:disable
log_config = dict(
interval=50,
......
......@@ -11,8 +11,7 @@ model = dict(
max_voxels=(-1, -1), # (training, testing) max_voxels
),
voxel_encoder=dict(
type='DynamicVFEV3',
num_input_features=4,
type='DynamicSimpleVFE',
voxel_size=voxel_size,
point_cloud_range=point_cloud_range),
middle_encoder=dict(
......@@ -214,6 +213,7 @@ lr_config = dict(
min_lr_ratio=1e-5)
momentum_config = None
checkpoint_config = dict(interval=1)
evaluation = dict(interval=2)
# yapf:disable
log_config = dict(
interval=50,
......
......@@ -11,8 +11,7 @@ model = dict(
max_voxels=(-1, -1), # (training, testing) max_voxels
),
voxel_encoder=dict(
type='DynamicVFEV3',
num_input_features=4,
type='DynamicSimpleVFE',
voxel_size=voxel_size,
point_cloud_range=point_cloud_range),
middle_encoder=dict(
......@@ -184,6 +183,7 @@ momentum_config = dict(
step_ratio_up=0.4,
)
checkpoint_config = dict(interval=1)
evaluation = dict(interval=2)
# yapf:disable
log_config = dict(
interval=50,
......
......@@ -2,7 +2,7 @@
norm_cfg = dict(type='BN', requires_grad=False)
model = dict(
type='FasterRCNN',
pretrained=('open-mmlab://resnet50_caffe_bgr'),
pretrained=('open-mmlab://detectron2/resnet50_caffe'),
backbone=dict(
type='ResNet',
depth=50,
......
......@@ -10,11 +10,7 @@ model = dict(
voxel_size=voxel_size,
max_voxels=(16000, 40000) # (training, testing) max_voxels
),
voxel_encoder=dict(
type='VoxelFeatureExtractorV3',
num_input_features=4,
num_filters=[4],
with_distance=False),
voxel_encoder=dict(type='HardSimpleVFE'),
middle_encoder=dict(
type='SparseUNet',
in_channels=4,
......@@ -306,6 +302,7 @@ momentum_config = dict(
cyclic_times=1,
step_ratio_up=0.4)
checkpoint_config = dict(interval=1)
evaluation = dict(interval=2)
# yapf:disable
log_config = dict(
interval=50,
......
......@@ -10,11 +10,7 @@ model = dict(
voxel_size=voxel_size,
max_voxels=(16000, 40000) # (training, testing) max_voxels
),
voxel_encoder=dict(
type='VoxelFeatureExtractorV3',
num_input_features=4,
num_filters=[4],
with_distance=False),
voxel_encoder=dict(type='HardSimpleVFE'),
middle_encoder=dict(
type='SparseUNet',
in_channels=4,
......@@ -265,6 +261,7 @@ momentum_config = dict(
cyclic_times=1,
step_ratio_up=0.4)
checkpoint_config = dict(interval=1)
evaluation = dict(interval=2)
# yapf:disable
log_config = dict(
interval=50,
......
......@@ -10,8 +10,8 @@ model = dict(
),
voxel_encoder=dict(
type='PillarFeatureNet',
num_input_features=4,
num_filters=[64],
in_channels=4,
feat_channels=[64],
with_distance=False,
# these two arguments should be consistent with the voxel_generator
voxel_size=[0.16, 0.16, 4],
......@@ -192,6 +192,7 @@ momentum_config = dict(
step_ratio_up=0.4,
)
checkpoint_config = dict(interval=1)
evaluation = dict(interval=2)
# yapf:disable
log_config = dict(
interval=50,
......
......@@ -10,11 +10,7 @@ model = dict(
voxel_size=voxel_size,
max_voxels=(16000, 40000), # (training, testing) max_voxels
),
voxel_encoder=dict(
type='VoxelFeatureExtractorV3',
num_input_features=4,
num_filters=[4],
with_distance=False),
voxel_encoder=dict(type='HardSimpleVFE'),
middle_encoder=dict(
type='SparseEncoder',
in_channels=4,
......@@ -104,9 +100,21 @@ db_sampler = dict(
classes=class_names,
sample_groups=dict(Car=15),
)
file_client_args = dict(backend='disk')
# file_client_args = dict(
# backend='petrel', path_mapping=dict(data='s3://kitti_data/'))
train_pipeline = [
dict(type='LoadPointsFromFile', load_dim=4, use_dim=4),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(
type='LoadPointsFromFile',
load_dim=4,
use_dim=4,
file_client_args=file_client_args),
dict(
type='LoadAnnotations3D',
with_bbox_3d=True,
with_label_3d=True,
file_client_args=file_client_args),
dict(type='ObjectSample', db_sampler=db_sampler),
dict(
type='ObjectNoise',
......@@ -126,7 +134,11 @@ train_pipeline = [
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']),
]
test_pipeline = [
dict(type='LoadPointsFromFile', load_dim=4, use_dim=4),
dict(
type='LoadPointsFromFile',
load_dim=4,
use_dim=4,
file_client_args=file_client_args),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(
type='DefaultFormatBundle3D',
......@@ -139,15 +151,18 @@ data = dict(
samples_per_gpu=6,
workers_per_gpu=4,
train=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_train.pkl',
split='training',
pts_prefix='velodyne_reduced',
pipeline=train_pipeline,
modality=input_modality,
classes=class_names,
test_mode=False),
type='RepeatDataset',
times=2,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_train.pkl',
split='training',
pts_prefix='velodyne_reduced',
pipeline=train_pipeline,
modality=input_modality,
classes=class_names,
test_mode=False)),
val=dict(
type=dataset_type,
data_root=data_root,
......@@ -185,6 +200,7 @@ momentum_config = dict(
step_ratio_up=0.4,
)
checkpoint_config = dict(interval=1)
evaluation = dict(interval=1)
# yapf:disable
log_config = dict(
interval=50,
......@@ -194,7 +210,7 @@ log_config = dict(
])
# yapf:enable
# runtime settings
total_epochs = 80
total_epochs = 40
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/sec_secfpn_80e'
......
......@@ -2,7 +2,7 @@
norm_cfg = dict(type='BN', requires_grad=False)
model = dict(
type='FasterRCNN',
pretrained=('open-mmlab://resnet50_caffe_bgr'),
pretrained=('open-mmlab://detectron2/resnet50_caffe'),
backbone=dict(
type='ResNet',
depth=50,
......@@ -120,13 +120,25 @@ classes = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
'motorcycle', 'pedestrian', 'traffic_cone', 'barrier')
img_norm_cfg = dict(
mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
# file_client_args = dict(backend='disk')
file_client_args = dict(
backend='petrel',
path_mapping=dict({
'./data/nuscenes/': 's3://nuscenes/nuscenes/',
'data/nuscenes/': 's3://nuscenes/nuscenes/'
}))
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True, with_mask=False),
dict(type='LoadImageFromFile', file_client_args=file_client_args),
dict(
type='LoadAnnotations',
with_bbox=True,
with_mask=False,
file_client_args=file_client_args),
dict(
type='Resize',
img_scale=[(1200, 720), (1920, 1080)],
multiscale_mode='range',
img_scale=(1280, 720),
ratio_range=(0.75, 1.25),
keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
......@@ -135,10 +147,10 @@ train_pipeline = [
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadImageFromFile', file_client_args=file_client_args),
dict(
type='MultiScaleFlipAug',
img_scale=(1600, 900),
img_scale=(1280, 720),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
......@@ -192,6 +204,6 @@ total_epochs = 12
dist_params = dict(backend='nccl', port=29501)
log_level = 'INFO'
work_dir = './work_dirs/faster_rcnn_r50_fpn_1x'
load_from = None
load_from = './pretrain_mmdet/faster_r50_fpn_detectron2-caffe_freezeBN_l1-loss_roialign-v2_3x-4767dd8e.pth' # noqa
resume_from = None
workflow = [('train', 1)]
# model settings
norm_cfg = dict(type='BN', requires_grad=False)
model = dict(
type='FasterRCNN',
pretrained='open-mmlab://regnetx_3.2gf',
backbone=dict(
type='RegNet',
arch='regnetx_3.2gf',
out_indices=(0, 1, 2, 3),
frozen_stages=1,
base_channels=32,
norm_cfg=dict(type='BN', requires_grad=True),
norm_eval=True,
style='pytorch'),
neck=dict(
type='FPN',
in_channels=[96, 192, 432, 1008],
out_channels=256,
num_outs=5),
rpn_head=dict(
type='RPNHead',
in_channels=256,
feat_channels=256,
anchor_generator=dict(
type='AnchorGenerator',
scales=[8],
ratios=[0.5, 1.0, 2.0],
strides=[4, 8, 16, 32, 64]),
bbox_coder=dict(
type='DeltaXYWHBBoxCoder',
target_means=[.0, .0, .0, .0],
target_stds=[1.0, 1.0, 1.0, 1.0]),
loss_cls=dict(
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
roi_head=dict(
type='StandardRoIHead',
bbox_roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(type='RoIAlign', out_size=7, sample_num=0),
out_channels=256,
featmap_strides=[4, 8, 16, 32]),
bbox_head=dict(
type='Shared2FCBBoxHead',
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=10,
bbox_coder=dict(
type='DeltaXYWHBBoxCoder',
target_means=[0., 0., 0., 0.],
target_stds=[0.1, 0.1, 0.2, 0.2]),
reg_class_agnostic=False,
loss_cls=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_bbox=dict(type='L1Loss', loss_weight=1.0))))
# model training and testing settings
train_cfg = dict(
rpn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.3,
min_pos_iou=0.3,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=256,
pos_fraction=0.5,
neg_pos_ub=-1,
add_gt_as_proposals=False),
allowed_border=-1,
pos_weight=-1,
debug=False),
rpn_proposal=dict(
nms_across_levels=False,
nms_pre=2000,
# following the setting of detectron,
# which improves ~0.2 bbox mAP.
nms_post=1000,
max_num=1000,
nms_thr=0.7,
min_bbox_size=0),
rcnn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.5,
neg_iou_thr=0.5,
min_pos_iou=0.5,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
pos_weight=-1,
debug=False))
test_cfg = dict(
rpn=dict(
nms_across_levels=False,
nms_pre=1000,
nms_post=1000,
max_num=1000,
nms_thr=0.7,
min_bbox_size=0),
rcnn=dict(
score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100)
# soft-nms is also supported for rcnn testing
# e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05)
)
# dataset settings
dataset_type = 'CocoDataset'
data_root = 'data/nuscenes/'
classes = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
'motorcycle', 'pedestrian', 'traffic_cone', 'barrier')
img_norm_cfg = dict(
# The mean and std are those used in PyCls when training RegNets
mean=[103.53, 116.28, 123.675],
std=[57.375, 57.12, 58.395],
to_rgb=False)
file_client_args = dict(
backend='petrel',
path_mapping=dict({
'./data/nuscenes/': 's3://nuscenes/nuscenes/',
'data/nuscenes/': 's3://nuscenes/nuscenes/'
}))
train_pipeline = [
dict(type='LoadImageFromFile', file_client_args=file_client_args),
dict(
type='LoadAnnotations',
with_bbox=True,
with_mask=False,
file_client_args=file_client_args),
dict(
type='Resize',
img_scale=(1280, 720),
ratio_range=(0.75, 1.25),
keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
]
test_pipeline = [
dict(type='LoadImageFromFile', file_client_args=file_client_args),
dict(
type='MultiScaleFlipAug',
img_scale=(1280, 720),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=2,
train=dict(
type=dataset_type,
classes=classes,
ann_file=data_root + 'nuscenes_infos_train.coco.json',
pipeline=train_pipeline),
val=dict(
type=dataset_type,
classes=classes,
ann_file=data_root + 'nuscenes_infos_val.coco.json',
pipeline=test_pipeline),
test=dict(
type=dataset_type,
classes=classes,
ann_file=data_root + 'nuscenes_infos_val.coco.json',
pipeline=test_pipeline))
# optimizer
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00005)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=1.0 / 1000,
step=[8, 11])
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
# yapf:enable
evaluation = dict(interval=1)
# runtime settings
total_epochs = 12
dist_params = dict(backend='nccl', port=29501)
log_level = 'INFO'
work_dir = './work_dirs/faster_rcnn_r50_fpn_1x'
load_from = './pretrain_mmdet/mask_rcnn_regnetx-3GF_fpn_mstrain_3x_coco_box-AP-43.1_mask-AP-38.7-e003695a.pth' # noqa
resume_from = None
workflow = [('train', 1)]
......@@ -15,8 +15,8 @@ model = dict(
),
pts_voxel_encoder=dict(
type='HardVFE',
num_input_features=4,
num_filters=[64, 64],
in_channels=4,
feat_channels=[64, 64],
with_distance=False,
voxel_size=voxel_size,
with_cluster_center=True,
......@@ -85,9 +85,7 @@ model = dict(
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2),
),
)
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)))
# model training and testing settings
train_cfg = dict(
pts=dict(
......@@ -138,10 +136,23 @@ db_sampler = dict(
trailer=4,
truck=4,
))
file_client_args = dict(backend='disk')
# file_client_args = dict(
# backend='petrel',
# path_mapping=dict({
# './data/nuscenes/': 's3://nuscenes/nuscenes/',
# 'data/nuscenes/': 's3://nuscenes/nuscenes/'
# }))
train_pipeline = [
dict(type='LoadPointsFromFile', load_dim=5, use_dim=5),
dict(type='LoadPointsFromMultiSweeps', sweeps_num=10),
dict(
type='LoadPointsFromFile',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
file_client_args=file_client_args),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(
type='GlobalRotScale',
......@@ -156,8 +167,15 @@ train_pipeline = [
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']),
]
test_pipeline = [
dict(type='LoadPointsFromFile', load_dim=5, use_dim=5),
dict(type='LoadPointsFromMultiSweeps', sweeps_num=10),
dict(
type='LoadPointsFromFile',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
file_client_args=file_client_args),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='RandomFlip3D', flip_ratio=0),
dict(
......
import torch
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import DistSamplerSeedHook, Runner
from mmcv.runner import DistSamplerSeedHook, Runner, build_optimizer
from mmdet3d.utils import get_root_logger
from mmdet.apis.train import parse_losses
from mmdet.core import (DistEvalHook, DistOptimizerHook, EvalHook,
Fp16OptimizerHook, build_optimizer)
Fp16OptimizerHook)
from mmdet.datasets import build_dataloader, build_dataset
......
......@@ -7,7 +7,8 @@ from .iou_calculators import (BboxOverlaps3D, BboxOverlapsNearest3D,
from .samplers import (BaseSampler, CombinedSampler,
InstanceBalancedPosSampler, IoUBalancedNegSampler,
PseudoSampler, RandomSampler, SamplingResult)
from .structures import Box3DMode, CameraInstance3DBoxes, LiDARInstance3DBoxes
from .structures import (BaseInstance3DBoxes, Box3DMode, CameraInstance3DBoxes,
DepthInstance3DBoxes, LiDARInstance3DBoxes)
from .transforms import (bbox3d2result, bbox3d2roi,
box3d_to_corner3d_upright_depth,
boxes3d_to_bev_torch_lidar)
......@@ -25,5 +26,6 @@ __all__ = [
'BboxOverlapsNearest3D', 'BboxOverlaps3D', 'bbox_overlaps_nearest_3d',
'bbox_overlaps_3d', 'Box3DMode', 'LiDARInstance3DBoxes',
'CameraInstance3DBoxes', 'bbox3d2roi', 'bbox3d2result',
'box3d_to_corner3d_upright_depth'
'box3d_to_corner3d_upright_depth', 'DepthInstance3DBoxes',
'BaseInstance3DBoxes'
]
......@@ -466,8 +466,8 @@ def get_frustum(bbox_image, C, near_clip=0.001, far_clip=100):
def surface_equ_3d(polygon_surfaces):
# return [a, b, c], d in ax+by+cz+d=0
# polygon_surfaces: [num_polygon, num_surfaces, num_points_of_polygon, 3]
surface_vec = polygon_surfaces[:, :, :2, :] - polygon_surfaces[:, :,
1:3, :]
surface_vec = polygon_surfaces[:, :, :2, :] - \
polygon_surfaces[:, :, 1:3, :]
# normal_vec: [..., 3]
normal_vec = np.cross(surface_vec[:, :, 0, :], surface_vec[:, :, 1, :])
# print(normal_vec.shape, points[..., 0, :].shape)
......
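# Sketch (assumption, for illustration only; the elided body is not shown):
# the plane offset d follows from any point lying on each surface, e.g.
#   d = -np.einsum('aij, aij->ai', normal_vec, polygon_surfaces[:, :, 0, :])
# so that normal_vec . p + d = 0 holds for points p on each plane.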
from .base_box3d import BaseInstance3DBoxes
from .box_3d_mode import Box3DMode
from .cam_box3d import CameraInstance3DBoxes
from .depth_box3d import DepthInstance3DBoxes
from .lidar_box3d import LiDARInstance3DBoxes
__all__ = ['Box3DMode', 'LiDARInstance3DBoxes', 'CameraInstance3DBoxes']
__all__ = [
'Box3DMode', 'BaseInstance3DBoxes', 'LiDARInstance3DBoxes',
'CameraInstance3DBoxes', 'DepthInstance3DBoxes'
]
......@@ -10,13 +10,24 @@ from .utils import limit_period, xywhr2xyxyr
class BaseInstance3DBoxes(object):
"""Base class for 3D Boxes
Note:
The box is bottom centered, i.e. the relative position of origin in
the box is [0.5, 0.5, 0].
Args:
tensor (torch.Tensor | np.ndarray): a Nxbox_dim matrix.
tensor (torch.Tensor | np.ndarray | list): a Nxbox_dim matrix.
box_dim (int): number of the dimension of a box
Each row is (x, y, z, x_size, y_size, z_size, yaw).
Default to 7.
with_yaw (bool): Whether the box is with yaw rotation.
If False, the value of yaw will be set to 0 as minmax boxes.
Default to True.
origin (tuple): The relative position of origin in the box.
Default to [0.5, 0.5, 0]. Boxes given in another origin convention
are converted to the [0.5, 0.5, 0] mode on construction.
"""
def __init__(self, tensor, box_dim=7):
def __init__(self, tensor, box_dim=7, with_yaw=True, origin=[0.5, 0.5, 0]):
if isinstance(tensor, torch.Tensor):
device = tensor.device
else:
......@@ -28,9 +39,22 @@ class BaseInstance3DBoxes(object):
tensor = tensor.reshape((0, box_dim)).to(
dtype=torch.float32, device=device)
assert tensor.dim() == 2 and tensor.size(-1) == box_dim, tensor.size()
self.box_dim = box_dim
if not with_yaw and tensor.shape[-1] == 6:
assert box_dim == 6
fake_rot = tensor.new_zeros(tensor.shape[0], 1)
tensor = torch.cat((tensor, fake_rot), dim=-1)
self.box_dim = box_dim + 1
else:
self.box_dim = box_dim
self.with_yaw = with_yaw
self.tensor = tensor
if origin != [0.5, 0.5, 0]:
dst = self.tensor.new_tensor([0.5, 0.5, 0])
src = self.tensor.new_tensor(origin)
self.tensor[:, :3] += self.tensor[:, 3:6] * (dst - src)
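# Worked example (illustrative): with origin=[0.5, 0.5, 0.5] (gravity-center
# input) and a box of z_size 2 centered at z=1, dst - src = [0, 0, -0.5],
# so z shifts by 2 * -0.5 = -1: the stored bottom-center z becomes 0.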
@property
def volume(self):
"""Computes the volume of all the boxes.
......@@ -51,12 +75,21 @@ class BaseInstance3DBoxes(object):
"""
return self.tensor[:, 3:6]
@property
def yaw(self):
"""Obtain the rotation of all the boxes.
Returns:
torch.Tensor: a vector with yaw of each box.
"""
return self.tensor[:, 6]
@property
def height(self):
"""Obtain the height of all the boxes.
Returns:
torch.Tensor: a vector with volume of each box.
torch.Tensor: a vector with height of each box.
"""
return self.tensor[:, 5]
......@@ -135,8 +168,8 @@ class BaseInstance3DBoxes(object):
pass
@abstractmethod
def flip(self):
"""Flip the boxes in horizontal direction
def flip(self, bev_direction='horizontal'):
"""Flip the boxes in BEV along given BEV direction
"""
pass
......@@ -184,8 +217,26 @@ class BaseInstance3DBoxes(object):
(x_min, y_min, x_max, y_max)
Returns:
a binary vector, indicating whether each box is inside
the reference range.
torch.Tensor: Indicating whether each box is inside
the reference range.
"""
pass
@abstractmethod
def convert_to(self, dst, rt_mat=None):
"""Convert self to `dst` mode.
Args:
dst (BoxMode): the target Box mode
rt_mat (np.ndarray | torch.Tensor): The rotation and translation
matrix between different coordinates. Defaults to None.
The conversion from `src` coordinates to `dst` coordinates
usually comes along the change of sensors, e.g., from camera
to LiDAR. This requires a transformation matrix.
Returns:
BaseInstance3DBoxes:
The converted box of the same type in the `dst` mode.
"""
pass
......@@ -193,8 +244,7 @@ class BaseInstance3DBoxes(object):
"""Scale the box with horizontal and vertical scaling factors
Args:
scale_factors (float):
scale factors to scale the boxes.
scale_factors (float): scale factors to scale the boxes.
"""
self.tensor[:, :6] *= scale_factor
self.tensor[:, 7:] *= scale_factor
......@@ -218,9 +268,8 @@ class BaseInstance3DBoxes(object):
threshold (float): the threshold of minimal sizes
Returns:
Tensor:
a binary vector which represents whether each box is empty
(False) or non-empty (True).
torch.Tensor: a binary vector which represents whether each
box is empty (False) or non-empty (True).
"""
box = self.tensor
size_x = box[..., 3]
......@@ -245,15 +294,19 @@ class BaseInstance3DBoxes(object):
subject to Pytorch's indexing semantics.
Returns:
Boxes: Create a new :class:`Boxes` by indexing.
BaseInstance3DBoxes: Create a new :class:`BaseInstance3DBoxes`
by indexing.
"""
original_type = type(self)
if isinstance(item, int):
return original_type(self.tensor[item].view(1, -1))
return original_type(
self.tensor[item].view(1, -1),
box_dim=self.box_dim,
with_yaw=self.with_yaw)
b = self.tensor[item]
assert b.dim() == 2, \
f'Indexing on Boxes with {item} failed to return a matrix!'
return original_type(b)
return original_type(b, box_dim=self.box_dim, with_yaw=self.with_yaw)
def __len__(self):
return self.tensor.shape[0]
......@@ -283,24 +336,30 @@ class BaseInstance3DBoxes(object):
def to(self, device):
original_type = type(self)
return original_type(self.tensor.to(device))
return original_type(
self.tensor.to(device),
box_dim=self.box_dim,
with_yaw=self.with_yaw)
def clone(self):
"""Clone the Boxes.
Returns:
Boxes
BaseInstance3DBoxes: Box object with the same properties as self.
"""
original_type = type(self)
return original_type(self.tensor.clone())
return original_type(
self.tensor.clone(), box_dim=self.box_dim, with_yaw=self.with_yaw)
@property
def device(self):
return self.tensor.device
def __iter__(self):
"""
Yield a box as a Tensor of shape (4,) at a time.
"""Yield a box as a Tensor of shape (4,) at a time.
Returns:
torch.Tensor: a box of shape (4,).
"""
yield from self.tensor
......@@ -387,3 +446,23 @@ class BaseInstance3DBoxes(object):
iou3d = overlaps_3d / torch.clamp(volume1, min=1e-8)
return iou3d
def new_box(self, data):
"""Create a new box object with data.
The new box and its tensor have the same properties
as self and self.tensor, respectively.
Args:
data (torch.Tensor | numpy.array | list): Data which the
returned Tensor copies.
Returns:
BaseInstance3DBoxes: A new bbox containing the given data,
with its other properties matching self.
"""
new_tensor = self.tensor.new_tensor(data) \
if not isinstance(data, torch.Tensor) else data.to(self.device)
original_type = type(self)
return original_type(
new_tensor, box_dim=self.box_dim, with_yaw=self.with_yaw)
......@@ -5,6 +5,7 @@ import torch
from .base_box3d import BaseInstance3DBoxes
from .cam_box3d import CameraInstance3DBoxes
from .depth_box3d import DepthInstance3DBoxes
from .lidar_box3d import LiDARInstance3DBoxes
......@@ -61,7 +62,8 @@ class Box3DMode(IntEnum):
"""Convert boxes from `src` mode to `dst` mode.
Args:
box (tuple | list | np.ndarray | torch.Tensor):
box (tuple | list | np.ndarray |
torch.Tensor | BaseInstance3DBoxes):
can be a k-tuple, k-list or an Nxk array/tensor, where k = 7
src (BoxMode): the src Box mode
dst (BoxMode): the target Box mode
......@@ -72,7 +74,7 @@ class Box3DMode(IntEnum):
to LiDAR. This requires a transformation matrix.
Returns:
(tuple | list | np.ndarray | torch.Tensor):
(tuple | list | np.ndarray | torch.Tensor | BaseInstance3DBoxes):
The converted box of the same type.
"""
if src == dst:
......@@ -113,6 +115,14 @@ class Box3DMode(IntEnum):
if rt_mat is None:
rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]])
xyz_size = torch.cat([x_size, z_size, y_size], dim=-1)
elif src == Box3DMode.LIDAR and dst == Box3DMode.DEPTH:
if rt_mat is None:
rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]])
xyz_size = torch.cat([y_size, x_size, z_size], dim=-1)
elif src == Box3DMode.DEPTH and dst == Box3DMode.LIDAR:
if rt_mat is None:
rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]])
xyz_size = torch.cat([y_size, x_size, z_size], dim=-1)
else:
raise NotImplementedError(
f'Conversion from Box3DMode {src} to {dst} '
......@@ -141,10 +151,13 @@ class Box3DMode(IntEnum):
target_type = CameraInstance3DBoxes
elif dst == Box3DMode.LIDAR:
target_type = LiDARInstance3DBoxes
elif dst == Box3DMode.DEPTH:
target_type = DepthInstance3DBoxes
else:
raise NotImplementedError(
f'Conversion to {dst} through {original_type}'
' is not supported yet')
return target_type(arr, box_dim=arr.size(-1))
return target_type(
arr, box_dim=arr.size(-1), with_yaw=box.with_yaw)
else:
return arr
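# Usage sketch (assumption): exercising the conversions added above, e.g.
#   lidar_boxes = LiDARInstance3DBoxes(tensor)  # rows: x, y, z, x_size, y_size, z_size, yaw
#   depth_boxes = Box3DMode.convert(lidar_boxes, Box3DMode.LIDAR,
#                                   Box3DMode.DEPTH)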