OpenDAS / mmdetection3d — Commits

Commit ce79da2e, authored Jun 17, 2020 by zhangwenwei

    Merge branch 'add-tta' into 'master'

    Support test time augmentation

    See merge request open-mmlab/mmdet.3d!70

Parents: f6e95edd, 3c5ff9fa
Changes: 72 — showing 20 changed files with 250 additions and 1170 deletions (+250 −1170)
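In short, this merge wires test-time augmentation (TTA) through the codebase: the config test pipelines are wrapped in a `MultiScaleFlipAug3D` block that can scale and flip the point cloud at test time, a new `mmdet3d/core/post_processing/merge_augs.py` merges the detections from the augmented views, `bbox3d_mapping_back` maps boxes from an augmented view back to the original frame, and `img_meta` is renamed to `img_metas`. The same sweep also deduplicates several PointPillars/SECOND configs into `_base_` files, which accounts for most of the deletions.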
Changed files:

configs/parta2/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-car.py               +24  −10
configs/pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d.py            +1   −121
configs/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class.py      +46  −175
configs/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py         +32  −130
configs/pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d.py         +8   −90
configs/regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d.py     +7   −103
configs/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d.py  +5   −94
configs/second/hv_second_secfpn_6x8_80e_kitti-3d-3class.py                   +4   −236
configs/second/hv_second_secfpn_6x8_80e_kitti-3d-car.py                      +9   −190
configs/votenet/votenet_16x8_sunrgbd-3d-10class.py                           +0   −10
configs/votenet/votenet_8x8_scannet-3d-18class.py                            +1   −1
docs/tutorials/data_pipeline.md                                              +1   −1
mmdet3d/apis/train.py                                                        +2   −2
mmdet3d/core/bbox/__init__.py                                                +4   −3
mmdet3d/core/bbox/structures/__init__.py                                     +2   −1
mmdet3d/core/bbox/structures/base_box3d.py                                   +4   −1
mmdet3d/core/bbox/transforms.py                                              +10  −0
mmdet3d/core/post_processing/__init__.py                                     +2   −1
mmdet3d/core/post_processing/box3d_nms.py                                    +1   −1
mmdet3d/core/post_processing/merge_augs.py (new file)                        +87  −0
configs/parta2/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-car.py (+24 −10)
-------------------------------------------------------------------------

@@ -180,14 +180,14 @@ train_pipeline = [
     dict(
         type='ObjectNoise',
         num_try=100,
-        loc_noise_std=[1.0, 1.0, 0.5],
+        translation_std=[1.0, 1.0, 0.5],
         global_rot_range=[0.0, 0.0],
-        rot_uniform_noise=[-0.78539816, 0.78539816]),
+        rot_range=[-0.78539816, 0.78539816]),
     dict(type='RandomFlip3D', flip_ratio=0.5),
     dict(
-        type='GlobalRotScale',
-        rot_uniform_noise=[-0.78539816, 0.78539816],
-        scaling_uniform_noise=[0.95, 1.05]),
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
     dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
     dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
@@ -197,12 +197,26 @@ train_pipeline = [
 ]
 test_pipeline = [
     dict(type='LoadPointsFromFile', load_dim=4, use_dim=4),
-    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
-    dict(
-        type='DefaultFormatBundle3D',
-        class_names=class_names,
-        with_label=False),
-    dict(type='Collect3D', keys=['points'])
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter',
+                point_cloud_range=point_cloud_range),
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points'])
+        ])
 ]
 data = dict(
...
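The substantive change in every config below is this `MultiScaleFlipAug3D` wrapper: with `pts_scale_ratio=1` and `flip=False` it emits a single un-augmented view, but the same block can enumerate scaled/flipped copies of each test sample. A minimal standalone sketch of what such a wrapper does — `tta_views` is a hypothetical helper for illustration, not the mmdet3d implementation:

    import numpy as np


    def tta_views(points, pts_scale_ratios=(1.0, ), flip=False):
        """Expand one point cloud into the augmented views a
        MultiScaleFlipAug3D-style wrapper would feed the detector.

        Each (scale, flip) combination becomes one view, and the applied
        parameters are recorded in the metas so the merge step can undo
        them later (cf. bbox3d_mapping_back / merge_aug_bboxes_3d below).
        """
        flip_states = [False, True] if flip else [False]
        views = []
        for scale in pts_scale_ratios:
            for flipped in flip_states:
                pts = points.copy()
                pts[:, :3] *= scale        # GlobalRotScaleTrans with rot_range=[0, 0]
                if flipped:
                    pts[:, 1] = -pts[:, 1]  # horizontal flip in bird's-eye view
                views.append(
                    dict(points=pts,
                         meta=dict(pcd_scale_factor=scale, pcd_flip=flipped)))
        return views


    points = np.random.rand(1000, 4).astype(np.float32)  # x, y, z, intensity
    # With the config above (pts_scale_ratio=1, flip=False) this is one view,
    # i.e. TTA degenerates to a plain single-pass test.
    assert len(tta_views(points)) == 1
    assert len(tta_views(points, (0.95, 1.0, 1.05), flip=True)) == 6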
configs/pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d.py (+1 −121)
----------------------------------------------------------------------------

 _base_ = [
     '../_base_/models/pointpillars_second_fpn.py',
     '../_base_/datasets/nus-3d.py',
     '../_base_/schedules/schedule_2x.py',
     '../_base_/default_runtime.py'
 ]
-# model settings
-voxel_size = [0.25, 0.25, 8]
-point_cloud_range = [-50, -50, -5, 50, 50, 3]
-class_names = [
-    'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
-    'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
-]
-model = dict(
-    type='MVXFasterRCNNV2',
-    pts_voxel_layer=dict(
-        max_num_points=64,  # max_points_per_voxel
-        point_cloud_range=point_cloud_range,  # velodyne coordinates, x, y, z
-        voxel_size=voxel_size,
-        max_voxels=(30000, 40000)),  # (training, testing) max_voxels
-    pts_voxel_encoder=dict(
-        type='HardVFE',
-        in_channels=4,
-        feat_channels=[64, 64],
-        with_distance=False,
-        voxel_size=voxel_size,
-        with_cluster_center=True,
-        with_voxel_center=True,
-        point_cloud_range=point_cloud_range,
-        norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)),
-    pts_middle_encoder=dict(
-        type='PointPillarsScatter',
-        in_channels=64,
-        output_shape=[400, 400]),  # checked from PointCloud3D
-    pts_backbone=dict(
-        type='SECOND',
-        in_channels=64,
-        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
-        layer_nums=[3, 5, 5],
-        layer_strides=[2, 2, 2],
-        out_channels=[64, 128, 256]),
-    pts_neck=dict(
-        type='FPN',
-        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
-        act_cfg=dict(type='ReLU'),
-        in_channels=[64, 128, 256],
-        out_channels=256,
-        start_level=0,
-        num_outs=3),
-    pts_bbox_head=dict(
-        type='Anchor3DHead',
-        num_classes=10,
-        in_channels=256,
-        feat_channels=256,
-        use_direction_classifier=True,
-        anchor_generator=dict(
-            type='AlignedAnchor3DRangeGenerator',
-            ranges=[[-50, -50, -1.8, 50, 50, -1.8]],
-            scales=[1, 2, 4],
-            sizes=[
-                [0.8660, 2.5981, 1.],  # 1.5 / sqrt(3)
-                [0.5774, 1.7321, 1.],  # 1 / sqrt(3)
-                [1., 1., 1.],
-                [0.4, 0.4, 1],
-            ],
-            custom_values=[0, 0],
-            rotations=[0, 1.57],
-            reshape_out=True),
-        assigner_per_size=False,
-        diff_rad_by_sin=True,
-        dir_offset=0.7854,  # pi/4
-        dir_limit_offset=0,
-        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9),
-        loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2.0,
-                      alpha=0.25, loss_weight=1.0),
-        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
-        loss_dir=dict(type='CrossEntropyLoss', use_sigmoid=False,
-                      loss_weight=0.2)))
-# model training and testing settings
-train_cfg = dict(
-    pts=dict(
-        assigner=dict(  # for Car
-            type='MaxIoUAssigner',
-            iou_calculator=dict(type='BboxOverlapsNearest3D'),
-            pos_iou_thr=0.6, neg_iou_thr=0.3, min_pos_iou=0.3,
-            ignore_iof_thr=-1),
-        allowed_border=0,
-        code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
-        pos_weight=-1,
-        debug=False))
-test_cfg = dict(
-    pts=dict(
-        use_rotate_nms=True,
-        nms_across_levels=False,
-        nms_pre=1000,
-        nms_thr=0.2,
-        score_thr=0.05,
-        min_bbox_size=0,
-        max_num=500
-        # soft-nms is also supported for rcnn testing
-        # e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05)
-    ))
 # dataset settings
 input_modality = dict(
     use_lidar=True,
     use_camera=False,
     use_radar=False,
     use_map=False,
     use_external=False)
 data = dict(
     train=dict(modality=input_modality),
     val=dict(modality=input_modality),
     test=dict(modality=input_modality))
 evaluation = dict(interval=24)
configs/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class.py (+46 −175)
-----------------------------------------------------------------------------------

-# model settings
-point_cloud_range = [0, -39.68, -3, 69.12, 39.68, 1]
-voxel_size = [0.16, 0.16, 4]
-model = dict(
-    type='VoxelNet',
-    voxel_layer=dict(
-        max_num_points=32,
-        point_cloud_range=point_cloud_range,
-        voxel_size=voxel_size,
-        max_voxels=(16000, 40000)),
-    voxel_encoder=dict(
-        type='PillarFeatureNet',
-        in_channels=4,
-        feat_channels=[64],
-        with_distance=False,
-        voxel_size=voxel_size,
-        point_cloud_range=point_cloud_range),
-    middle_encoder=dict(
-        type='PointPillarsScatter', in_channels=64, output_shape=[496, 432]),
-    backbone=dict(
-        type='SECOND',
-        in_channels=64,
-        layer_nums=[3, 5, 5],
-        layer_strides=[2, 2, 2],
-        out_channels=[64, 128, 256]),
-    neck=dict(
-        type='SECONDFPN',
-        in_channels=[64, 128, 256],
-        upsample_strides=[1, 2, 4],
-        out_channels=[128, 128, 128]),
-    bbox_head=dict(
-        type='Anchor3DHead',
-        num_classes=3,
-        in_channels=384,
-        feat_channels=384,
-        use_direction_classifier=True,
-        anchor_generator=dict(
-            type='Anchor3DRangeGenerator',
-            ranges=[
-                [0, -39.68, -0.6, 70.4, 39.68, -0.6],
-                [0, -39.68, -0.6, 70.4, 39.68, -0.6],
-                [0, -39.68, -1.78, 70.4, 39.68, -1.78],
-            ],
-            sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]],
-            rotations=[0, 1.57],
-            reshape_out=False),
-        diff_rad_by_sin=True,
-        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
-        loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2.0,
-                      alpha=0.25, loss_weight=1.0),
-        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
-        loss_dir=dict(type='CrossEntropyLoss', use_sigmoid=False,
-                      loss_weight=0.2)))
-# model training and testing settings
-train_cfg = dict(
-    assigner=[
-        dict(  # for Pedestrian
-            type='MaxIoUAssigner',
-            iou_calculator=dict(type='BboxOverlapsNearest3D'),
-            pos_iou_thr=0.5, neg_iou_thr=0.35, min_pos_iou=0.35,
-            ignore_iof_thr=-1),
-        dict(  # for Cyclist
-            type='MaxIoUAssigner',
-            iou_calculator=dict(type='BboxOverlapsNearest3D'),
-            pos_iou_thr=0.5, neg_iou_thr=0.35, min_pos_iou=0.35,
-            ignore_iof_thr=-1),
-        dict(  # for Car
-            type='MaxIoUAssigner',
-            iou_calculator=dict(type='BboxOverlapsNearest3D'),
-            pos_iou_thr=0.6, neg_iou_thr=0.45, min_pos_iou=0.45,
-            ignore_iof_thr=-1),
-    ],
-    allowed_border=0,
-    pos_weight=-1,
-    debug=False)
-test_cfg = dict(
-    use_rotate_nms=True,
-    nms_across_levels=False,
-    nms_thr=0.01,
-    score_thr=0.1,
-    min_bbox_size=0,
-    nms_pre=100,
-    max_num=50)
+_base_ = [
+    '../_base_/models/hv_pointpillars_secfpn.py',
+    '../_base_/datasets/kitti-3d-3class.py',
+    '../_base_/schedules/cyclic_40e.py',
+    '../_base_/default_runtime.py'
+]
+point_cloud_range = [0, -39.68, -3, 69.12, 39.68, 1]
 # dataset settings
 dataset_type = 'KittiDataset'
 data_root = 'data/kitti/'
 class_names = ['Pedestrian', 'Cyclist', 'Car']
 input_modality = dict(use_lidar=True, use_camera=False)
+# PointPillars adopts different sampling strategies among classes
 db_sampler = dict(
     data_root=data_root,
     info_path=data_root + 'kitti_dbinfos_train.pkl',
     ...
@@ -108,6 +20,7 @@ db_sampler = dict(
     classes=class_names,
     sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10))
+# PointPillars uses different augmentation hyper parameters
 train_pipeline = [
     dict(type='LoadPointsFromFile', load_dim=4, use_dim=4),
     dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
     ...
@@ -115,14 +28,14 @@ train_pipeline = [
     dict(
         type='ObjectNoise',
         num_try=100,
-        loc_noise_std=[0.25, 0.25, 0.25],
+        translation_std=[0.25, 0.25, 0.25],
         global_rot_range=[0.0, 0.0],
-        rot_uniform_noise=[-0.15707963267, 0.15707963267]),
+        rot_range=[-0.15707963267, 0.15707963267]),
     dict(type='RandomFlip3D', flip_ratio=0.5),
     dict(
-        type='GlobalRotScale',
-        rot_uniform_noise=[-0.78539816, 0.78539816],
-        scaling_uniform_noise=[0.95, 1.05]),
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
     dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
     dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
     dict(type='PointShuffle'),
@@ -131,86 +44,44 @@ train_pipeline = [
 ]
 test_pipeline = [
     dict(type='LoadPointsFromFile', load_dim=4, use_dim=4),
-    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
-    dict(
-        type='DefaultFormatBundle3D',
-        class_names=class_names,
-        with_label=False),
-    dict(type='Collect3D', keys=['points'])
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points'])
+        ])
 ]
 data = dict(
-    samples_per_gpu=6,
-    workers_per_gpu=4,
-    train=dict(
-        type='RepeatDataset',
-        times=2,
-        dataset=dict(
-            type=dataset_type, data_root=data_root,
-            ann_file=data_root + 'kitti_infos_train.pkl', split='training',
-            pts_prefix='velodyne_reduced', pipeline=train_pipeline,
-            modality=input_modality, classes=class_names, test_mode=False)),
-    val=dict(
-        type=dataset_type, data_root=data_root,
-        ann_file=data_root + 'kitti_infos_val.pkl', split='training',
-        pts_prefix='velodyne_reduced', pipeline=test_pipeline,
-        modality=input_modality, classes=class_names, test_mode=True),
-    test=dict(
-        type=dataset_type, data_root=data_root,
-        ann_file=data_root + 'kitti_infos_val.pkl', split='training',
-        pts_prefix='velodyne_reduced', pipeline=test_pipeline,
-        modality=input_modality, classes=class_names, test_mode=True))
+    train=dict(dataset=dict(pipeline=train_pipeline, classes=class_names)),
+    val=dict(pipeline=test_pipeline, classes=class_names),
+    test=dict(pipeline=test_pipeline, classes=class_names))
+# In practice PointPillars also uses a different schedule
 # optimizer
-lr = 0.001  # max learning rate
-optimizer = dict(
-    type='AdamW',
-    lr=lr,
-    betas=(0.95, 0.99),  # the momentum is changed during training
-    weight_decay=0.01)
+lr = 0.001
+optimizer = dict(lr=lr)
+# max_norm=35 is slightly better than 10 for PointPillars in the earlier
+# development of the codebase thus we keep the setting. But we do not
+# specifically tune this parameter.
 optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
-# learning policy
-lr_config = dict(policy='cyclic', target_ratio=(10, 1e-4), cyclic_times=1,
-                 step_ratio_up=0.4)
-momentum_config = dict(policy='cyclic', target_ratio=(0.85 / 0.95, 1),
-                       cyclic_times=1, step_ratio_up=0.4)
-checkpoint_config = dict(interval=1)
+# Use evaluation interval=2 to reduce the number of evaluation times
+evaluation = dict(interval=2)
-# yapf:disable
-log_config = dict(
-    interval=50,
-    hooks=[dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook')])
-# yapf:enable
-# runtime settings
+# PointPillars usually needs a longer schedule than SECOND, so we simply double
+# the training schedule. Do remind that since we use RepeatDataset and the
+# repeat factor is 2, we actually train 160 epochs.
 total_epochs = 80
-dist_params = dict(backend='nccl')
-log_level = 'INFO'
-work_dir = './work_dirs/pp_secfpn_80e'
-load_from = None
-resume_from = None
-workflow = [('train', 1)]
configs/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py (+32 −130)
--------------------------------------------------------------------------------

-# model settings
-voxel_size = [0.16, 0.16, 4]
+_base_ = './hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class.py'
 point_cloud_range = [0, -39.68, -3, 69.12, 39.68, 1]
 model = dict(
-    type='VoxelNet',
-    voxel_layer=dict(
-        max_num_points=64,
-        point_cloud_range=point_cloud_range,
-        voxel_size=voxel_size,
-        max_voxels=(12000, 20000)),
-    voxel_encoder=dict(
-        type='PillarFeatureNet',
-        in_channels=4,
-        feat_channels=[64],
-        with_distance=False,
-        voxel_size=voxel_size,
-        point_cloud_range=point_cloud_range),
-    middle_encoder=dict(
-        type='PointPillarsScatter', in_channels=64, output_shape=[496, 432]),
-    backbone=dict(
-        type='SECOND',
-        in_channels=64,
-        layer_nums=[3, 5, 5],
-        layer_strides=[2, 2, 2],
-        out_channels=[64, 128, 256]),
-    neck=dict(
-        type='SECONDFPN',
-        in_channels=[64, 128, 256],
-        upsample_strides=[1, 2, 4],
-        out_channels=[128, 128, 128]),
     bbox_head=dict(
-        type='Anchor3DHead',
         num_classes=1,
-        in_channels=384,
-        feat_channels=384,
-        use_direction_classifier=True,
         anchor_generator=dict(
+            _delete_=True,
             type='Anchor3DRangeGenerator',
             ranges=[[0, -39.68, -1.78, 69.12, 39.68, -1.78]],
             sizes=[[1.6, 3.9, 1.56]],
             rotations=[0, 1.57],
-            reshape_out=True),
-        diff_rad_by_sin=True,
-        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
-        loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2.0,
-                      alpha=0.25, loss_weight=1.0),
-        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
-        loss_dir=dict(type='CrossEntropyLoss', use_sigmoid=False,
-                      loss_weight=0.2)))
+            reshape_out=True)))
 # model training and testing settings
 train_cfg = dict(
+    _delete_=True,
     assigner=dict(
         type='MaxIoUAssigner',
         iou_calculator=dict(type='BboxOverlapsNearest3D'),
         ...
@@ -63,20 +26,11 @@ train_cfg = dict(
     allowed_border=0,
     pos_weight=-1,
     debug=False)
-test_cfg = dict(
-    use_rotate_nms=True,
-    nms_across_levels=False,
-    nms_thr=0.01,
-    score_thr=0.1,
-    min_bbox_size=0,
-    nms_pre=100,
-    max_num=50)
 # dataset settings
-dataset_type = 'KittiDataset'
-data_root = 'data/kitti/'
 class_names = ['Car']
-input_modality = dict(use_lidar=True, use_camera=False)
 db_sampler = dict(
     data_root=data_root,
     info_path=data_root + 'kitti_dbinfos_train.pkl',
     ...
@@ -93,14 +47,14 @@ train_pipeline = [
     dict(
         type='ObjectNoise',
         num_try=100,
-        loc_noise_std=[0.25, 0.25, 0.25],
+        translation_std=[0.25, 0.25, 0.25],
         global_rot_range=[0.0, 0.0],
-        rot_uniform_noise=[-0.15707963267, 0.15707963267]),
+        rot_range=[-0.15707963267, 0.15707963267]),
     dict(type='RandomFlip3D', flip_ratio=0.5),
     dict(
-        type='GlobalRotScale',
-        rot_uniform_noise=[-0.78539816, 0.78539816],
-        scaling_uniform_noise=[0.95, 1.05]),
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
     dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
     dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
     dict(type='PointShuffle'),
@@ -109,84 +63,32 @@ train_pipeline = [
 ]
 test_pipeline = [
     dict(type='LoadPointsFromFile', load_dim=4, use_dim=4),
-    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
-    dict(type='DefaultFormatBundle3D', class_names=class_names,
-         with_label=False),
-    dict(type='Collect3D', keys=['points'])
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points'])
+        ])
 ]
 data = dict(
-    samples_per_gpu=6,
-    workers_per_gpu=4,
-    train=dict(
-        type='RepeatDataset',
-        times=2,
-        dataset=dict(
-            type=dataset_type, data_root=data_root,
-            ann_file=data_root + 'kitti_infos_train.pkl', split='training',
-            pts_prefix='velodyne_reduced', pipeline=train_pipeline,
-            modality=input_modality, classes=class_names, test_mode=False)),
-    val=dict(
-        type=dataset_type, data_root=data_root,
-        ann_file=data_root + 'kitti_infos_val.pkl', split='training',
-        pts_prefix='velodyne_reduced', pipeline=test_pipeline,
-        modality=input_modality, classes=class_names, test_mode=True),
-    test=dict(
-        type=dataset_type, data_root=data_root,
-        ann_file=data_root + 'kitti_infos_val.pkl', split='training',
-        pts_prefix='velodyne_reduced', pipeline=test_pipeline,
-        modality=input_modality, classes=class_names, test_mode=True))
-# optimizer
-lr = 0.001  # max learning rate
-optimizer = dict(type='AdamW', lr=lr,
-                 betas=(0.95, 0.99),  # the momentum is changed during training
-                 weight_decay=0.01)
-optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
-# learning policy
-lr_config = dict(policy='cyclic', target_ratio=(10, 1e-4), cyclic_times=1,
-                 step_ratio_up=0.4)
-momentum_config = dict(policy='cyclic', target_ratio=(0.85 / 0.95, 1),
-                       cyclic_times=1, step_ratio_up=0.4)
-checkpoint_config = dict(interval=1)
-evaluation = dict(interval=1)
-# yapf:disable
-log_config = dict(
-    interval=50,
-    hooks=[dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook')])
-# yapf:enable
-# runtime settings
-total_epochs = 80
-dist_params = dict(backend='nccl')
-log_level = 'INFO'
-work_dir = './work_dirs/pp_secfpn_80e'
-load_from = None
-resume_from = None
-workflow = [('train', 1)]
+    train=dict(dataset=dict(pipeline=train_pipeline, classes=class_names)),
+    val=dict(pipeline=test_pipeline, classes=class_names),
+    test=dict(pipeline=test_pipeline, classes=class_names))
configs/pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d.py (+8 −90)
------------------------------------------------------------------------------

 _base_ = [
-    '../_base_/datasets/nus-3d.py', '../_base_/schedules/schedule_2x.py',
-    '../_base_/default_runtime.py'
+    '../_base_/models/pointpillars_second_fpn.py',
+    '../_base_/datasets/nus-3d.py',
+    '../_base_/schedules/schedule_2x.py',
+    '../_base_/default_runtime.py',
 ]
-# model settings
-voxel_size = [0.25, 0.25, 8]
-point_cloud_range = [-50, -50, -5, 50, 50, 3]
-class_names = [
-    'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
-    'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
-]
 model = dict(
-    type='MVXFasterRCNNV2',
-    pts_voxel_layer=dict(
-        max_num_points=64,
-        point_cloud_range=point_cloud_range,
-        voxel_size=voxel_size,
-        max_voxels=(30000, 40000)),
-    pts_voxel_encoder=dict(
-        type='HardVFE',
-        in_channels=4,
-        feat_channels=[64, 64],
-        with_distance=False,
-        voxel_size=voxel_size,
-        with_cluster_center=True,
-        with_voxel_center=True,
-        point_cloud_range=point_cloud_range,
-        norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)),
-    pts_middle_encoder=dict(
-        type='PointPillarsScatter', in_channels=64, output_shape=[400, 400]),
-    pts_backbone=dict(
-        type='SECOND',
-        in_channels=64,
-        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
-        layer_nums=[3, 5, 5],
-        layer_strides=[2, 2, 2],
-        out_channels=[64, 128, 256]),
     pts_neck=dict(
+        _delete_=True,
         type='SECONDFPN',
         norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
         in_channels=[64, 128, 256],
         upsample_strides=[1, 2, 4],
         out_channels=[128, 128, 128]),
     pts_bbox_head=dict(
-        type='Anchor3DHead',
-        num_classes=10,
         in_channels=384,
         feat_channels=384,
-        use_direction_classifier=True,
         anchor_generator=dict(
-            type='Anchor3DRangeGenerator',
+            _delete_=True,
+            type='AlignedAnchor3DRangeGenerator',
             ranges=[
                 [-49.6, -49.6, -1.80032795, 49.6, 49.6, -1.80032795],
                 [-49.6, -49.6, -1.74440365, 49.6, 49.6, -1.74440365],
                 ...
@@ -69,56 +39,4 @@ model = dict(
             ],
             custom_values=[0, 0],
             rotations=[0, 1.57],
-            reshape_out=True),
-        assigner_per_size=False,
-        diff_rad_by_sin=True,
-        dir_offset=0.7854,  # pi/4
-        dir_limit_offset=0,
-        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9),
-        loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2.0,
-                      alpha=0.25, loss_weight=1.0),
-        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
-        loss_dir=dict(type='CrossEntropyLoss', use_sigmoid=False,
-                      loss_weight=0.2)))
-# model training and testing settings
-train_cfg = dict(
-    pts=dict(
-        assigner=dict(  # for Car
-            type='MaxIoUAssigner',
-            iou_calculator=dict(type='BboxOverlapsNearest3D'),
-            pos_iou_thr=0.6, neg_iou_thr=0.3, min_pos_iou=0.3,
-            ignore_iof_thr=-1),
-        allowed_border=0,
-        code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
-        pos_weight=-1,
-        debug=False))
-test_cfg = dict(
-    pts=dict(
-        use_rotate_nms=True,
-        nms_across_levels=False,
-        nms_pre=1000,
-        nms_thr=0.2,
-        score_thr=0.05,
-        min_bbox_size=0,
-        max_num=500))
-# dataset settings
-input_modality = dict(
-    use_lidar=True,
-    use_camera=False,
-    use_radar=False,
-    use_map=False,
-    use_external=False)
-data = dict(
-    train=dict(modality=input_modality),
-    val=dict(modality=input_modality),
-    test=dict(modality=input_modality))
-evaluation = dict(interval=24)
+            reshape_out=True)))
configs/regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d.py (+7 −103)
-----------------------------------------------------------------------------------

 _base_ = [
-    '../_base_/datasets/nus-3d.py', '../_base_/schedules/schedule_2x.py',
-    '../_base_/default_runtime.py'
+    '../_base_/models/pointpillars_second_fpn.py',
+    '../_base_/datasets/nus-3d.py',
+    '../_base_/schedules/schedule_2x.py',
+    '../_base_/default_runtime.py',
 ]
-# model settings
-voxel_size = [0.25, 0.25, 8]
-point_cloud_range = [-50, -50, -5, 50, 50, 3]
-class_names = [
-    'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
-    'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
-]
 model = dict(
-    type='MVXFasterRCNNV2',
     pretrained=dict(pts='open-mmlab://regnetx_400mf'),
-    pts_voxel_layer=dict(
-        max_num_points=64,
-        point_cloud_range=point_cloud_range,
-        voxel_size=voxel_size,
-        max_voxels=(30000, 40000)),
-    pts_voxel_encoder=dict(
-        type='HardVFE',
-        in_channels=4,
-        feat_channels=[64, 64],
-        with_distance=False,
-        voxel_size=voxel_size,
-        with_cluster_center=True,
-        with_voxel_center=True,
-        point_cloud_range=point_cloud_range,
-        norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)),
-    pts_middle_encoder=dict(
-        type='PointPillarsScatter', in_channels=64, output_shape=[400, 400]),
     pts_backbone=dict(
+        _delete_=True,
         type='NoStemRegNet',
         arch=dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22, bot_mul=1.0),
         out_indices=(1, 2, 3),
         frozen_stages=-1,
         strides=(1, 2, 2, 2),
         base_channels=64,
         stem_channels=64,
         norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
         norm_eval=False,
         style='pytorch'),
-    pts_neck=dict(
-        type='FPN',
-        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
-        act_cfg=dict(type='ReLU'),
-        in_channels=[64, 160, 384],
-        out_channels=256,
-        start_level=0,
-        num_outs=3),
-    pts_bbox_head=dict(
-        type='Anchor3DHead',
-        num_classes=10,
-        in_channels=256,
-        feat_channels=256,
-        use_direction_classifier=True,
-        anchor_generator=dict(
-            type='AlignedAnchor3DRangeGenerator',
-            ranges=[[-50, -50, -1.8, 50, 50, -1.8]],
-            scales=[1, 2, 4],
-            sizes=[
-                [0.8660, 2.5981, 1.],  # 1.5 / sqrt(3)
-                [0.5774, 1.7321, 1.],  # 1 / sqrt(3)
-                [1., 1., 1.],
-                [0.4, 0.4, 1],
-            ],
-            custom_values=[0, 0],
-            rotations=[0, 1.57],
-            reshape_out=True),
-        assigner_per_size=False,
-        diff_rad_by_sin=True,
-        dir_offset=0.7854,  # pi/4
-        dir_limit_offset=0,
-        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9),
-        loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2.0,
-                      alpha=0.25, loss_weight=1.0),
-        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
-        loss_dir=dict(type='CrossEntropyLoss', use_sigmoid=False,
-                      loss_weight=0.2)))
-# model training and testing settings
-train_cfg = dict(
-    pts=dict(
-        assigner=dict(
-            type='MaxIoUAssigner',
-            iou_calculator=dict(type='BboxOverlapsNearest3D'),
-            pos_iou_thr=0.6, neg_iou_thr=0.3, min_pos_iou=0.3,
-            ignore_iof_thr=-1),
-        allowed_border=0,
-        code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
-        pos_weight=-1,
-        debug=False))
-test_cfg = dict(
-    pts=dict(
-        use_rotate_nms=True,
-        nms_across_levels=False,
-        nms_pre=1000,
-        nms_thr=0.2,
-        score_thr=0.05,
-        min_bbox_size=0,
-        max_num=500))
-# dataset settings
-input_modality = dict(
-    use_lidar=True,
-    use_depth=False,
-    use_lidar_intensity=True,
-    use_camera=False)
-data = dict(
-    train=dict(modality=input_modality),
-    val=dict(modality=input_modality),
-    test=dict(modality=input_modality))
-evaluation = dict(interval=24)
+    pts_neck=dict(in_channels=[64, 160, 384]))
configs/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d.py (+5 −94)
-------------------------------------------------------------------------------------

-_base_ = [
-    '../_base_/datasets/nus-3d.py', '../_base_/schedules/schedule_2x.py',
-    '../_base_/default_runtime.py'
-]
+_base_ = './hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d.py'
-# model settings
-voxel_size = [0.25, 0.25, 8]
-point_cloud_range = [-50, -50, -5, 50, 50, 3]
-class_names = [
-    'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
-    'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
-]
 model = dict(
-    type='MVXFasterRCNNV2',
-    pretrained=dict(pts='open-mmlab://regnetx_400mf'),
-    pts_voxel_layer=dict(
-        max_num_points=64,
-        point_cloud_range=point_cloud_range,
-        voxel_size=voxel_size,
-        max_voxels=(30000, 40000)),
-    pts_voxel_encoder=dict(
-        type='HardVFE',
-        in_channels=4,
-        feat_channels=[64, 64],
-        with_distance=False,
-        voxel_size=voxel_size,
-        with_cluster_center=True,
-        with_voxel_center=True,
-        point_cloud_range=point_cloud_range,
-        norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)),
-    pts_middle_encoder=dict(
-        type='PointPillarsScatter', in_channels=64, output_shape=[400, 400]),
-    pts_backbone=dict(
-        type='NoStemRegNet',
-        arch=dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22, bot_mul=1.0),
-        out_indices=(1, 2, 3),
-        frozen_stages=-1,
-        strides=(1, 2, 2, 2),
-        base_channels=64,
-        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
-        norm_eval=False,
-        style='pytorch'),
     pts_neck=dict(
         type='SECONDFPN',
+        _delete_=True,
         norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
         in_channels=[64, 160, 384],
         upsample_strides=[1, 2, 4],
         out_channels=[128, 128, 128]),
     pts_bbox_head=dict(
-        type='Anchor3DHead',
-        num_classes=10,
         in_channels=384,
         feat_channels=384,
-        use_direction_classifier=True,
         anchor_generator=dict(
-            type='Anchor3DRangeGenerator',
+            _delete_=True,
+            type='AlignedAnchor3DRangeGenerator',
             ranges=[
                 [-49.6, -49.6, -1.80032795, 49.6, 49.6, -1.80032795],
                 [-49.6, -49.6, -1.74440365, 49.6, 49.6, -1.74440365],
                 ...
@@ -73,56 +35,5 @@ model = dict(
             ],
             custom_values=[0, 0],
             rotations=[0, 1.57],
-            reshape_out=True),
-        assigner_per_size=False,
-        diff_rad_by_sin=True,
-        dir_offset=0.7854,  # pi/4
-        dir_limit_offset=0,
-        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9),
-        loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2.0,
-                      alpha=0.25, loss_weight=1.0),
-        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
-        loss_dir=dict(type='CrossEntropyLoss', use_sigmoid=False,
-                      loss_weight=0.2)))
+            reshape_out=True)))
-# model training and testing settings
-train_cfg = dict(
-    pts=dict(
-        assigner=dict(  # for Car
-            type='MaxIoUAssigner',
-            iou_calculator=dict(type='BboxOverlapsNearest3D'),
-            pos_iou_thr=0.6, neg_iou_thr=0.3, min_pos_iou=0.3,
-            ignore_iof_thr=-1),
-        allowed_border=0,
-        code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
-        pos_weight=-1,
-        debug=False))
-test_cfg = dict(
-    pts=dict(
-        use_rotate_nms=True,
-        nms_across_levels=False,
-        nms_pre=1000,
-        nms_thr=0.2,
-        score_thr=0.05,
-        min_bbox_size=0,
-        max_num=500))
-# dataset settings
-input_modality = dict(
-    use_lidar=True,
-    use_depth=False,
-    use_lidar_intensity=True,
-    use_camera=False,
-)
-data = dict(
-    train=dict(modality=input_modality),
-    val=dict(modality=input_modality),
-    test=dict(modality=input_modality))
-evaluation = dict(interval=24)
configs/second/hv_second_secfpn_6x8_80e_kitti-3d-3class.py (+4 −236)
---------------------------------------------------------------------

The whole config is replaced by `_base_` inheritance:

+_base_ = [
+    '../_base_/models/hv_second_secfpn.py',
+    '../_base_/datasets/kitti-3d-3class.py',
+    '../_base_/schedules/cyclic_40e.py',
+    '../_base_/default_runtime.py'
+]
-# model settings
-voxel_size = [0.05, 0.05, 0.1]
-point_cloud_range = [0, -40, -3, 70.4, 40, 1]
-model = dict(
-    type='VoxelNet',
-    voxel_layer=dict(
-        max_num_points=5,
-        point_cloud_range=point_cloud_range,
-        voxel_size=voxel_size,
-        max_voxels=(16000, 40000)),  # (training, testing) max_voxels
-    voxel_encoder=dict(type='HardSimpleVFE'),
-    middle_encoder=dict(
-        type='SparseEncoder',
-        in_channels=4,
-        sparse_shape=[41, 1600, 1408],
-        order=('conv', 'norm', 'act')),
-    backbone=dict(
-        type='SECOND',
-        in_channels=256,
-        layer_nums=[5, 5],
-        layer_strides=[1, 2],
-        out_channels=[128, 256]),
-    neck=dict(
-        type='SECONDFPN',
-        in_channels=[128, 256],
-        upsample_strides=[1, 2],
-        out_channels=[256, 256]),
-    bbox_head=dict(
-        type='Anchor3DHead',
-        num_classes=3,
-        in_channels=512,
-        feat_channels=512,
-        use_direction_classifier=True,
-        anchor_generator=dict(
-            type='Anchor3DRangeGenerator',
-            ranges=[
-                [0, -40.0, -0.6, 70.4, 40.0, -0.6],
-                [0, -40.0, -0.6, 70.4, 40.0, -0.6],
-                [0, -40.0, -1.78, 70.4, 40.0, -1.78],
-            ],
-            sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]],
-            rotations=[0, 1.57],
-            reshape_out=False),
-        diff_rad_by_sin=True,
-        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
-        loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2.0,
-                      alpha=0.25, loss_weight=1.0),
-        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
-        loss_dir=dict(type='CrossEntropyLoss', use_sigmoid=False,
-                      loss_weight=0.2)))
-# model training and testing settings
-train_cfg = dict(
-    assigner=[
-        dict(  # for Pedestrian
-            type='MaxIoUAssigner',
-            iou_calculator=dict(type='BboxOverlapsNearest3D'),
-            pos_iou_thr=0.35, neg_iou_thr=0.2, min_pos_iou=0.2,
-            ignore_iof_thr=-1),
-        dict(  # for Cyclist
-            type='MaxIoUAssigner',
-            iou_calculator=dict(type='BboxOverlapsNearest3D'),
-            pos_iou_thr=0.35, neg_iou_thr=0.2, min_pos_iou=0.2,
-            ignore_iof_thr=-1),
-        dict(  # for Car
-            type='MaxIoUAssigner',
-            iou_calculator=dict(type='BboxOverlapsNearest3D'),
-            pos_iou_thr=0.6, neg_iou_thr=0.45, min_pos_iou=0.45,
-            ignore_iof_thr=-1),
-    ],
-    allowed_border=0,
-    pos_weight=-1,
-    debug=False)
-test_cfg = dict(
-    use_rotate_nms=True,
-    nms_across_levels=False,
-    nms_thr=0.01,
-    score_thr=0.1,
-    min_bbox_size=0,
-    nms_pre=100,
-    max_num=50)
-# dataset settings
-dataset_type = 'KittiDataset'
-data_root = 'data/kitti/'
-class_names = ['Pedestrian', 'Cyclist', 'Car']
-input_modality = dict(use_lidar=True, use_camera=False)
-db_sampler = dict(
-    data_root=data_root,
-    info_path=data_root + 'kitti_dbinfos_train.pkl',
-    rate=1.0,
-    object_rot_range=[0.0, 0.0],
-    prepare=dict(
-        filter_by_difficulty=[-1],
-        filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)),
-    classes=class_names,
-    sample_groups=dict(Car=12, Pedestrian=6, Cyclist=6))
-file_client_args = dict(backend='disk')
-# file_client_args = dict(
-#     backend='petrel', path_mapping=dict(data='s3://kitti_data/'))
-train_pipeline = [
-    dict(type='LoadPointsFromFile', load_dim=4, use_dim=4,
-         file_client_args=file_client_args),
-    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True,
-         file_client_args=file_client_args),
-    dict(type='ObjectSample', db_sampler=db_sampler),
-    dict(
-        type='ObjectNoise',
-        num_try=100,
-        loc_noise_std=[1.0, 1.0, 0.5],
-        global_rot_range=[0.0, 0.0],
-        rot_uniform_noise=[-0.78539816, 0.78539816]),
-    dict(type='RandomFlip3D', flip_ratio=0.5),
-    dict(
-        type='GlobalRotScale',
-        rot_uniform_noise=[-0.78539816, 0.78539816],
-        scaling_uniform_noise=[0.95, 1.05]),
-    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
-    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
-    dict(type='PointShuffle'),
-    dict(type='DefaultFormatBundle3D', class_names=class_names),
-    dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']),
-]
-test_pipeline = [
-    dict(type='LoadPointsFromFile', load_dim=4, use_dim=4,
-         file_client_args=file_client_args),
-    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
-    dict(type='DefaultFormatBundle3D', class_names=class_names,
-         with_label=False),
-    dict(type='Collect3D', keys=['points']),
-]
-data = dict(
-    samples_per_gpu=6,
-    workers_per_gpu=4,
-    train=dict(
-        type='RepeatDataset',
-        times=2,
-        dataset=dict(
-            type=dataset_type, data_root=data_root,
-            ann_file=data_root + 'kitti_infos_train.pkl', split='training',
-            pts_prefix='velodyne_reduced', pipeline=train_pipeline,
-            modality=input_modality, classes=class_names, test_mode=False)),
-    val=dict(
-        type=dataset_type, data_root=data_root,
-        ann_file=data_root + 'kitti_infos_val.pkl', split='training',
-        pts_prefix='velodyne_reduced', pipeline=test_pipeline,
-        modality=input_modality, classes=class_names, test_mode=True),
-    test=dict(
-        type=dataset_type, data_root=data_root,
-        ann_file=data_root + 'kitti_infos_val.pkl', split='training',
-        pts_prefix='velodyne_reduced', pipeline=test_pipeline,
-        modality=input_modality, classes=class_names, test_mode=True))
-# optimizer
-lr = 0.0018  # max learning rate
-optimizer = dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01)
-optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2))
-lr_config = dict(policy='cyclic', target_ratio=(10, 1e-4), cyclic_times=1,
-                 step_ratio_up=0.4)
-momentum_config = dict(policy='cyclic', target_ratio=(0.85 / 0.95, 1),
-                       cyclic_times=1, step_ratio_up=0.4)
-checkpoint_config = dict(interval=1)
-evaluation = dict(interval=1)
-# yapf:disable
-log_config = dict(
-    interval=50,
-    hooks=[dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook')])
-# yapf:enable
-# runtime settings
-total_epochs = 40
-dist_params = dict(backend='nccl')
-log_level = 'INFO'
-work_dir = './work_dirs/sec_secfpn_80e'
-load_from = None
-resume_from = None
-workflow = [('train', 1)]
configs/second/hv_second_secfpn_6x8_80e_kitti-3d-car.py (+9 −190)
------------------------------------------------------------------

-# model settings
-voxel_size = [0.05, 0.05, 0.1]
-point_cloud_range = [0, -40, -3, 70.4, 40, 1]  # velodyne coordinates, x, y, z
+_base_ = [
+    '../_base_/models/hv_second_secfpn.py',
+    '../_base_/datasets/kitti-3d-car.py',
+    '../_base_/schedules/cyclic_40e.py',
+    '../_base_/default_runtime.py'
+]
+point_cloud_range = [0, -40, -3, 70.4, 40, 1]
 model = dict(
-    type='VoxelNet',
-    voxel_layer=dict(
-        max_num_points=5,  # max_points_per_voxel
-        point_cloud_range=point_cloud_range,
-        voxel_size=voxel_size,
-        max_voxels=(16000, 40000)),  # (training, testing) max_voxels
-    voxel_encoder=dict(type='HardSimpleVFE'),
-    middle_encoder=dict(
-        type='SparseEncoder',
-        in_channels=4,
-        sparse_shape=[41, 1600, 1408],
-        order=('conv', 'norm', 'act')),
-    backbone=dict(
-        type='SECOND',
-        in_channels=256,
-        layer_nums=[5, 5],
-        layer_strides=[1, 2],
-        out_channels=[128, 256]),
-    neck=dict(
-        type='SECONDFPN',
-        in_channels=[128, 256],
-        upsample_strides=[1, 2],
-        out_channels=[256, 256]),
     bbox_head=dict(
-        type='Anchor3DHead',
         num_classes=1,
-        in_channels=512,
-        feat_channels=512,
-        use_direction_classifier=True,
         anchor_generator=dict(
+            _delete_=True,
             type='Anchor3DRangeGenerator',
             ranges=[[0, -40.0, -1.78, 70.4, 40.0, -1.78]],
             sizes=[[1.6, 3.9, 1.56]],
             rotations=[0, 1.57],
-            reshape_out=True),
-        diff_rad_by_sin=True,
-        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
-        loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2.0,
-                      alpha=0.25, loss_weight=1.0),
-        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
-        loss_dir=dict(type='CrossEntropyLoss', use_sigmoid=False,
-                      loss_weight=0.2)))
+            reshape_out=True)))
 # model training and testing settings
 train_cfg = dict(
+    _delete_=True,
     assigner=dict(
         type='MaxIoUAssigner',
         iou_calculator=dict(type='BboxOverlapsNearest3D'),
         ...
@@ -66,146 +28,3 @@ train_cfg = dict(
     allowed_border=0,
     pos_weight=-1,
     debug=False)
-test_cfg = dict(
-    use_rotate_nms=True,
-    nms_across_levels=False,
-    nms_thr=0.01,
-    score_thr=0.1,
-    min_bbox_size=0,
-    nms_pre=100,
-    max_num=50)
-# dataset settings
-dataset_type = 'KittiDataset'
-data_root = 'data/kitti/'
-class_names = ['Car']
-input_modality = dict(use_lidar=True, use_camera=False)
-db_sampler = dict(
-    data_root=data_root,
-    info_path=data_root + 'kitti_dbinfos_train.pkl',
-    rate=1.0,
-    object_rot_range=[0.0, 0.0],
-    prepare=dict(
-        filter_by_difficulty=[-1],
-        filter_by_min_points=dict(Car=5)),
-    classes=class_names,
-    sample_groups=dict(Car=15))
-file_client_args = dict(backend='disk')
-# file_client_args = dict(
-#     backend='petrel', path_mapping=dict(data='s3://kitti_data/'))
-train_pipeline = [
-    dict(type='LoadPointsFromFile', load_dim=4, use_dim=4,
-         file_client_args=file_client_args),
-    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True,
-         file_client_args=file_client_args),
-    dict(type='ObjectSample', db_sampler=db_sampler),
-    dict(
-        type='ObjectNoise',
-        num_try=100,
-        loc_noise_std=[1.0, 1.0, 0.5],
-        global_rot_range=[0.0, 0.0],
-        rot_uniform_noise=[-0.78539816, 0.78539816]),
-    dict(type='RandomFlip3D', flip_ratio=0.5),
-    dict(
-        type='GlobalRotScale',
-        rot_uniform_noise=[-0.78539816, 0.78539816],
-        scaling_uniform_noise=[0.95, 1.05]),
-    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
-    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
-    dict(type='PointShuffle'),
-    dict(type='DefaultFormatBundle3D', class_names=class_names),
-    dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']),
-]
-test_pipeline = [
-    dict(type='LoadPointsFromFile', load_dim=4, use_dim=4,
-         file_client_args=file_client_args),
-    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
-    dict(type='DefaultFormatBundle3D', class_names=class_names,
-         with_label=False),
-    dict(type='Collect3D', keys=['points']),
-]
-data = dict(
-    samples_per_gpu=6,
-    workers_per_gpu=4,
-    train=dict(
-        type='RepeatDataset',
-        times=2,
-        dataset=dict(
-            type=dataset_type, data_root=data_root,
-            ann_file=data_root + 'kitti_infos_train.pkl', split='training',
-            pts_prefix='velodyne_reduced', pipeline=train_pipeline,
-            modality=input_modality, classes=class_names, test_mode=False)),
-    val=dict(
-        type=dataset_type, data_root=data_root,
-        ann_file=data_root + 'kitti_infos_val.pkl', split='training',
-        pts_prefix='velodyne_reduced', pipeline=test_pipeline,
-        modality=input_modality, classes=class_names, test_mode=True),
-    test=dict(
-        type=dataset_type, data_root=data_root,
-        ann_file=data_root + 'kitti_infos_val.pkl', split='training',
-        pts_prefix='velodyne_reduced', pipeline=test_pipeline,
-        modality=input_modality, classes=class_names, test_mode=True))
-# optimizer
-lr = 0.0018  # max learning rate
-optimizer = dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01)
-optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2))
-lr_config = dict(policy='cyclic', target_ratio=(10, 1e-4), cyclic_times=1,
-                 step_ratio_up=0.4)
-momentum_config = dict(policy='cyclic', target_ratio=(0.85 / 0.95, 1),
-                       cyclic_times=1, step_ratio_up=0.4)
-checkpoint_config = dict(interval=1)
-evaluation = dict(interval=1)
-# yapf:disable
-log_config = dict(
-    interval=50,
-    hooks=[dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook')])
-# yapf:enable
-# runtime settings
-total_epochs = 40
-dist_params = dict(backend='nccl')
-log_level = 'INFO'
-work_dir = './work_dirs/sec_secfpn_80e'
-load_from = None
-resume_from = None
-workflow = [('train', 1)]
configs/votenet/votenet_16x8_sunrgbd-3d-10class.py (+0 −10)
------------------------------------------------------------

@@ -19,13 +19,3 @@ model = dict(
             [0.404671, 1.071108, 1.688889],
             [0.76584, 1.398258, 0.472728]]),
     ))
 # optimizer
-# yapf:disable
-log_config = dict(
-    interval=30,
-    hooks=[
-        dict(type='TextLoggerHook'),
-        dict(type='TensorboardLoggerHook')
-    ])
-# yapf:enable
configs/votenet/votenet_8x8_scannet-3d-18class.py (+1 −1)
----------------------------------------------------------

@@ -34,7 +34,7 @@ model = dict(
 # optimizer
 # yapf:disable
 log_config = dict(
-    interval=50,
+    interval=30,
     hooks=[
         dict(type='TextLoggerHook'),
         dict(type='TensorboardLoggerHook')
...
docs/tutorials/data_pipeline.md (+1 −1)
----------------------------------------

@@ -117,7 +117,7 @@ For each operation, we list the related dict fields that are added/updated/removed.
 - update: img, proposals, gt_bboxes, gt_bboxes_ignore, gt_labels, gt_masks, gt_semantic_seg

 `Collect`
-- add: img_meta (the keys of img_meta is specified by `meta_keys`)
+- add: img_metas (the keys of img_metas is specified by `meta_keys`)
 - remove: all other keys except for those specified by `keys`

 ### Test time augmentation
...
mmdet3d/apis/train.py (+2 −2)
------------------------------

@@ -28,8 +28,8 @@ def batch_processor(model, data, train_mode):
     losses = model(**data)
     loss, log_vars = parse_losses(losses)

-    if 'img_meta' in data:
-        num_samples = len(data['img_meta'].data)
+    if 'img_metas' in data:
+        num_samples = len(data['img_metas'].data)
     else:
         num_samples = len(data['img'].data)
     outputs = dict(loss=loss, log_vars=log_vars, num_samples=num_samples)
...
mmdet3d/core/bbox/__init__.py (+4 −3)
--------------------------------------

@@ -8,8 +8,9 @@ from .samplers import (BaseSampler, CombinedSampler,
                        InstanceBalancedPosSampler, IoUBalancedNegSampler,
                        PseudoSampler, RandomSampler, SamplingResult)
 from .structures import (BaseInstance3DBoxes, Box3DMode, CameraInstance3DBoxes,
-                         DepthInstance3DBoxes, LiDARInstance3DBoxes)
-from .transforms import (bbox3d2result, bbox3d2roi,
+                         DepthInstance3DBoxes, LiDARInstance3DBoxes,
+                         xywhr2xyxyr)
+from .transforms import (bbox3d2result, bbox3d2roi, bbox3d_mapping_back,
                          box3d_to_corner3d_upright_depth,
                          boxes3d_to_bev_torch_lidar)
...
@@ -27,5 +28,5 @@ __all__ = [
     'bbox_overlaps_3d', 'Box3DMode', 'LiDARInstance3DBoxes',
     'CameraInstance3DBoxes', 'bbox3d2roi', 'bbox3d2result',
     'box3d_to_corner3d_upright_depth', 'DepthInstance3DBoxes',
-    'BaseInstance3DBoxes'
+    'BaseInstance3DBoxes', 'bbox3d_mapping_back', 'xywhr2xyxyr'
 ]
mmdet3d/core/bbox/structures/__init__.py (+2 −1)
-------------------------------------------------

@@ -3,8 +3,9 @@ from .box_3d_mode import Box3DMode
 from .cam_box3d import CameraInstance3DBoxes
 from .depth_box3d import DepthInstance3DBoxes
 from .lidar_box3d import LiDARInstance3DBoxes
+from .utils import xywhr2xyxyr

 __all__ = [
     'Box3DMode', 'BaseInstance3DBoxes', 'LiDARInstance3DBoxes',
-    'CameraInstance3DBoxes', 'DepthInstance3DBoxes'
+    'CameraInstance3DBoxes', 'DepthInstance3DBoxes', 'xywhr2xyxyr'
 ]
mmdet3d/core/bbox/structures/base_box3d.py (+4 −1)
---------------------------------------------------

@@ -334,7 +334,10 @@ class BaseInstance3DBoxes(object):
         # use torch.cat (v.s. layers.cat)
         # so the returned boxes never share storage with input
-        cat_boxes = cls(torch.cat([b.tensor for b in boxes_list], dim=0))
+        cat_boxes = cls(
+            torch.cat([b.tensor for b in boxes_list], dim=0),
+            box_dim=boxes_list[0].tensor.shape[1],
+            with_yaw=boxes_list[0].with_yaw)
         return cat_boxes

     def to(self, device):
...
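This fix matters once TTA concatenates boxes across views: the nuScenes configs above use `code_size=9` (two extra velocity components appended via `custom_values`), and rebuilding the concatenated tensor through `cls(...)` with the default box dimension would presumably mishandle those extra columns. Propagating `box_dim` and `with_yaw` from the first element keeps the merged boxes consistent with their inputs.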
mmdet3d/core/bbox/transforms.py (+10 −0)
-----------------------------------------

 import torch


+def bbox3d_mapping_back(bboxes, scale_factor, flip):
+    """Map bboxes from testing scale to original image scale"""
+    new_bboxes = bboxes.clone()
+    if flip:
+        new_bboxes.flip()
+    new_bboxes.scale(1 / scale_factor)
+    return new_bboxes
+
+
 def transform_lidar_to_cam(boxes_lidar):
     """
     Only transform format, not exactly in camera coords
...
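`bbox3d_mapping_back` is the inverse of the test-time augmentation: it un-flips and then un-scales, returning boxes detected in an augmented view to the original point cloud frame. A toy check of the inversion — `ToyBoxes` is a hypothetical stand-in exposing only the `clone`/`flip`/`scale` methods the function relies on, and the function body is copied verbatim from the diff above so the snippet runs standalone:

    def bbox3d_mapping_back(bboxes, scale_factor, flip):
        new_bboxes = bboxes.clone()
        if flip:
            new_bboxes.flip()
        new_bboxes.scale(1 / scale_factor)
        return new_bboxes


    class ToyBoxes:
        """Hypothetical stand-in for BaseInstance3DBoxes (centers only)."""

        def __init__(self, centers):
            self.centers = [list(c) for c in centers]

        def clone(self):
            return ToyBoxes(self.centers)

        def flip(self):  # horizontal flip in bird's-eye view: y -> -y
            for c in self.centers:
                c[1] = -c[1]

        def scale(self, factor):
            for c in self.centers:
                c[:] = [v * factor for v in c]


    # A box detected in a view that was scaled by 1.05 and flipped:
    aug_box = ToyBoxes([[10.5, -2.1, -1.05]])
    orig = bbox3d_mapping_back(aug_box, scale_factor=1.05, flip=True)
    print(orig.centers)  # [[10.0, 2.0, -1.0]] -- back in the original frame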
mmdet3d/core/post_processing/__init__.py (+2 −1)
-------------------------------------------------

@@ -2,9 +2,10 @@ from mmdet.core.post_processing import (merge_aug_bboxes, merge_aug_masks,
                                         merge_aug_proposals, merge_aug_scores,
                                         multiclass_nms)
 from .box3d_nms import aligned_3d_nms, box3d_multiclass_nms
+from .merge_augs import merge_aug_bboxes_3d

 __all__ = [
     'multiclass_nms', 'merge_aug_proposals', 'merge_aug_bboxes',
     'merge_aug_scores', 'merge_aug_masks', 'box3d_multiclass_nms',
-    'aligned_3d_nms'
+    'aligned_3d_nms', 'merge_aug_bboxes_3d'
 ]
mmdet3d/core/post_processing/box3d_nms.py (+1 −1)
--------------------------------------------------

@@ -61,7 +61,7 @@ def box3d_multiclass_nms(mlvl_bboxes,
     else:
         bboxes = mlvl_scores.new_zeros((0, mlvl_bboxes.size(-1)))
         scores = mlvl_scores.new_zeros((0, ))
-        labels = mlvl_scores.new_zeros((0, mlvl_scores.size(-1)))
+        labels = mlvl_scores.new_zeros((0, ), dtype=torch.long)
         dir_scores = mlvl_scores.new_zeros((0, ))
     return bboxes, scores, labels, dir_scores
...
mmdet3d/core/post_processing/merge_augs.py (new file, 0 → 100644) (+87 −0)
----------------------------------------------------------------------------

import torch

from mmdet3d.ops.iou3d.iou3d_utils import nms_gpu, nms_normal_gpu
from ..bbox import bbox3d2result, bbox3d_mapping_back, xywhr2xyxyr


def merge_aug_bboxes_3d(aug_results, img_metas, test_cfg):
    """Merge augmented detection 3D bboxes and scores.

    Args:
        aug_results (list[dict]): The dict of detection results.
            The dict contains the following keys

            - boxes_3d (:obj:`BaseInstance3DBoxes`): detection bbox
            - scores_3d (torch.Tensor): detection scores
            - labels_3d (torch.Tensor): predicted box labels
        img_metas (list[dict]): Meta information of each sample
        test_cfg (dict): Test config.

    Returns:
        dict: bbox results in cpu mode, containing the merged results

            - boxes_3d (:obj:`BaseInstance3DBoxes`): merged detection bbox
            - scores_3d (torch.Tensor): merged detection scores
            - labels_3d (torch.Tensor): merged predicted box labels
    """
    assert len(aug_results) == len(img_metas), \
        '"aug_results" should have the same length as "img_metas", got len(' \
        f'aug_results)={len(aug_results)} and len(img_metas)={len(img_metas)}'

    recovered_bboxes = []
    recovered_scores = []
    recovered_labels = []

    for bboxes, img_info in zip(aug_results, img_metas):
        scale_factor = img_info[0]['pcd_scale_factor']
        flip = img_info[0]['pcd_flip']
        recovered_scores.append(bboxes['scores_3d'])
        recovered_labels.append(bboxes['labels_3d'])
        bboxes = bbox3d_mapping_back(bboxes['boxes_3d'], scale_factor, flip)
        recovered_bboxes.append(bboxes)

    aug_bboxes = recovered_bboxes[0].cat(recovered_bboxes)
    aug_bboxes_for_nms = xywhr2xyxyr(aug_bboxes.bev)
    aug_scores = torch.cat(recovered_scores, dim=0)
    aug_labels = torch.cat(recovered_labels, dim=0)

    # TODO: use a more elegant way to deal with nms
    if test_cfg.use_rotate_nms:
        nms_func = nms_gpu
    else:
        nms_func = nms_normal_gpu

    merged_bboxes = []
    merged_scores = []
    merged_labels = []

    # Apply multi-class nms when merging bboxes
    if len(aug_labels) == 0:
        return bbox3d2result(aug_bboxes, aug_scores, aug_labels)

    for class_id in range(torch.max(aug_labels).item() + 1):
        class_inds = (aug_labels == class_id)
        bboxes_i = aug_bboxes[class_inds]
        bboxes_nms_i = aug_bboxes_for_nms[class_inds, :]
        scores_i = aug_scores[class_inds]
        labels_i = aug_labels[class_inds]
        if len(bboxes_nms_i) == 0:
            continue
        selected = nms_func(bboxes_nms_i, scores_i, test_cfg.nms_thr)

        merged_bboxes.append(bboxes_i[selected, :])
        merged_scores.append(scores_i[selected])
        merged_labels.append(labels_i[selected])

    merged_bboxes = merged_bboxes[0].cat(merged_bboxes)
    merged_scores = torch.cat(merged_scores, dim=0)
    merged_labels = torch.cat(merged_labels, dim=0)

    _, order = merged_scores.sort(0, descending=True)
    num = min(test_cfg.max_num, len(aug_bboxes))
    order = order[:num]

    merged_bboxes = merged_bboxes[order]
    merged_scores = merged_scores[order]
    merged_labels = merged_labels[order]

    return bbox3d2result(merged_bboxes, merged_scores, merged_labels)
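The merge pattern above — pool detections from all mapped-back views, run NMS per class, then sort by score and cap at `test_cfg.max_num` — can be exercised without the CUDA ops. A CPU-only sketch on axis-aligned 2D boxes; `simple_iou_nms` is a hypothetical stand-in for the rotated `nms_gpu`, and real calls go through `merge_aug_bboxes_3d` with `BaseInstance3DBoxes` inputs:

    import torch


    def simple_iou_nms(boxes, scores, thr):
        """Greedy NMS on (x1, y1, x2, y2) boxes; returns kept local indices."""
        order = scores.argsort(descending=True)
        keep = []
        while order.numel() > 0:
            i = order[0].item()
            keep.append(i)
            if order.numel() == 1:
                break
            rest = order[1:]
            lt = torch.max(boxes[i, :2], boxes[rest, :2])
            rb = torch.min(boxes[i, 2:], boxes[rest, 2:])
            wh = (rb - lt).clamp(min=0)
            inter = wh[:, 0] * wh[:, 1]
            area_i = (boxes[i, 2] - boxes[i, 0]) * (boxes[i, 3] - boxes[i, 1])
            area_r = (boxes[rest, 2] - boxes[rest, 0]) * \
                     (boxes[rest, 3] - boxes[rest, 1])
            iou = inter / (area_i + area_r - inter)
            order = rest[iou <= thr]
        return torch.tensor(keep, dtype=torch.long)


    # Detections pooled from two TTA views (already mapped back): the
    # near-duplicate pair collapses to the higher-scoring box, per class.
    boxes = torch.tensor([[0., 0., 2., 2.], [0.1, 0., 2., 2.1], [5., 5., 7., 7.]])
    scores = torch.tensor([0.9, 0.8, 0.7])
    labels = torch.tensor([0, 0, 1])

    merged = []
    for cls_id in range(int(labels.max()) + 1):
        inds = (labels == cls_id).nonzero(as_tuple=True)[0]
        keep = simple_iou_nms(boxes[inds], scores[inds], thr=0.5)
        merged.append(inds[keep])
    merged = torch.cat(merged)

    # Final score sort and cap, mirroring test_cfg.max_num above:
    order = scores[merged].argsort(descending=True)
    merged = merged[order][:50]
    print(boxes[merged], scores[merged], labels[merged])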