OpenDAS / mmdetection3d / Commits

Commit 191288eb, authored Jun 07, 2020 by zhangwenwei
Commit message: Clean voxel encoders
Parent: 27ebcfac
Showing 20 changed files with 498 additions and 914 deletions.
configs/fileclient/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d.py                 +0   -243
configs/fileclient/hv_second_secfpn_6x8_80e_fileclient_kitti-3d-car.py             +0   -218
configs/kitti/dv_mvx-v2_second_secfpn_fpn-fusion_adamw_2x8_80e_kitti-3d-3class.py  +2   -2
configs/kitti/dv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py                      +2   -2
configs/kitti/dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class.py                   +1   -2
configs/kitti/dv_second_secfpn_6x8_80e_kitti-3d-car.py                             +1   -2
configs/kitti/faster_rcnn_r50_fpn_caffe_1x_kitti-2d-3class_coco-3x-pretrain.py     +1   -1
configs/kitti/hv_PartA2_secfpn_4x8_cyclic_80e_kitti-3d-3class.py                   +1   -5
configs/kitti/hv_PartA2_secfpn_4x8_cyclic_80e_kitti-3d-car.py                      +1   -5
configs/kitti/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py                      +2   -2
configs/kitti/hv_second_secfpn_6x8_80e_kitti-3d-car.py                             +20  -8
configs/nus/faster_rcnn_r50_fpn_caffe_2x8_1x_nus.py                                +20  -8
configs/nus/faster_rcnn_regnet-3gf_fpn_2x8_1x_nus.py                               +206 -0
configs/nus/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d.py                        +28  -10
mmdet3d/core/bbox/box_np_ops.py                                                    +2   -2
mmdet3d/models/roi_heads/bbox_heads/parta2_bbox_head.py                            +2   -4
mmdet3d/models/voxel_encoders/__init__.py                                          +4   -5
mmdet3d/models/voxel_encoders/pillar_encoder.py                                    +40  -184
mmdet3d/models/voxel_encoders/utils.py                                             +13  -31
mmdet3d/models/voxel_encoders/voxel_encoder.py                                     +152 -180
File: configs/fileclient/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d.py (deleted, 100644 → 0; shown as of parent 27ebcfac)
# model settings
voxel_size = [0.25, 0.25, 8]
point_cloud_range = [-50, -50, -5, 50, 50, 3]
class_names = [
    'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
    'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
]
model = dict(
    type='MVXFasterRCNNV2',
    pts_voxel_layer=dict(
        max_num_points=64,  # max_points_per_voxel
        point_cloud_range=point_cloud_range,  # velodyne coordinates, x, y, z
        voxel_size=voxel_size,
        max_voxels=(30000, 40000),  # (training, testing) max_voxels
    ),
    pts_voxel_encoder=dict(
        type='HardVFE',
        num_input_features=4,
        num_filters=[64, 64],
        with_distance=False,
        voxel_size=voxel_size,
        with_cluster_center=True,
        with_voxel_center=True,
        point_cloud_range=point_cloud_range,
        norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)),
    pts_middle_encoder=dict(
        type='PointPillarsScatter',
        in_channels=64,
        output_shape=[400, 400],  # checked from PointCloud3D
    ),
    pts_backbone=dict(
        type='SECOND',
        in_channels=64,
        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
        layer_nums=[3, 5, 5],
        layer_strides=[2, 2, 2],
        out_channels=[64, 128, 256],
    ),
    pts_neck=dict(
        type='SECONDFPN',
        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
        in_channels=[64, 128, 256],
        upsample_strides=[1, 2, 4],
        out_channels=[128, 128, 128],
    ),
    pts_bbox_head=dict(
        type='Anchor3DHead',
        num_classes=10,
        in_channels=384,
        feat_channels=384,
        use_direction_classifier=True,
        anchor_generator=dict(
            type='Anchor3DRangeGenerator',
            ranges=[
                [-49.6, -49.6, -1.80032795, 49.6, 49.6, -1.80032795],
                [-49.6, -49.6, -1.74440365, 49.6, 49.6, -1.74440365],
                [-49.6, -49.6, -1.68526504, 49.6, 49.6, -1.68526504],
                [-49.6, -49.6, -1.67339111, 49.6, 49.6, -1.67339111],
                [-49.6, -49.6, -1.61785072, 49.6, 49.6, -1.61785072],
                [-49.6, -49.6, -1.80984986, 49.6, 49.6, -1.80984986],
                [-49.6, -49.6, -1.763965, 49.6, 49.6, -1.763965],
            ],
            sizes=[
                [1.95017717, 4.60718145, 1.72270761],  # car
                [2.4560939, 6.73778078, 2.73004906],  # truck
                [2.87427237, 12.01320693, 3.81509561],  # trailer
                [0.60058911, 1.68452161, 1.27192197],  # bicycle
                [0.66344886, 0.7256437, 1.75748069],  # pedestrian
                [0.39694519, 0.40359262, 1.06232151],  # traffic_cone
                [2.49008838, 0.48578221, 0.98297065],  # barrier
            ],
            custom_values=[0, 0],
            rotations=[0, 1.57],
            reshape_out=True),
        assigner_per_size=False,
        diff_rad_by_sin=True,
        dir_offset=0.7854,  # pi/4
        dir_limit_offset=0,
        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
        loss_dir=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)))
# model training and testing settings
train_cfg = dict(
    pts=dict(
        assigner=dict(  # for Car
            type='MaxIoUAssigner',
            iou_calculator=dict(type='BboxOverlapsNearest3D'),
            pos_iou_thr=0.6,
            neg_iou_thr=0.3,
            min_pos_iou=0.3,
            ignore_iof_thr=-1),
        allowed_border=0,
        code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
        pos_weight=-1,
        debug=False))
test_cfg = dict(
    pts=dict(
        use_rotate_nms=True,
        nms_across_levels=False,
        nms_pre=1000,
        nms_thr=0.2,
        score_thr=0.05,
        min_bbox_size=0,
        max_num=500
        # soft-nms is also supported for rcnn testing
        # e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05)
    ))
# dataset settings
dataset_type = 'NuScenesDataset'
data_root = 'data/nuscenes/'
img_norm_cfg = dict(
    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
input_modality = dict(
    use_lidar=True,
    use_depth=False,
    use_lidar_intensity=True,
    use_camera=False,
)
db_sampler = dict(
    data_root=data_root,
    info_path=data_root + 'nuscenes_dbinfos_train.pkl',
    rate=1.0,
    object_rot_range=[0.0, 0.0],
    prepare=dict(),
    classes=class_names,
    sample_groups=dict(
        bus=4,
        trailer=4,
        truck=4,
    ))
file_client_args = dict(
    backend='petrel',
    path_mapping=dict({
        './data/nuscenes/': 's3://nuscenes/nuscenes/',
        'data/nuscenes/': 's3://nuscenes/nuscenes/'
    }))
train_pipeline = [
    dict(
        type='LoadPointsFromFile',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(
        type='LoadPointsFromMultiSweeps',
        sweeps_num=10,
        file_client_args=file_client_args),
    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
    dict(
        type='GlobalRotScale',
        rot_uniform_noise=[-0.3925, 0.3925],
        scaling_uniform_noise=[0.95, 1.05],
        trans_normal_noise=[0, 0, 0]),
    dict(type='RandomFlip3D', flip_ratio=0.5),
    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='PointShuffle'),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']),
]
test_pipeline = [
    dict(
        type='LoadPointsFromFile',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(
        type='LoadPointsFromMultiSweeps',
        sweeps_num=10,
        file_client_args=file_client_args),
    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='RandomFlip3D', flip_ratio=0),
    dict(
        type='DefaultFormatBundle3D',
        class_names=class_names,
        with_label=False),
    dict(type='Collect3D', keys=['points']),
]
data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_train.pkl',
        pipeline=train_pipeline,
        modality=input_modality,
        classes=class_names,
        test_mode=False),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_val.pkl',
        pipeline=test_pipeline,
        modality=input_modality,
        classes=class_names,
        test_mode=True),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_val.pkl',
        pipeline=test_pipeline,
        modality=input_modality,
        classes=class_names,
        test_mode=True))
# optimizer
optimizer = dict(type='AdamW', lr=0.001, weight_decay=0.01)
# max_norm=10 is better for SECOND
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=1000,
    warmup_ratio=1.0 / 1000,
    step=[20, 23])
momentum_config = None
checkpoint_config = dict(interval=1)
# yapf:disable
evaluation = dict(interval=24)
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook')
    ])
# yapf:enable
# runtime settings
total_epochs = 24
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d'
load_from = None
resume_from = None
workflow = [('train', 1)]
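Aside (not part of the commit): while it existed, a config file like the one above was consumed through mmcv's config loader, which executes the Python module and exposes its top-level variables. A minimal sketch, assuming mmcv is installed and the file is still on disk:

    from mmcv import Config

    # Config.fromfile executes the config module; top-level variables such as
    # voxel_size or model become attributes of the returned Config object.
    cfg = Config.fromfile(
        'configs/fileclient/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d.py')
    print(cfg.voxel_size)                    # [0.25, 0.25, 8]
    print(cfg.model.pts_voxel_encoder.type)  # 'HardVFE'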
File: configs/fileclient/hv_second_secfpn_6x8_80e_fileclient_kitti-3d-car.py (deleted, 100644 → 0; shown as of parent 27ebcfac)
# model settings
voxel_size = [0.05, 0.05, 0.1]
point_cloud_range = [0, -40, -3, 70.4, 40, 1]  # velodyne coordinates, x, y, z
model = dict(
    type='VoxelNet',
    voxel_layer=dict(
        max_num_points=5,  # max_points_per_voxel
        point_cloud_range=point_cloud_range,
        voxel_size=voxel_size,
        max_voxels=(16000, 40000),  # (training, testing) max_voxels
    ),
    voxel_encoder=dict(
        type='VoxelFeatureExtractorV3',
        num_input_features=4,
        num_filters=[4],
        with_distance=False),
    middle_encoder=dict(
        type='SparseEncoder',
        in_channels=4,
        sparse_shape=[41, 1600, 1408],
        order=('conv', 'norm', 'act')),
    backbone=dict(
        type='SECOND',
        in_channels=256,
        layer_nums=[5, 5],
        layer_strides=[1, 2],
        out_channels=[128, 256],
    ),
    neck=dict(
        type='SECONDFPN',
        in_channels=[128, 256],
        upsample_strides=[1, 2],
        out_channels=[256, 256],
    ),
    bbox_head=dict(
        type='Anchor3DHead',
        num_classes=1,
        in_channels=512,
        feat_channels=512,
        use_direction_classifier=True,
        anchor_generator=dict(
            type='Anchor3DRangeGenerator',
            ranges=[[0, -40.0, -1.78, 70.4, 40.0, -1.78]],
            sizes=[[1.6, 3.9, 1.56]],
            rotations=[0, 1.57],
            reshape_out=True),
        diff_rad_by_sin=True,
        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
        loss_dir=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2),
    ),
)
# model training and testing settings
train_cfg = dict(
    assigner=dict(
        type='MaxIoUAssigner',
        iou_calculator=dict(type='BboxOverlapsNearest3D'),
        pos_iou_thr=0.6,
        neg_iou_thr=0.45,
        min_pos_iou=0.45,
        ignore_iof_thr=-1),
    allowed_border=0,
    pos_weight=-1,
    debug=False)
test_cfg = dict(
    use_rotate_nms=True,
    nms_across_levels=False,
    nms_thr=0.01,
    score_thr=0.3,
    min_bbox_size=0,
    nms_pre=100,
    max_num=50)
# dataset settings
dataset_type = 'KittiDataset'
data_root = 'data/kitti/'
class_names = ['Car']
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
input_modality = dict(
    use_lidar=False,
    use_lidar_reduced=True,
    use_depth=False,
    use_lidar_intensity=True,
    use_camera=False,
)
db_sampler = dict(
    data_root=data_root,
    info_path=data_root + 'kitti_dbinfos_train.pkl',
    rate=1.0,
    object_rot_range=[0.0, 0.0],
    prepare=dict(
        filter_by_difficulty=[-1],
        filter_by_min_points=dict(Car=5),
    ),
    classes=class_names,
    sample_groups=dict(Car=15),
)
file_client_args = dict(
    backend='petrel', path_mapping=dict(data='s3://kitti_data/'))
train_pipeline = [
    dict(
        type='LoadPointsFromFile',
        load_dim=4,
        use_dim=4,
        file_client_args=file_client_args),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        file_client_args=file_client_args),
    dict(type='ObjectSample', db_sampler=db_sampler),
    dict(
        type='ObjectNoise',
        num_try=100,
        loc_noise_std=[1.0, 1.0, 0.5],
        global_rot_range=[0.0, 0.0],
        rot_uniform_noise=[-0.78539816, 0.78539816]),
    dict(type='RandomFlip3D', flip_ratio=0.5),
    dict(
        type='GlobalRotScale',
        rot_uniform_noise=[-0.78539816, 0.78539816],
        scaling_uniform_noise=[0.95, 1.05]),
    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='PointShuffle'),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']),
]
test_pipeline = [
    dict(
        type='LoadPointsFromFile',
        load_dim=4,
        use_dim=4,
        file_client_args=file_client_args),
    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
    dict(
        type='DefaultFormatBundle3D',
        class_names=class_names,
        with_label=False),
    dict(type='Collect3D', keys=['points']),
]
data = dict(
    samples_per_gpu=6,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'kitti_infos_train.pkl',
        split='training',
        pts_prefix='velodyne_reduced',
        pipeline=train_pipeline,
        modality=input_modality,
        classes=class_names,
        test_mode=False),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'kitti_infos_val.pkl',
        split='training',
        pts_prefix='velodyne_reduced',
        pipeline=test_pipeline,
        modality=input_modality,
        classes=class_names,
        test_mode=True),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'kitti_infos_val.pkl',
        split='training',
        pts_prefix='velodyne_reduced',
        pipeline=test_pipeline,
        modality=input_modality,
        classes=class_names,
        test_mode=True))
# optimizer
lr = 0.0018  # max learning rate
optimizer = dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01)
optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2))
lr_config = dict(
    policy='cyclic',
    target_ratio=(10, 1e-4),
    cyclic_times=1,
    step_ratio_up=0.4,
)
momentum_config = dict(
    policy='cyclic',
    target_ratio=(0.85 / 0.95, 1),
    cyclic_times=1,
    step_ratio_up=0.4,
)
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook')
    ])
# yapf:enable
# runtime settings
total_epochs = 80
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/sec_secfpn_80e'
load_from = None
resume_from = None
workflow = [('train', 1)]
File: configs/kitti/dv_mvx-v2_second_secfpn_fpn-fusion_adamw_2x8_80e_kitti-3d-3class.py (view file @ 191288eb)

@@ -28,8 +28,8 @@ model = dict(
     ),
     pts_voxel_encoder=dict(
         type='DynamicVFE',
-        num_input_features=4,
-        num_filters=[64, 64],
+        in_channels=4,
+        feat_channels=[64, 64],
         with_distance=False,
         voxel_size=voxel_size,
         with_cluster_center=True,
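This hunk is the config-side half of the rename carried out in pillar_encoder.py and voxel_encoder.py below: every voxel encoder now takes in_channels/feat_channels instead of num_input_features/num_filters. A hypothetical one-off migration helper (not part of the commit; the key table is taken directly from these diffs) could look like:

    # Sketch of an illustrative migration helper (hypothetical, not in the
    # repo): rewrite old-style voxel-encoder config keys to the new names.
    def migrate_voxel_encoder_cfg(cfg: dict) -> dict:
        renames = {
            'num_input_features': 'in_channels',
            'num_filters': 'feat_channels',
        }
        return {renames.get(k, k): v for k, v in cfg.items()}

    old = dict(type='DynamicVFE', num_input_features=4, num_filters=[64, 64])
    new = migrate_voxel_encoder_cfg(old)
    # {'type': 'DynamicVFE', 'in_channels': 4, 'feat_channels': [64, 64]}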
File: configs/kitti/dv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py (view file @ 191288eb)

@@ -12,8 +12,8 @@ model = dict(
     ),
     voxel_encoder=dict(
         type='DynamicPillarFeatureNet',
-        num_input_features=4,
-        num_filters=[64],
+        in_channels=4,
+        feat_channels=[64],
         with_distance=False,
         voxel_size=voxel_size,
         point_cloud_range=point_cloud_range,
File: configs/kitti/dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class.py (view file @ 191288eb)

@@ -11,8 +11,7 @@ model = dict(
         max_voxels=(-1, -1),  # (training, testing) max_voxels
     ),
     voxel_encoder=dict(
-        type='DynamicVFEV3',
-        num_input_features=4,
+        type='DynamicSimpleVFE',
         voxel_size=voxel_size,
         point_cloud_range=point_cloud_range),
     middle_encoder=dict(
File: configs/kitti/dv_second_secfpn_6x8_80e_kitti-3d-car.py (view file @ 191288eb)

@@ -11,8 +11,7 @@ model = dict(
         max_voxels=(-1, -1),  # (training, testing) max_voxels
     ),
     voxel_encoder=dict(
-        type='DynamicVFEV3',
-        num_input_features=4,
+        type='DynamicSimpleVFE',
        voxel_size=voxel_size,
         point_cloud_range=point_cloud_range),
     middle_encoder=dict(
File: configs/kitti/faster_rcnn_r50_fpn_caffe_1x_kitti-2d-3class_coco-3x-pretrain.py (view file @ 191288eb)

@@ -2,7 +2,7 @@
 norm_cfg = dict(type='BN', requires_grad=False)
 model = dict(
     type='FasterRCNN',
-    pretrained=('open-mmlab://resnet50_caffe_bgr'),
+    pretrained=('open-mmlab://detectron2/resnet50_caffe'),
     backbone=dict(
         type='ResNet',
         depth=50,
File: configs/kitti/hv_PartA2_secfpn_4x8_cyclic_80e_kitti-3d-3class.py (view file @ 191288eb)

@@ -10,11 +10,7 @@ model = dict(
         voxel_size=voxel_size,
         max_voxels=(16000, 40000)  # (training, testing) max_voxels
     ),
-    voxel_encoder=dict(
-        type='VoxelFeatureExtractorV3',
-        num_input_features=4,
-        num_filters=[4],
-        with_distance=False),
+    voxel_encoder=dict(type='HardSimpleVFE'),
     middle_encoder=dict(
         type='SparseUNet',
         in_channels=4,
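This replacement dict is so much smaller because HardSimpleVFE (introduced in voxel_encoder.py below) takes no constructor arguments: per its new docstring it only averages the values of points in a voxel, and the old VoxelFeatureExtractorV3.__init__ stored but never used its filter arguments. Restating the swap from the hunk above as plain config code:

    # Before this commit: argument-heavy encoder whose arguments were unused.
    voxel_encoder = dict(
        type='VoxelFeatureExtractorV3',
        num_input_features=4,
        num_filters=[4],
        with_distance=False)

    # After this commit: the same behavior, spelled directly.
    voxel_encoder = dict(type='HardSimpleVFE')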
File: configs/kitti/hv_PartA2_secfpn_4x8_cyclic_80e_kitti-3d-car.py (view file @ 191288eb)

@@ -10,11 +10,7 @@ model = dict(
         voxel_size=voxel_size,
         max_voxels=(16000, 40000)  # (training, testing) max_voxels
     ),
-    voxel_encoder=dict(
-        type='VoxelFeatureExtractorV3',
-        num_input_features=4,
-        num_filters=[4],
-        with_distance=False),
+    voxel_encoder=dict(type='HardSimpleVFE'),
     middle_encoder=dict(
         type='SparseUNet',
         in_channels=4,
File: configs/kitti/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py (view file @ 191288eb)

@@ -10,8 +10,8 @@ model = dict(
     ),
     voxel_encoder=dict(
         type='PillarFeatureNet',
-        num_input_features=4,
-        num_filters=[64],
+        in_channels=4,
+        feat_channels=[64],
         with_distance=False,
         # these two arguments should be consistent with the voxel_generator
         voxel_size=[0.16, 0.16, 4],
File: configs/kitti/hv_second_secfpn_6x8_80e_kitti-3d-car.py (view file @ 191288eb)

@@ -10,11 +10,7 @@ model = dict(
         voxel_size=voxel_size,
         max_voxels=(16000, 40000),  # (training, testing) max_voxels
     ),
-    voxel_encoder=dict(
-        type='VoxelFeatureExtractorV3',
-        num_input_features=4,
-        num_filters=[4],
-        with_distance=False),
+    voxel_encoder=dict(type='HardSimpleVFE'),
     middle_encoder=dict(
         type='SparseEncoder',
         in_channels=4,

@@ -104,9 +100,21 @@ db_sampler = dict(
     classes=class_names,
     sample_groups=dict(Car=15),
 )
+file_client_args = dict(backend='disk')
+# file_client_args = dict(
+#     backend='petrel', path_mapping=dict(data='s3://kitti_data/'))
 train_pipeline = [
-    dict(type='LoadPointsFromFile', load_dim=4, use_dim=4),
-    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(
+        type='LoadPointsFromFile',
+        load_dim=4,
+        use_dim=4,
+        file_client_args=file_client_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=True,
+        with_label_3d=True,
+        file_client_args=file_client_args),
     dict(type='ObjectSample', db_sampler=db_sampler),
     dict(
         type='ObjectNoise',

@@ -126,7 +134,11 @@ train_pipeline = [
     dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']),
 ]
 test_pipeline = [
-    dict(type='LoadPointsFromFile', load_dim=4, use_dim=4),
+    dict(
+        type='LoadPointsFromFile',
+        load_dim=4,
+        use_dim=4,
+        file_client_args=file_client_args),
     dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
     dict(
         type='DefaultFormatBundle3D',
File: configs/nus/faster_rcnn_r50_fpn_caffe_2x8_1x_nus.py (view file @ 191288eb)

@@ -2,7 +2,7 @@
 norm_cfg = dict(type='BN', requires_grad=False)
 model = dict(
     type='FasterRCNN',
-    pretrained=('open-mmlab://resnet50_caffe_bgr'),
+    pretrained=('open-mmlab://detectron2/resnet50_caffe'),
     backbone=dict(
         type='ResNet',
         depth=50,

@@ -120,13 +120,25 @@ classes = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
            'motorcycle', 'pedestrian', 'traffic_cone', 'barrier')
 img_norm_cfg = dict(
     mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+# file_client_args = dict(backend='disk')
+file_client_args = dict(
+    backend='petrel',
+    path_mapping=dict({
+        './data/nuscenes/': 's3://nuscenes/nuscenes/',
+        'data/nuscenes/': 's3://nuscenes/nuscenes/'
+    }))
 train_pipeline = [
-    dict(type='LoadImageFromFile'),
-    dict(type='LoadAnnotations', with_bbox=True, with_mask=False),
+    dict(type='LoadImageFromFile', file_client_args=file_client_args),
+    dict(
+        type='LoadAnnotations',
+        with_bbox=True,
+        with_mask=False,
+        file_client_args=file_client_args),
     dict(
         type='Resize',
-        img_scale=[(1200, 720), (1920, 1080)],
-        multiscale_mode='range',
+        img_scale=(1280, 720),
+        ratio_range=(0.75, 1.25),
         keep_ratio=True),
     dict(type='RandomFlip', flip_ratio=0.5),
     dict(type='Normalize', **img_norm_cfg),

@@ -135,10 +147,10 @@ train_pipeline = [
     dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
 ]
 test_pipeline = [
-    dict(type='LoadImageFromFile'),
+    dict(type='LoadImageFromFile', file_client_args=file_client_args),
     dict(
         type='MultiScaleFlipAug',
-        img_scale=(1600, 900),
+        img_scale=(1280, 720),
         flip=False,
         transforms=[
             dict(type='Resize', keep_ratio=True),

@@ -192,6 +204,6 @@ total_epochs = 12
 dist_params = dict(backend='nccl', port=29501)
 log_level = 'INFO'
 work_dir = './work_dirs/faster_rcnn_r50_fpn_1x'
-load_from = None
+load_from = './pretrain_mmdet/faster_r50_fpn_detectron2-caffe_freezeBN_l1-loss_roialign-v2_3x-4767dd8e.pth'  # noqa
 resume_from = None
 workflow = [('train', 1)]
File: configs/nus/faster_rcnn_regnet-3gf_fpn_2x8_1x_nus.py (new file, 0 → 100644; view file @ 191288eb)

# model settings
norm_cfg = dict(type='BN', requires_grad=False)
model = dict(
    type='FasterRCNN',
    pretrained='open-mmlab://regnetx_3.2gf',
    backbone=dict(
        type='RegNet',
        arch='regnetx_3.2gf',
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        base_channels=32,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='pytorch'),
    neck=dict(
        type='FPN',
        in_channels=[96, 192, 432, 1008],
        out_channels=256,
        num_outs=5),
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[8],
            ratios=[0.5, 1.0, 2.0],
            strides=[4, 8, 16, 32, 64]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
    roi_head=dict(
        type='StandardRoIHead',
        bbox_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', out_size=7, sample_num=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        bbox_head=dict(
            type='Shared2FCBBoxHead',
            in_channels=256,
            fc_out_channels=1024,
            roi_feat_size=7,
            num_classes=10,
            bbox_coder=dict(
                type='DeltaXYWHBBoxCoder',
                target_means=[0., 0., 0., 0.],
                target_stds=[0.1, 0.1, 0.2, 0.2]),
            reg_class_agnostic=False,
            loss_cls=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
            loss_bbox=dict(type='L1Loss', loss_weight=1.0))))
# model training and testing settings
train_cfg = dict(
    rpn=dict(
        assigner=dict(
            type='MaxIoUAssigner',
            pos_iou_thr=0.7,
            neg_iou_thr=0.3,
            min_pos_iou=0.3,
            ignore_iof_thr=-1),
        sampler=dict(
            type='RandomSampler',
            num=256,
            pos_fraction=0.5,
            neg_pos_ub=-1,
            add_gt_as_proposals=False),
        allowed_border=-1,
        pos_weight=-1,
        debug=False),
    rpn_proposal=dict(
        nms_across_levels=False,
        nms_pre=2000,
        # following the setting of detectron,
        # which improves ~0.2 bbox mAP.
        nms_post=1000,
        max_num=1000,
        nms_thr=0.7,
        min_bbox_size=0),
    rcnn=dict(
        assigner=dict(
            type='MaxIoUAssigner',
            pos_iou_thr=0.5,
            neg_iou_thr=0.5,
            min_pos_iou=0.5,
            ignore_iof_thr=-1),
        sampler=dict(
            type='RandomSampler',
            num=512,
            pos_fraction=0.25,
            neg_pos_ub=-1,
            add_gt_as_proposals=True),
        pos_weight=-1,
        debug=False))
test_cfg = dict(
    rpn=dict(
        nms_across_levels=False,
        nms_pre=1000,
        nms_post=1000,
        max_num=1000,
        nms_thr=0.7,
        min_bbox_size=0),
    rcnn=dict(
        score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100)
    # soft-nms is also supported for rcnn testing
    # e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05)
)
# dataset settings
dataset_type = 'CocoDataset'
data_root = 'data/nuscenes/'
classes = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle',
           'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier')
img_norm_cfg = dict(
    # The mean and std are used in PyCls when training RegNets
    mean=[103.53, 116.28, 123.675],
    std=[57.375, 57.12, 58.395],
    to_rgb=False)
file_client_args = dict(
    backend='petrel',
    path_mapping=dict({
        './data/nuscenes/': 's3://nuscenes/nuscenes/',
        'data/nuscenes/': 's3://nuscenes/nuscenes/'
    }))
train_pipeline = [
    dict(type='LoadImageFromFile', file_client_args=file_client_args),
    dict(
        type='LoadAnnotations',
        with_bbox=True,
        with_mask=False,
        file_client_args=file_client_args),
    dict(
        type='Resize',
        img_scale=(1280, 720),
        ratio_range=(0.75, 1.25),
        keep_ratio=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
]
test_pipeline = [
    dict(type='LoadImageFromFile', file_client_args=file_client_args),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1280, 720),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type=dataset_type,
        classes=classes,
        ann_file=data_root + 'nuscenes_infos_train.coco.json',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        classes=classes,
        ann_file=data_root + 'nuscenes_infos_val.coco.json',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        classes=classes,
        ann_file=data_root + 'nuscenes_infos_val.coco.json',
        pipeline=test_pipeline))
# optimizer
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00005)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=1000,
    warmup_ratio=1.0 / 1000,
    step=[8, 11])
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook')
    ])
# yapf:enable
evaluation = dict(interval=1)
# runtime settings
total_epochs = 12
dist_params = dict(backend='nccl', port=29501)
log_level = 'INFO'
work_dir = './work_dirs/faster_rcnn_r50_fpn_1x'
load_from = './pretrain_mmdet/mask_rcnn_regnetx-3GF_fpn_mstrain_3x_coco_box-AP-43.1_mask-AP-38.7-e003695a.pth'  # noqa
resume_from = None
workflow = [('train', 1)]
File: configs/nus/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d.py (view file @ 191288eb)

@@ -15,8 +15,8 @@ model = dict(
     ),
     pts_voxel_encoder=dict(
         type='HardVFE',
-        num_input_features=4,
-        num_filters=[64, 64],
+        in_channels=4,
+        feat_channels=[64, 64],
         with_distance=False,
         voxel_size=voxel_size,
         with_cluster_center=True,

@@ -85,9 +85,7 @@ model = dict(
             loss_weight=1.0),
         loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
-        loss_dir=dict(
-            type='CrossEntropyLoss',
-            use_sigmoid=False,
-            loss_weight=0.2),
-    ),
-)
+        loss_dir=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)))
 # model training and testing settings
 train_cfg = dict(
     pts=dict(

@@ -138,10 +136,23 @@ db_sampler = dict(
         trailer=4,
         truck=4,
     ))
+file_client_args = dict(backend='disk')
+# file_client_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/nuscenes/': 's3://nuscenes/nuscenes/',
+#         'data/nuscenes/': 's3://nuscenes/nuscenes/'
+#     }))
 train_pipeline = [
-    dict(type='LoadPointsFromFile', load_dim=5, use_dim=5),
-    dict(type='LoadPointsFromMultiSweeps', sweeps_num=10),
+    dict(
+        type='LoadPointsFromFile',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        file_client_args=file_client_args),
     dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
     dict(
         type='GlobalRotScale',

@@ -156,8 +167,15 @@ train_pipeline = [
     dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']),
 ]
 test_pipeline = [
-    dict(type='LoadPointsFromFile', load_dim=5, use_dim=5),
-    dict(type='LoadPointsFromMultiSweeps', sweeps_num=10),
+    dict(
+        type='LoadPointsFromFile',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        file_client_args=file_client_args),
     dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
     dict(type='RandomFlip3D', flip_ratio=0),
     dict(
File: mmdet3d/core/bbox/box_np_ops.py (view file @ 191288eb)

@@ -466,8 +466,8 @@ def get_frustum(bbox_image, C, near_clip=0.001, far_clip=100):
 def surface_equ_3d(polygon_surfaces):
     # return [a, b, c], d in ax+by+cz+d=0
     # polygon_surfaces: [num_polygon, num_surfaces, num_points_of_polygon, 3]
-    surface_vec = polygon_surfaces[:, :, :2, :] - polygon_surfaces[:, :, 1:3, :]
+    surface_vec = polygon_surfaces[:, :, :2, :] - \
+        polygon_surfaces[:, :, 1:3, :]
     # normal_vec: [..., 3]
     normal_vec = np.cross(surface_vec[:, :, 0, :], surface_vec[:, :, 1, :])
     # print(normal_vec.shape, points[..., 0, :].shape)
File: mmdet3d/models/roi_heads/bbox_heads/parta2_bbox_head.py (view file @ 191288eb)

@@ -121,8 +121,7 @@ class PartA2BboxHead(nn.Module):
                 3,
                 padding=1,
                 norm_cfg=norm_cfg,
-                indice_key=f'rcnn_down0',
-                conv_type='SubMConv3d'))
+                indice_key='rcnn_down0'))
         merge_conv_channel_last = channel
         down_conv_channel_last = merge_conv_channel_last

@@ -135,8 +134,7 @@ class PartA2BboxHead(nn.Module):
                 3,
                 padding=1,
                 norm_cfg=norm_cfg,
-                indice_key=f'rcnn_down1',
-                conv_type='SubMConv3d'))
+                indice_key='rcnn_down1'))
         down_conv_channel_last = channel
         self.conv_down.add_module(
             'merge_conv',
File: mmdet3d/models/voxel_encoders/__init__.py (view file @ 191288eb)

-from .pillar_encoder import AlignedPillarFeatureNet, PillarFeatureNet
-from .voxel_encoder import (DynamicVFE, VoxelFeatureExtractor,
-                            VoxelFeatureExtractorV2, VoxelFeatureExtractorV3)
+from .pillar_encoder import PillarFeatureNet
+from .voxel_encoder import DynamicSimpleVFE, DynamicVFE, HardSimpleVFE, HardVFE

-__all__ = [
-    'PillarFeatureNet', 'AlignedPillarFeatureNet', 'VoxelFeatureExtractor',
-    'DynamicVFE', 'VoxelFeatureExtractorV2', 'VoxelFeatureExtractorV3'
-]
+__all__ = [
+    'PillarFeatureNet', 'HardVFE', 'DynamicVFE', 'HardSimpleVFE',
+    'DynamicSimpleVFE'
+]
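All of these classes are registered in the VOXEL_ENCODERS registry (see the @VOXEL_ENCODERS.register_module() decorators in the diffs below), which is what lets the configs above refer to them by their type string. A rough sketch of how a config dict becomes a module, assuming mmdet3d's standard registry/builder pattern (the helper name here is illustrative, not the repo's exact builder):

    # Assumption: this mirrors mmdet3d's builder, which looks up cfg['type']
    # in VOXEL_ENCODERS and calls the class with the remaining keys as kwargs.
    from mmdet3d.models.registry import VOXEL_ENCODERS

    def build_voxel_encoder_sketch(cfg):
        cfg = dict(cfg)  # do not mutate the caller's config
        encoder_cls = VOXEL_ENCODERS.get(cfg.pop('type'))
        return encoder_cls(**cfg)

    encoder = build_voxel_encoder_sketch(dict(type='HardSimpleVFE'))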
File: mmdet3d/models/voxel_encoders/pillar_encoder.py (view file @ 191288eb)

@@ -9,55 +9,54 @@ from .utils import PFNLayer, get_paddings_indicator
 @VOXEL_ENCODERS.register_module()
 class PillarFeatureNet(nn.Module):
+    """Pillar Feature Net.
+
+    The network prepares the pillar features and performs forward pass
+    through PFNLayers.
+
+    Args:
+        in_channels (int). Number of input features,
+            either x, y, z or x, y, z, r.
+        feat_channels (list[int]). Number of features in each of the
+            N PFNLayers.
+        with_distance (bool). Whether to include Euclidean distance
+            to points.
+        voxel_size (list[float]). Size of voxels, only utilize x and y
+            size.
+        point_cloud_range (list[float]). Point cloud range, only
+            utilizes x and y min.
+    """

     def __init__(self,
-                 num_input_features=4,
-                 use_norm=True,
-                 num_filters=(64, ),
+                 in_channels=4,
+                 feat_channels=(64, ),
                  with_distance=False,
                  with_cluster_center=True,
                  with_voxel_center=True,
                  voxel_size=(0.2, 0.2, 4),
                  point_cloud_range=(0, -40, -3, 70.4, 40, 1),
                  norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
                  mode='max'):
-        """ Pillar Feature Net.
-        The network prepares the pillar features and performs forward pass
-        through PFNLayers.
-        Args:
-            num_input_features (int). Number of input features,
-                either x, y, z or x, y, z, r.
-            use_norm (bool). Whether to include BatchNorm.
-            num_filters (list[int]). Number of features in each of the
-                N PFNLayers.
-            with_distance (bool). Whether to include Euclidean distance
-                to points.
-            voxel_size (list[float]). Size of voxels, only utilize x and y
-                size.
-            point_cloud_range (list[float]). Point cloud range, only
-                utilize x and y min.
-        """
         super(PillarFeatureNet, self).__init__()
-        assert len(num_filters) > 0
+        assert len(feat_channels) > 0
         if with_cluster_center:
-            num_input_features += 3
+            in_channels += 3
         if with_voxel_center:
-            num_input_features += 2
+            in_channels += 2
         if with_distance:
-            num_input_features += 1
+            in_channels += 1
         self._with_distance = with_distance
         self._with_cluster_center = with_cluster_center
         self._with_voxel_center = with_voxel_center
         # Create PillarFeatureNet layers
-        self.num_input_features = num_input_features
-        num_filters = [num_input_features] + list(num_filters)
+        self.in_channels = in_channels
+        feat_channels = [in_channels] + list(feat_channels)
         pfn_layers = []
-        for i in range(len(num_filters) - 1):
-            in_filters = num_filters[i]
-            out_filters = num_filters[i + 1]
-            if i < len(num_filters) - 2:
+        for i in range(len(feat_channels) - 1):
+            in_filters = feat_channels[i]
+            out_filters = feat_channels[i + 1]
+            if i < len(feat_channels) - 2:
                 last_layer = False
             else:
                 last_layer = True

@@ -65,7 +64,7 @@ class PillarFeatureNet(nn.Module):
             PFNLayer(
                 in_filters,
                 out_filters,
-                use_norm,
+                norm_cfg=norm_cfg,
                 last_layer=last_layer,
                 mode=mode))
         self.pfn_layers = nn.ModuleList(pfn_layers)
@@ -122,9 +121,8 @@ class PillarFeatureNet(nn.Module):
 class DynamicPillarFeatureNet(PillarFeatureNet):

     def __init__(self,
-                 num_input_features=4,
-                 use_norm=True,
-                 num_filters=(64, ),
+                 in_channels=4,
+                 feat_channels=(64, ),
                  with_distance=False,
                  with_cluster_center=True,
                  with_voxel_center=True,

@@ -138,23 +136,23 @@ class DynamicPillarFeatureNet(PillarFeatureNet):
         """
         super(DynamicPillarFeatureNet, self).__init__(
-            num_input_features,
-            use_norm,
-            num_filters,
+            in_channels,
+            feat_channels,
             with_distance,
             with_cluster_center=with_cluster_center,
             with_voxel_center=with_voxel_center,
             voxel_size=voxel_size,
             point_cloud_range=point_cloud_range,
             norm_cfg=norm_cfg,
             mode=mode)
-        num_filters = [self.num_input_features] + list(num_filters)
+        feat_channels = [self.in_channels] + list(feat_channels)
         pfn_layers = []
         # TODO: currently only support one PFNLayer
-        for i in range(len(num_filters) - 1):
-            in_filters = num_filters[i]
-            out_filters = num_filters[i + 1]
+        for i in range(len(feat_channels) - 1):
+            in_filters = feat_channels[i]
+            out_filters = feat_channels[i + 1]
             if i > 0:
                 in_filters *= 2
             norm_name, norm_layer = build_norm_layer(norm_cfg, out_filters)
@@ -235,145 +233,3 @@ class DynamicPillarFeatureNet(PillarFeatureNet):
             features = torch.cat([point_feats, feat_per_point], dim=1)

         return voxel_feats, voxel_coors
-
-
-@VOXEL_ENCODERS.register_module()
-class AlignedPillarFeatureNet(nn.Module):
-
-    def __init__(self,
-                 num_input_features=4,
-                 use_norm=True,
-                 num_filters=(64, ),
-                 with_distance=False,
-                 with_cluster_center=True,
-                 with_voxel_center=True,
-                 voxel_size=(0.2, 0.2, 4),
-                 point_cloud_range=(0, -40, -3, 70.4, 40, 1),
-                 mode='max'):
-        """ Pillar Feature Net.
-        The network prepares the pillar features and performs forward pass
-        through PFNLayers.
-        Args:
-            num_input_features (int): Number of input features, either x, y, z
-                or x, y, z, r.
-            use_norm (bool): Whether to include BatchNorm.
-            num_filters (list[int]): Number of features in each of the N
-                PFNLayers.
-            with_distance (bool): Whether to include Euclidean distance to
-                points.
-            voxel_size (list[float]): Size of voxels, only utilize x and y
-                size.
-            point_cloud_range: (list[float]): Point cloud range, only
-                utilize x and y min.
-        """
-        super(AlignedPillarFeatureNet, self).__init__()
-        assert len(num_filters) > 0
-        if with_cluster_center:
-            print('Use cluster center')
-            num_input_features += 3
-        if with_voxel_center:
-            print('Use voxel center')
-            num_input_features += 2
-        if with_distance:
-            num_input_features += 1
-        self._with_distance = with_distance
-        self._with_cluster_center = with_cluster_center
-        self._with_voxel_center = with_voxel_center
-        # Create PillarFeatureNet layers
-        num_filters = [num_input_features] + list(num_filters)
-        pfn_layers = []
-        for i in range(len(num_filters) - 1):
-            in_filters = num_filters[i]
-            out_filters = num_filters[i + 1]
-            if i < len(num_filters) - 2:
-                last_layer = False
-            else:
-                last_layer = True
-            pfn_layers.append(
-                PFNLayer(
-                    in_filters,
-                    out_filters,
-                    use_norm,
-                    last_layer=last_layer,
-                    mode=mode))
-        self.pfn_layers = nn.ModuleList(pfn_layers)
-
-        # Need pillar (voxel) size and x/y offset in order to
-        # calculate pillar offset
-        self.vx = voxel_size[0]
-        self.vy = voxel_size[1]
-        self.vz = voxel_size[2]
-        self.x_offset = self.vx / 2 + point_cloud_range[0]
-        self.y_offset = self.vy / 2 + point_cloud_range[1]
-        self.z_offset = self.vz / 2 + point_cloud_range[2]
-
-    def forward(self, features, num_points, coors):
-        features_ls = [features]
-        # Find distance of x, y, and z from cluster center
-        if self._with_cluster_center:
-            points_mean = features[:, :, :3].sum(
-                dim=1, keepdim=True) / num_points.type_as(features).view(
-                    -1, 1, 1)
-            f_cluster = features[:, :, :3] - points_mean
-            features_ls.append(f_cluster)
-
-        x_distance = features[:, :, 0] - (
-            coors[:, 3].type_as(features).unsqueeze(1) * self.vx +
-            self.x_offset)
-        y_distance = features[:, :, 1] - (
-            coors[:, 2].type_as(features).unsqueeze(1) * self.vy +
-            self.y_offset)
-        z_distance = features[:, :, 2] - (
-            coors[:, 1].type_as(features).unsqueeze(1) * self.vz +
-            self.z_offset)
-        normed_x_distance = 1 - torch.abs(x_distance / self.vx)
-        normed_y_distance = 1 - torch.abs(y_distance / self.vy)
-        normed_z_distance = 1 - torch.abs(z_distance / self.vz)
-        x_mask = torch.gt(normed_x_distance, 0).type_as(features)
-        y_mask = torch.gt(normed_y_distance, 0).type_as(features)
-        z_mask = torch.gt(normed_z_distance, 0).type_as(features)
-        nonzero_points_mask = x_mask.mul(y_mask).mul(z_mask)
-        aligned_distance = normed_x_distance.mul(normed_y_distance).mul(
-            normed_z_distance).mul(nonzero_points_mask)
-
-        # Find distance of x, y, and z from pillar center
-        if self._with_voxel_center:
-            f_center = features[:, :, :2]
-            f_center[:, :, 0] = f_center[:, :, 0] - (
-                coors[:, 3].type_as(features).unsqueeze(1) * self.vx +
-                self.x_offset)
-            f_center[:, :, 1] = f_center[:, :, 1] - (
-                coors[:, 2].type_as(features).unsqueeze(1) * self.vy +
-                self.y_offset)
-            features_ls.append(f_center)
-
-        if self._with_distance:
-            points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True)
-            features_ls.append(points_dist)
-
-        # Combine together feature decorations
-        features = torch.cat(features_ls, dim=-1)
-        # The feature decorations were calculated without regard to
-        # whether pillar was empty. Need to ensure that
-        # empty pillars remain set to zeros.
-        voxel_count = features.shape[1]
-        mask = get_paddings_indicator(num_points, voxel_count, axis=0)
-        mask = torch.unsqueeze(mask, -1).type_as(features)
-        features *= mask
-        for pfn in self.pfn_layers:
-            if pfn.last_vfe:
-                features = pfn(features, aligned_distance)
-            else:
-                features = pfn(features)
-
-        return features.squeeze()
File: mmdet3d/models/voxel_encoders/utils.py (view file @ 191288eb)

@@ -4,28 +4,15 @@ from torch import nn
 from torch.nn import functional as F


-class Empty(nn.Module):
-
-    def __init__(self, *args, **kwargs):
-        super(Empty, self).__init__()
-
-    def forward(self, *args, **kwargs):
-        if len(args) == 1:
-            return args[0]
-        elif len(args) == 0:
-            return None
-        return args
-
-
 def get_paddings_indicator(actual_num, max_num, axis=0):
     """Create boolean mask by actual number of a padded tensor.

     Args:
-        actual_num ([type]): [description]
-        max_num ([type]): [description]
+        actual_num (torch.Tensor): Actual number of points in each voxel.
+        max_num (int): Max number of points in each voxel

     Returns:
-        [type]: [description]
+        torch.Tensor: Mask indicates which points are valid inside a voxel.
     """
     actual_num = torch.unsqueeze(actual_num, axis + 1)
     # tiled_actual_num: [N, M, 1]
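The rewritten docstring pins down the contract. As an illustration, a re-implementation of that contract behaves like this (a sketch, not necessarily the repo's exact body, which is truncated in this view; its first line does match the one shown above):

    import torch

    def get_paddings_indicator_sketch(actual_num, max_num, axis=0):
        # Mark the first actual_num[i] slots of each padded voxel as valid.
        actual_num = torch.unsqueeze(actual_num, axis + 1)  # [N] -> [N, 1]
        arange = torch.arange(
            max_num, dtype=torch.int, device=actual_num.device)
        shape = [1] * len(actual_num.shape)
        shape[axis + 1] = -1
        return actual_num.int() > arange.view(shape)        # [N, M] bool

    mask = get_paddings_indicator_sketch(torch.tensor([2, 4]), 4)
    # tensor([[ True,  True, False, False],
    #         [ True,  True,  True,  True]])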
@@ -52,13 +39,9 @@ class VFELayer(nn.Module):
         self.cat_max = cat_max
         self.max_out = max_out
         # self.units = int(out_channels / 2)

-        if norm_cfg:
-            norm_name, norm_layer = build_norm_layer(norm_cfg, out_channels)
-            self.norm = norm_layer
-            self.linear = nn.Linear(in_channels, out_channels, bias=False)
-        else:
-            self.norm = Empty(out_channels)
-            self.linear = nn.Linear(in_channels, out_channels, bias=True)
+        self.norm = build_norm_layer(norm_cfg, out_channels)[1]
+        self.linear = nn.Linear(in_channels, out_channels, bias=False)

     def forward(self, inputs):
         # [K, T, 7] tensordot [7, units] = [K, T, units]
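The [1] index above is not arbitrary: mmcv's build_norm_layer returns a (name, module) pair, where the name is a string identifier and the module is the constructed norm layer. For illustration:

    from mmcv.cnn import build_norm_layer

    # Returns a (name, module) tuple; index [1] keeps just the norm layer.
    name, norm = build_norm_layer(
        dict(type='BN1d', eps=1e-3, momentum=0.01), 64)
    assert norm.num_features == 64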
@@ -89,7 +72,7 @@ class PFNLayer(nn.Module):
     def __init__(self,
                  in_channels,
                  out_channels,
-                 use_norm=True,
+                 norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
                  last_layer=False,
                  mode='max'):
         """ Pillar Feature Net Layer.

@@ -100,9 +83,11 @@ class PFNLayer(nn.Module):
         Args:
             in_channels (int): Number of input channels.
             out_channels (int): Number of output channels.
-            use_norm (bool): Whether to include BatchNorm.
+            norm_cfg (dict): Config dict of normalization layers
             last_layer (bool): If last_layer, there is no concatenation of
                 features.
+            mode (str): Pooling mode to gather features inside voxels.
+                Default to 'max'.
         """
         super().__init__()

@@ -112,13 +97,10 @@ class PFNLayer(nn.Module):
             out_channels = out_channels // 2
         self.units = out_channels
-        if use_norm:
-            self.norm = nn.BatchNorm1d(self.units, eps=1e-3, momentum=0.01)
-            self.linear = nn.Linear(in_channels, self.units, bias=False)
-        else:
-            self.norm = Empty(self.unints)
-            self.linear = nn.Linear(in_channels, self.units, bias=True)
+        self.norm = build_norm_layer(norm_cfg, self.units)[1]
+        self.linear = nn.Linear(in_channels, self.units, bias=False)

         assert mode in ['max', 'avg']
         self.mode = mode

     def forward(self, inputs, num_voxels=None, aligned_distance=None):
File: mmdet3d/models/voxel_encoders/voxel_encoder.py (view file @ 191288eb)

 import torch
 from mmcv.cnn import build_norm_layer
 from torch import nn
 from torch.nn import functional as F

 from mmdet3d.ops import DynamicScatter
 from .. import builder
 from ..registry import VOXEL_ENCODERS
-from .utils import Empty, VFELayer, get_paddings_indicator
+from .utils import VFELayer, get_paddings_indicator


 @VOXEL_ENCODERS.register_module()
-class VoxelFeatureExtractor(nn.Module):
-
-    def __init__(self,
-                 num_input_features=4,
-                 use_norm=True,
-                 num_filters=[32, 128],
-                 with_distance=False,
-                 name='VoxelFeatureExtractor'):
-        super(VoxelFeatureExtractor, self).__init__()
-        self.name = name
-        assert len(num_filters) == 2
-        num_input_features += 3  # add mean features
-        if with_distance:
-            num_input_features += 1
-        self._with_distance = with_distance
-        self.vfe1 = VFELayer(num_input_features, num_filters[0], use_norm)
-        self.vfe2 = VFELayer(num_filters[0], num_filters[1], use_norm)
-        if use_norm:
-            self.linear = nn.Linear(num_filters[1], num_filters[1], bias=False)
-            self.norm = nn.BatchNorm1d(num_filters[1], eps=1e-3, momentum=0.01)
-        else:
-            self.linear = nn.Linear(num_filters[1], num_filters[1], bias=True)
-            self.norm = Empty(num_filters[1])
-
-    def forward(self, features, num_voxels, **kwargs):
-        # features: [concated_num_points, num_voxel_size, 3(4)]
-        # num_voxels: [concated_num_points]
-        # t = time.time()
-        # torch.cuda.synchronize()
-        points_mean = features[:, :, :3].sum(
-            dim=1, keepdim=True) / num_voxels.type_as(features).view(-1, 1, 1)
-        features_relative = features[:, :, :3] - points_mean
-        if self._with_distance:
-            points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True)
-            features = torch.cat([features, features_relative, points_dist],
-                                 dim=-1)
-        else:
-            features = torch.cat([features, features_relative], dim=-1)
-        voxel_count = features.shape[1]
-        mask = get_paddings_indicator(num_voxels, voxel_count, axis=0)
-        mask = torch.unsqueeze(mask, -1).type_as(features)
-        # mask = features.max(dim=2, keepdim=True)[0] != 0
-        # torch.cuda.synchronize()
-        # print("vfe prep forward time", time.time() - t)
-        x = self.vfe1(features)
-        x *= mask
-        x = self.vfe2(x)
-        x *= mask
-        x = self.linear(x)
-        x = self.norm(x.permute(0, 2, 1).contiguous()).permute(
-            0, 2, 1).contiguous()
-        x = F.relu(x)
-        x *= mask
-        # x: [concated_num_points, num_voxel_size, 128]
-        voxelwise = torch.max(x, dim=1)[0]
-        return voxelwise
-
-
-@VOXEL_ENCODERS.register_module()
-class VoxelFeatureExtractorV2(nn.Module):
-
-    def __init__(self,
-                 num_input_features=4,
-                 use_norm=True,
-                 num_filters=[32, 128],
-                 with_distance=False,
-                 name='VoxelFeatureExtractor'):
-        super(VoxelFeatureExtractorV2, self).__init__()
-        self.name = name
-        assert len(num_filters) > 0
-        num_input_features += 3
-        if with_distance:
-            num_input_features += 1
-        self._with_distance = with_distance
-        num_filters = [num_input_features] + num_filters
-        filters_pairs = [[num_filters[i], num_filters[i + 1]]
-                         for i in range(len(num_filters) - 1)]
-        self.vfe_layers = nn.ModuleList(
-            [VFELayer(i, o, use_norm) for i, o in filters_pairs])
-        if use_norm:
-            self.linear = nn.Linear(num_filters[-1], num_filters[-1],
-                                    bias=False)
-            self.norm = nn.BatchNorm1d(num_filters[-1], eps=1e-3,
-                                       momentum=0.01)
-        else:
-            self.linear = nn.Linear(num_filters[-1], num_filters[-1],
-                                    bias=True)
-            self.norm = Empty(num_filters[-1])
-
-    def forward(self, features, num_voxels, **kwargs):
-        # features: [concated_num_points, num_voxel_size, 3(4)]
-        # num_voxels: [concated_num_points]
-        points_mean = features[:, :, :3].sum(
-            dim=1, keepdim=True) / num_voxels.type_as(features).view(-1, 1, 1)
-        features_relative = features[:, :, :3] - points_mean
-        if self._with_distance:
-            points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True)
-            features = torch.cat([features, features_relative, points_dist],
-                                 dim=-1)
-        else:
-            features = torch.cat([features, features_relative], dim=-1)
-        voxel_count = features.shape[1]
-        mask = get_paddings_indicator(num_voxels, voxel_count, axis=0)
-        mask = torch.unsqueeze(mask, -1).type_as(features)
-        for vfe in self.vfe_layers:
-            features = vfe(features)
-            features *= mask
-        features = self.linear(features)
-        features = self.norm(features.permute(0, 2, 1).contiguous()).permute(
-            0, 2, 1).contiguous()
-        features = F.relu(features)
-        features *= mask
-        # x: [concated_num_points, num_voxel_size, 128]
-        voxelwise = torch.max(features, dim=1)[0]
-        return voxelwise
-
-
-@VOXEL_ENCODERS.register_module()
-class VoxelFeatureExtractorV3(nn.Module):
-
-    def __init__(self,
-                 num_input_features=4,
-                 use_norm=True,
-                 num_filters=[32, 128],
-                 with_distance=False,
-                 name='VoxelFeatureExtractor'):
-        super(VoxelFeatureExtractorV3, self).__init__()
-        self.name = name
+class HardSimpleVFE(nn.Module):
+    """Simple voxel feature encoder used in SECOND.
+
+    It simply averages the values of points in a voxel.
+    """
+
+    def __init__(self):
+        super(HardSimpleVFE, self).__init__()

     def forward(self, features, num_points, coors):
         # features: [concated_num_points, num_voxel_size, 3(4)]
@@ -153,13 +27,21 @@ class VoxelFeatureExtractorV3(nn.Module):
 @VOXEL_ENCODERS.register_module()
-class DynamicVFEV3(nn.Module):
+class DynamicSimpleVFE(nn.Module):
+    """Simple dynamic voxel feature encoder used in DV-SECOND.
+
+    It simply averages the values of points in a voxel.
+    But the number of points in a voxel is dynamic and varies.
+
+    Args:
+        voxel_size (tuple[float]): Size of a single voxel
+        point_cloud_range (tuple[float]): Range of the point cloud and voxels
+    """

     def __init__(self,
-                 num_input_features=4,
                  voxel_size=(0.2, 0.2, 4),
                  point_cloud_range=(0, -40, -3, 70.4, 40, 1)):
-        super(DynamicVFEV3, self).__init__()
+        super(DynamicSimpleVFE, self).__init__()
         self.scatter = DynamicScatter(voxel_size, point_cloud_range, True)

     @torch.no_grad()
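The forward body of DynamicSimpleVFE is collapsed in this view. Since the constructor only builds a DynamicScatter whose trailing True argument requests point averaging, a plausible sketch of the hidden method (an assumption; the lines are not expanded here) is a single scatter call:

    # Assumed shape of the collapsed forward: average per-point features into
    # their voxels and return the voxel features with their coordinates.
    @torch.no_grad()
    def forward(self, features, coors):
        voxel_feats, voxel_coors = self.scatter(features, coors)
        return voxel_feats, voxel_coors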
@@ -172,10 +54,37 @@ class DynamicVFEV3(nn.Module):
 @VOXEL_ENCODERS.register_module()
 class DynamicVFE(nn.Module):
+    """Dynamic Voxel feature encoder used in DV-SECOND.
+
+    It encodes features of voxels and their points. It could also fuse
+    image features into voxel features in a point-wise manner.
+    The number of points inside the voxel varies.
+
+    Args:
+        in_channels (int): Input channels of VFE. Defaults to 4.
+        feat_channels (list(int)): Channels of features in VFE.
+        with_distance (bool): Whether to use the L2 distance of points to the
+            origin point. Default False.
+        with_cluster_center (bool): Whether to use the distance to cluster
+            center of points inside a voxel. Default to False.
+        with_voxel_center (bool): Whether to use the distance to center of
+            voxel for each point inside a voxel. Default to False.
+        voxel_size (tuple[float]): Size of a single voxel. Default to
+            (0.2, 0.2, 4).
+        point_cloud_range (tuple[float]): The range of points or voxels.
+            Default to (0, -40, -3, 70.4, 40, 1).
+        norm_cfg (dict): Config dict of normalization layers.
+        mode (str): The mode when pooling features of points inside a voxel.
+            Available options include 'max' and 'avg'. Default to 'max'.
+        fusion_layer (dict | None): The config dict of fusion layer used in
+            multi-modal detectors. Default to None.
+        return_point_feats (bool): Whether to return the features of each
+            point. Default to False.
+    """

     def __init__(self,
-                 num_input_features=4,
-                 num_filters=[],
+                 in_channels=4,
+                 feat_channels=[],
                  with_distance=False,
                  with_cluster_center=False,
                  with_voxel_center=False,

@@ -186,14 +95,15 @@ class DynamicVFE(nn.Module):
                  fusion_layer=None,
                  return_point_feats=False):
         super(DynamicVFE, self).__init__()
-        assert len(num_filters) > 0
         assert mode in ['avg', 'max']
+        assert len(feat_channels) > 0
         if with_cluster_center:
-            num_input_features += 3
+            in_channels += 3
         if with_voxel_center:
-            num_input_features += 3
+            in_channels += 3
         if with_distance:
-            num_input_features += 3
-        self.num_input_features = num_input_features
+            in_channels += 3
+        self.in_channels = in_channels
         self._with_distance = with_distance
         self._with_cluster_center = with_cluster_center
         self._with_voxel_center = with_voxel_center

@@ -209,11 +119,11 @@ class DynamicVFE(nn.Module):
         self.point_cloud_range = point_cloud_range
         self.scatter = DynamicScatter(voxel_size, point_cloud_range, True)

-        num_filters = [self.num_input_features] + list(num_filters)
+        feat_channels = [self.in_channels] + list(feat_channels)
         vfe_layers = []
-        for i in range(len(num_filters) - 1):
-            in_filters = num_filters[i]
-            out_filters = num_filters[i + 1]
+        for i in range(len(feat_channels) - 1):
+            in_filters = feat_channels[i]
+            out_filters = feat_channels[i + 1]
             if i > 0:
                 in_filters *= 2
             norm_name, norm_layer = build_norm_layer(norm_cfg, out_filters)

@@ -232,6 +142,16 @@ class DynamicVFE(nn.Module):
         self.fusion_layer = builder.build_fusion_layer(fusion_layer)

     def map_voxel_center_to_point(self, pts_coors, voxel_mean, voxel_coors):
+        """Map voxel features to its corresponding points.
+
+        Args:
+            pts_coors (torch.Tensor): Voxel coordinate of each point.
+            voxel_mean (torch.Tensor): Voxel features to be mapped.
+            voxel_coors (torch.Tensor): Coordinates of valid voxels
+
+        Returns:
+            torch.Tensor: Features or centers of each point.
+        """
         # Step 1: scatter voxel into canvas
         # Calculate necessary things for canvas creation
         canvas_z = int(

@@ -269,9 +189,21 @@ class DynamicVFE(nn.Module):
                 points=None,
                 img_feats=None,
                 img_meta=None):
-        """
-        features (torch.Tensor): NxC
-        coors (torch.Tensor): Nx(1+NDim)
+        """Forward function.
+
+        Args:
+            features (torch.Tensor): Features of voxels, shape is NxC.
+            coors (torch.Tensor): Coordinates of voxels, shape is Nx(1+NDim).
+            points (list[torch.Tensor], optional): Raw points used to guide
+                the multi-modality fusion. Defaults to None.
+            img_feats (list[torch.Tensor], optional): Image features used for
+                multi-modality fusion. Defaults to None.
+            img_meta (dict, optional): [description]. Defaults to None.
+
+        Returns:
+            tuple: If `return_point_feats` is False, returns voxel features
+                and its coordinates. If `return_point_feats` is True, returns
+                features of each point inside voxels.
         """
         features_ls = [features]
         # Find distance of x, y, and z from cluster center
@@ -320,10 +252,36 @@ class DynamicVFE(nn.Module):
 @VOXEL_ENCODERS.register_module()
 class HardVFE(nn.Module):
+    """Voxel feature encoder used in DV-SECOND.
+
+    It encodes features of voxels and their points. It could also fuse
+    image features into voxel features in a point-wise manner.
+
+    Args:
+        in_channels (int): Input channels of VFE. Defaults to 4.
+        feat_channels (list(int)): Channels of features in VFE.
+        with_distance (bool): Whether to use the L2 distance of points to the
+            origin point. Default False.
+        with_cluster_center (bool): Whether to use the distance to cluster
+            center of points inside a voxel. Default to False.
+        with_voxel_center (bool): Whether to use the distance to center of
+            voxel for each point inside a voxel. Default to False.
+        voxel_size (tuple[float]): Size of a single voxel. Default to
+            (0.2, 0.2, 4).
+        point_cloud_range (tuple[float]): The range of points or voxels.
+            Default to (0, -40, -3, 70.4, 40, 1).
+        norm_cfg (dict): Config dict of normalization layers.
+        mode (str): The mode when pooling features of points inside a voxel.
+            Available options include 'max' and 'avg'. Default to 'max'.
+        fusion_layer (dict | None): The config dict of fusion layer used in
+            multi-modal detectors. Default to None.
+        return_point_feats (bool): Whether to return the features of each
+            point. Default to False.
+    """

     def __init__(self,
-                 num_input_features=4,
-                 num_filters=[],
+                 in_channels=4,
+                 feat_channels=[],
                  with_distance=False,
                  with_cluster_center=False,
                  with_voxel_center=False,

@@ -334,14 +292,14 @@ class HardVFE(nn.Module):
                  fusion_layer=None,
                  return_point_feats=False):
         super(HardVFE, self).__init__()
-        assert len(num_filters) > 0
+        assert len(feat_channels) > 0
         if with_cluster_center:
-            num_input_features += 3
+            in_channels += 3
         if with_voxel_center:
-            num_input_features += 3
+            in_channels += 3
         if with_distance:
-            num_input_features += 3
-        self.num_input_features = num_input_features
+            in_channels += 3
+        self.in_channels = in_channels
         self._with_distance = with_distance
         self._with_cluster_center = with_cluster_center
         self._with_voxel_center = with_voxel_center

@@ -357,16 +315,16 @@ class HardVFE(nn.Module):
         self.point_cloud_range = point_cloud_range
         self.scatter = DynamicScatter(voxel_size, point_cloud_range, True)

-        num_filters = [self.num_input_features] + list(num_filters)
+        feat_channels = [self.in_channels] + list(feat_channels)
         vfe_layers = []
-        for i in range(len(num_filters) - 1):
-            in_filters = num_filters[i]
-            out_filters = num_filters[i + 1]
+        for i in range(len(feat_channels) - 1):
+            in_filters = feat_channels[i]
+            out_filters = feat_channels[i + 1]
             if i > 0:
                 in_filters *= 2
             # TODO: pass norm_cfg to VFE
             # norm_name, norm_layer = build_norm_layer(norm_cfg, out_filters)
-            if i == (len(num_filters) - 2):
+            if i == (len(feat_channels) - 2):
                 cat_max = False
                 max_out = True
                 if fusion_layer:

@@ -394,9 +352,20 @@ class HardVFE(nn.Module):
                 coors,
                 img_feats=None,
                 img_meta=None):
-        """
-        features (torch.Tensor): NxMxC
-        coors (torch.Tensor): Nx(1+NDim)
+        """Forward function.
+
+        Args:
+            features (torch.Tensor): Features of voxels, shape is MxNxC.
+            num_points (torch.Tensor): Number of points in each voxel.
+            coors (torch.Tensor): Coordinates of voxels, shape is Mx(1+NDim).
+            img_feats (list[torch.Tensor], optional): Image features used for
+                multi-modality fusion. Defaults to None.
+            img_meta (dict, optional): [description]. Defaults to None.
+
+        Returns:
+            tuple: If `return_point_feats` is False, returns voxel features
+                and its coordinates. If `return_point_feats` is True, returns
+                features of each point inside voxels.
         """
         features_ls = [features]
         # Find distance of x, y, and z from cluster center
@@ -438,19 +407,29 @@ class HardVFE(nn.Module):
         for i, vfe in enumerate(self.vfe_layers):
             voxel_feats = vfe(voxel_feats)
-            if torch.isnan(voxel_feats).any():
-                import pdb
-                pdb.set_trace()

         if (self.fusion_layer is not None and img_feats is not None):
             voxel_feats = self.fusion_with_mask(features, mask, voxel_feats,
                                                 coors, img_feats, img_meta)
-        if torch.isnan(voxel_feats).any():
-            import pdb
-            pdb.set_trace()

         return voxel_feats

     def fusion_with_mask(self, features, mask, voxel_feats, coors, img_feats,
                          img_meta):
+        """Fuse image and point features with mask.
+
+        Args:
+            features (torch.Tensor): Features of voxel, usually it is the
+                values of points in voxels.
+            mask (torch.Tensor): Mask indicates valid features in each voxel.
+            voxel_feats (torch.Tensor): Features of voxels.
+            coors (torch.Tensor): Coordinates of each single voxel.
+            img_feats (list[torch.Tensor]): Multi-scale feature maps of image.
+            img_meta (list(dict)): Meta information of image and points.
+
+        Returns:
+            torch.Tensor: Fused features of each voxel.
+        """
         # the features consist of a batch of points
         batch_size = coors[-1, 0] + 1
         points = []

@@ -459,20 +438,13 @@ class HardVFE(nn.Module):
             points.append(features[single_mask][mask[single_mask]])

         point_feats = voxel_feats[mask]
-        if torch.isnan(point_feats).any():
-            import pdb
-            pdb.set_trace()
         point_feats = self.fusion_layer(img_feats, points, point_feats,
                                         img_meta)
-        if torch.isnan(point_feats).any():
-            import pdb
-            pdb.set_trace()

         voxel_canvas = voxel_feats.new_zeros(
             size=(voxel_feats.size(0), voxel_feats.size(1),
                   point_feats.size(-1)))
         voxel_canvas[mask] = point_feats
         out = torch.max(voxel_canvas, dim=1)[0]
-        if torch.isnan(out).any():
-            import pdb
-            pdb.set_trace()

         return out