Unverified Commit 1376e610 authored by Jiajun Deng's avatar Jiajun Deng Committed by GitHub
Browse files

Add dynamic voxelization and config for voxel r-cnn on Waymo Open Dataset (#760)

* add dynamic pillar vfe

* make the upperbound unaccessible

* add place holder for voxel generation

* add DynPillarVFE

* add PFNLayerV2

* add try except for torch_scatter package

* add dynamic pillar in readme

* add the cfg file of centerpoint with dynamic pillar

* recover original mask_points_by_range

* masking points with points_coords out of grid_size

* add dynamic voxelization and config for voxel_rcnn
parent c47a94bc
...@@ -152,6 +152,7 @@ By default, all models are trained with **a single frame** of **20% data (~32k f ...@@ -152,6 +152,7 @@ By default, all models are trained with **a single frame** of **20% data (~32k f
| [Part-A2-Anchor](tools/cfgs/waymo_models/PartA2.yaml) | 74.66/74.12 |65.82/65.32 |71.71/62.24 |62.46/54.06 |66.53/65.18 |64.05/62.75 | | [Part-A2-Anchor](tools/cfgs/waymo_models/PartA2.yaml) | 74.66/74.12 |65.82/65.32 |71.71/62.24 |62.46/54.06 |66.53/65.18 |64.05/62.75 |
| [PV-RCNN (AnchorHead)](tools/cfgs/waymo_models/pv_rcnn.yaml) | 75.41/74.74 |67.44/66.80 |71.98/61.24 |63.70/53.95 |65.88/64.25 |63.39/61.82 | | [PV-RCNN (AnchorHead)](tools/cfgs/waymo_models/pv_rcnn.yaml) | 75.41/74.74 |67.44/66.80 |71.98/61.24 |63.70/53.95 |65.88/64.25 |63.39/61.82 |
| [PV-RCNN (CenterHead)](tools/cfgs/waymo_models/pv_rcnn_with_centerhead_rpn.yaml) | 75.95/75.43 |68.02/67.54 |75.94/69.40 |67.66/61.62 |70.18/68.98 |67.73/66.57| | [PV-RCNN (CenterHead)](tools/cfgs/waymo_models/pv_rcnn_with_centerhead_rpn.yaml) | 75.95/75.43 |68.02/67.54 |75.94/69.40 |67.66/61.62 |70.18/68.98 |67.73/66.57|
| [Voxel R-CNN (CenterHead)-Dynamic-Voxel](tools/cfgs/waymo_models/voxel_rcnn_with_centerhead_dyn_voxel.yaml) | 76.13/75.66 |68.18/67.74 |78.20/71.98 |69.29/63.59 | 70.75/69.68 |68.25/67.21|
| [PV-RCNN++](tools/cfgs/waymo_models/pv_rcnn_plusplus.yaml) | 77.82/77.32| 69.07/68.62| 77.99/71.36| 69.92/63.74| 71.80/70.71| 69.31/68.26| | [PV-RCNN++](tools/cfgs/waymo_models/pv_rcnn_plusplus.yaml) | 77.82/77.32| 69.07/68.62| 77.99/71.36| 69.92/63.74| 71.80/70.71| 69.31/68.26|
| [PV-RCNN++ (ResNet)](tools/cfgs/waymo_models/pv_rcnn_plusplus_resnet.yaml) |77.61/77.14| 69.18/68.75| 79.42/73.31| 70.88/65.21| 72.50/71.39| 69.84/68.77| | [PV-RCNN++ (ResNet)](tools/cfgs/waymo_models/pv_rcnn_plusplus_resnet.yaml) |77.61/77.14| 69.18/68.75| 79.42/73.31| 70.88/65.21| 72.50/71.39| 69.84/68.77|
......
from .mean_vfe import MeanVFE from .mean_vfe import MeanVFE
from .pillar_vfe import PillarVFE from .pillar_vfe import PillarVFE
from .dynamic_mean_vfe import DynamicMeanVFE
from .dynamic_pillar_vfe import DynamicPillarVFE from .dynamic_pillar_vfe import DynamicPillarVFE
from .image_vfe import ImageVFE from .image_vfe import ImageVFE
from .vfe_template import VFETemplate from .vfe_template import VFETemplate
...@@ -9,5 +10,6 @@ __all__ = { ...@@ -9,5 +10,6 @@ __all__ = {
'MeanVFE': MeanVFE, 'MeanVFE': MeanVFE,
'PillarVFE': PillarVFE, 'PillarVFE': PillarVFE,
'ImageVFE': ImageVFE, 'ImageVFE': ImageVFE,
'DynPillarVFE': DynamicPillarVFE 'DynMeanVFE': DynamicMeanVFE,
'DynPillarVFE': DynamicPillarVFE,
} }
import torch
from .vfe_template import VFETemplate
try:
import torch_scatter
except Exception as e:
# In case someone doesn't want to use the dynamic pillar VFE and hasn't installed torch_scatter
pass
from .vfe_template import VFETemplate
class DynamicMeanVFE(VFETemplate):
    """Mean-pooling voxel feature encoder with dynamic voxelization.

    Instead of consuming a pre-voxelized (num_voxels, max_points, C) tensor,
    this module voxelizes the raw point cloud on the fly: each point is hashed
    into a single flat voxel key, and per-voxel mean features are computed via
    ``torch_scatter.scatter_mean``. Requires the optional ``torch_scatter``
    package at forward time.
    """

    def __init__(self, model_cfg, num_point_features, voxel_size, grid_size, point_cloud_range, **kwargs):
        """
        Args:
            model_cfg: VFE section of the model config.
            num_point_features: number of per-point feature channels; this is
                also the output dimension, since the encoder only averages.
            voxel_size: (vx, vy, vz) voxel edge lengths.
            grid_size: (nx, ny, nz) voxel counts along each axis.
            point_cloud_range: (x_min, y_min, z_min, x_max, y_max, z_max).
        """
        super().__init__(model_cfg=model_cfg)
        self.num_point_features = num_point_features
        # NOTE(review): tensors are created directly on CUDA, so this module
        # assumes GPU execution — confirm against project usage.
        self.grid_size = torch.tensor(grid_size).cuda()
        self.voxel_size = torch.tensor(voxel_size).cuda()
        self.point_cloud_range = torch.tensor(point_cloud_range).cuda()

        self.voxel_x = voxel_size[0]
        self.voxel_y = voxel_size[1]
        self.voxel_z = voxel_size[2]
        # Offsets from the grid origin to the first voxel center
        # (computed here but not used in forward).
        self.x_offset = self.voxel_x / 2 + point_cloud_range[0]
        self.y_offset = self.voxel_y / 2 + point_cloud_range[1]
        self.z_offset = self.voxel_z / 2 + point_cloud_range[2]

        # Strides for flattening (batch, x, y, z) voxel coordinates into one
        # scalar key: key = b*scale_xyz + x*scale_yz + y*scale_z + z.
        self.scale_xyz = grid_size[0] * grid_size[1] * grid_size[2]
        self.scale_yz = grid_size[1] * grid_size[2]
        self.scale_z = grid_size[2]

    def get_output_feature_dim(self):
        # Averaging preserves the channel count, so output dim == input dim.
        return self.num_point_features

    @torch.no_grad()
    def forward(self, batch_dict, **kwargs):
        """Voxelize points and compute per-voxel mean features.

        Args:
            batch_dict:
                batch_size: int
                points: (N, 1 + C) tensor whose columns are
                    (batch_idx, x, y, z, ...remaining feature channels...)
            **kwargs: unused.
        Returns:
            batch_dict with added keys:
                voxel_features: (num_voxels, C) mean of point features per voxel
                voxel_coords: (num_voxels, 4) int coords as (batch_idx, z, y, x)
        """
        batch_size = batch_dict['batch_size']
        points = batch_dict['points'] # (batch_idx, x, y, z, i, e)

        # Integer voxel index of each point along (x, y, z).
        point_coords = torch.floor((points[:, 1:4] - self.point_cloud_range[0:3]) / self.voxel_size).int()
        # Discard points whose voxel index falls outside the grid.
        mask = ((point_coords >= 0) & (point_coords < self.grid_size)).all(dim=1)
        points = points[mask]
        point_coords = point_coords[mask]
        # Flatten (batch, x, y, z) into one scalar key per point so that
        # points sharing a voxel get identical keys.
        merge_coords = points[:, 0].int() * self.scale_xyz + \
                       point_coords[:, 0] * self.scale_yz + \
                       point_coords[:, 1] * self.scale_z + \
                       point_coords[:, 2]
        points_data = points[:, 1:].contiguous()

        # unq_inv maps each point to its voxel's position in unq_coords.
        unq_coords, unq_inv, unq_cnt = torch.unique(merge_coords, return_inverse=True, return_counts=True)

        # Average all point features that share a voxel key.
        points_mean = torch_scatter.scatter_mean(points_data, unq_inv, dim=0)

        unq_coords = unq_coords.int()
        # Decode the flat keys back into (batch, x, y, z) columns...
        voxel_coords = torch.stack((unq_coords // self.scale_xyz,
                                    (unq_coords % self.scale_xyz) // self.scale_yz,
                                    (unq_coords % self.scale_yz) // self.scale_z,
                                    unq_coords % self.scale_z), dim=1)
        # ...then reorder columns to the (batch_idx, z, y, x) convention.
        voxel_coords = voxel_coords[:, [0, 3, 2, 1]]

        batch_dict['voxel_features'] = points_mean.contiguous()
        batch_dict['voxel_coords'] = voxel_coords.contiguous()
        return batch_dict
CLASS_NAMES: ['Vehicle', 'Pedestrian', 'Cyclist']
DATA_CONFIG:
_BASE_CONFIG_: cfgs/dataset_configs/waymo_dataset.yaml
DATA_PROCESSOR:
- NAME: mask_points_and_boxes_outside_range
REMOVE_OUTSIDE_BOXES: True
STRICT_MASK: True
- NAME: shuffle_points
SHUFFLE_ENABLED: {
'train': True,
'test': True
}
- NAME: transform_points_to_voxels_placeholder
VOXEL_SIZE: [ 0.10, 0.10, 0.15 ]
MODEL:
NAME: VoxelRCNN
VFE:
NAME: DynMeanVFE
BACKBONE_3D:
NAME: VoxelBackBone8x
MAP_TO_BEV:
NAME: HeightCompression
NUM_BEV_FEATURES: 256
BACKBONE_2D:
NAME: BaseBEVBackbone
LAYER_NUMS: [5, 5]
LAYER_STRIDES: [1, 2]
NUM_FILTERS: [128, 256]
UPSAMPLE_STRIDES: [1, 2]
NUM_UPSAMPLE_FILTERS: [256, 256]
DENSE_HEAD:
NAME: CenterHead
CLASS_AGNOSTIC: False
CLASS_NAMES_EACH_HEAD: [
[ 'Vehicle', 'Pedestrian', 'Cyclist' ]
]
SHARED_CONV_CHANNEL: 64
USE_BIAS_BEFORE_NORM: True
NUM_HM_CONV: 2
SEPARATE_HEAD_CFG:
HEAD_ORDER: [ 'center', 'center_z', 'dim', 'rot' ]
HEAD_DICT: {
'center': { 'out_channels': 2, 'num_conv': 2 },
'center_z': { 'out_channels': 1, 'num_conv': 2 },
'dim': { 'out_channels': 3, 'num_conv': 2 },
'rot': { 'out_channels': 2, 'num_conv': 2 },
}
TARGET_ASSIGNER_CONFIG:
FEATURE_MAP_STRIDE: 8
NUM_MAX_OBJS: 500
GAUSSIAN_OVERLAP: 0.1
MIN_RADIUS: 2
LOSS_CONFIG:
LOSS_WEIGHTS: {
'cls_weight': 1.0,
'loc_weight': 2.0,
'code_weights': [ 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 ]
}
POST_PROCESSING:
SCORE_THRESH: 0.1
POST_CENTER_LIMIT_RANGE: [ -75.2, -75.2, -2, 75.2, 75.2, 4 ]
MAX_OBJ_PER_SAMPLE: 500
NMS_CONFIG:
NMS_TYPE: nms_gpu
NMS_THRESH: 0.7
NMS_PRE_MAXSIZE: 4096
NMS_POST_MAXSIZE: 500
ROI_HEAD:
NAME: VoxelRCNNHead
CLASS_AGNOSTIC: True
SHARED_FC: [256, 256]
CLS_FC: [256, 256]
REG_FC: [256, 256]
DP_RATIO: 0.3
NMS_CONFIG:
TRAIN:
NMS_TYPE: nms_gpu
MULTI_CLASSES_NMS: False
NMS_PRE_MAXSIZE: 9000
NMS_POST_MAXSIZE: 512
NMS_THRESH: 0.8
TEST:
NMS_TYPE: nms_gpu
MULTI_CLASSES_NMS: False
NMS_PRE_MAXSIZE: 1024
NMS_POST_MAXSIZE: 100
NMS_THRESH: 0.7
# NMS_PRE_MAXSIZE: 4096
# NMS_POST_MAXSIZE: 300
# NMS_THRESH: 0.85
ROI_GRID_POOL:
FEATURES_SOURCE: ['x_conv2', 'x_conv3', 'x_conv4']
PRE_MLP: True
GRID_SIZE: 6
POOL_LAYERS:
x_conv2:
MLPS: [ [ 64, 64 ] ]
QUERY_RANGES: [ [ 3, 3, 2 ] ]
POOL_RADIUS: [ 0.4 ]
NSAMPLE: [ 16 ]
POOL_METHOD: max_pool
x_conv3:
MLPS: [ [ 64, 64 ] ]
QUERY_RANGES: [ [ 3, 3, 2 ] ]
POOL_RADIUS: [ 0.8 ]
NSAMPLE: [ 16 ]
POOL_METHOD: max_pool
x_conv4:
MLPS: [ [ 64, 64 ] ]
QUERY_RANGES: [ [ 3, 3, 2 ] ]
POOL_RADIUS: [ 1.6 ]
NSAMPLE: [ 16 ]
POOL_METHOD: max_pool
TARGET_CONFIG:
BOX_CODER: ResidualCoder
ROI_PER_IMAGE: 128
FG_RATIO: 0.5
SAMPLE_ROI_BY_EACH_CLASS: True
CLS_SCORE_TYPE: roi_iou
CLS_FG_THRESH: 0.75
CLS_BG_THRESH: 0.25
CLS_BG_THRESH_LO: 0.1
HARD_BG_RATIO: 0.8
REG_FG_THRESH: 0.55
LOSS_CONFIG:
CLS_LOSS: BinaryCrossEntropy
REG_LOSS: smooth-l1
CORNER_LOSS_REGULARIZATION: True
LOSS_WEIGHTS: {
'rcnn_cls_weight': 1.0,
'rcnn_reg_weight': 1.0,
'rcnn_corner_weight': 1.0,
'code_weights': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
}
POST_PROCESSING:
RECALL_THRESH_LIST: [0.3, 0.5, 0.7]
SCORE_THRESH: 0.1
OUTPUT_RAW_SCORE: False
EVAL_METRIC: waymo
NMS_CONFIG:
MULTI_CLASSES_NMS: False
NMS_TYPE: nms_gpu
NMS_THRESH: 0.7
NMS_PRE_MAXSIZE: 4096
NMS_POST_MAXSIZE: 500
OPTIMIZATION:
BATCH_SIZE_PER_GPU: 4
NUM_EPOCHS: 30
OPTIMIZER: adam_onecycle
LR: 0.01
WEIGHT_DECAY: 0.001
MOMENTUM: 0.9
MOMS: [0.95, 0.85]
PCT_START: 0.4
DIV_FACTOR: 10
DECAY_STEP_LIST: [35, 45]
LR_DECAY: 0.1
LR_CLIP: 0.0000001
LR_WARMUP: False
WARMUP_EPOCH: 1
GRAD_NORM_CLIP: 10
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment