Unverified Commit 86f6183d authored by ChaimZhu, committed by GitHub

[Refactor] move voxelization to data_preprocessor and fix new ut bugs (#1671)

* mv voxelization

* update

* update full

* fix configs

* improve docstring of data_preprocessor

* fix dynamic voxel config

* remove default voxel_type in config

* fix typos

* add docstring

* fix ut

* update

* fix docstring
parent a50c71dd
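
The heart of this refactor, before reading the hunks: per-detector voxelization modules (`pts_voxel_layer` / `voxel_layer`) are removed, and their config moves onto `Det3DDataPreprocessor`. A minimal before/after sketch of the migration (field names come from the hunks below; the surrounding values are illustrative):

```python
# Before: voxelization was configured on the detector itself.
model = dict(
    type='DynamicMVXFasterRCNN',
    pts_voxel_layer=dict(
        max_num_points=-1,  # -1 means a dynamic number of points per voxel
        point_cloud_range=point_cloud_range,
        voxel_size=voxel_size,
        max_voxels=(-1, -1)))

# After: voxelization is configured on the data preprocessor.
model = dict(
    type='DynamicMVXFasterRCNN',
    data_preprocessor=dict(
        type='Det3DDataPreprocessor',
        voxel=True,
        voxel_type='dynamic',  # or 'hard' for hard voxelization
        voxel_layer=dict(
            max_num_points=-1,
            point_cloud_range=point_cloud_range,
            voxel_size=voxel_size,
            max_voxels=(-1, -1))))
```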
@@ -8,6 +8,13 @@ model = dict(
     type='DynamicMVXFasterRCNN',
     data_preprocessor=dict(
         type='Det3DDataPreprocessor',
+        voxel=True,
+        voxel_type='dynamic',
+        voxel_layer=dict(
+            max_num_points=-1,
+            point_cloud_range=point_cloud_range,
+            voxel_size=voxel_size,
+            max_voxels=(-1, -1)),
         mean=[102.9801, 115.9465, 122.7717],
         std=[1.0, 1.0, 1.0],
         bgr_to_rgb=False,
@@ -26,12 +33,6 @@ model = dict(
         in_channels=[256, 512, 1024, 2048],
         out_channels=256,
         num_outs=5),
-    pts_voxel_layer=dict(
-        max_num_points=-1,
-        point_cloud_range=point_cloud_range,
-        voxel_size=voxel_size,
-        max_voxels=(-1, -1),
-    ),
     pts_voxel_encoder=dict(
         type='DynamicVFE',
         in_channels=4,
...
@@ -38,8 +38,10 @@ db_sampler = dict(
     info_path=data_root + 'kitti_dbinfos_train.pkl',
     rate=1.0,
     prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)),
+    classes=class_names,
     sample_groups=dict(Car=15),
-    classes=class_names)
+    points_loader=dict(
+        type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4))
 train_pipeline = [
     dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
...
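
For orientation, the `db_sampler` that results from this hunk, assembled in one place (`data_root` and `class_names` are defined earlier in the config; only the fields visible in the hunk are guaranteed):

```python
db_sampler = dict(
    info_path=data_root + 'kitti_dbinfos_train.pkl',
    rate=1.0,
    prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)),
    classes=class_names,
    sample_groups=dict(Car=15),
    # New: the sampler now declares explicitly how GT-database points are loaded.
    points_loader=dict(
        type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4))
```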
@@ -25,7 +25,7 @@ model = dict(
         _delete_=True,
         pts=dict(
             assigner=dict(
-                type='MaxIoUAssigner',
+                type='Max3DIoUAssigner',
                 iou_calculator=dict(type='BboxOverlapsNearest3D'),
                 pos_iou_thr=0.55,
                 neg_iou_thr=0.4,
...
@@ -22,7 +22,7 @@ model = dict(
         _delete_=True,
         pts=dict(
             assigner=dict(
-                type='MaxIoUAssigner',
+                type='Max3DIoUAssigner',
                 iou_calculator=dict(type='BboxOverlapsNearest3D'),
                 pos_iou_thr=0.55,
                 neg_iou_thr=0.4,
...
@@ -60,7 +60,8 @@ val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
 # model settings
 model = dict(
-    pts_voxel_layer=dict(point_cloud_range=[-100, -100, -5, 100, 100, 3]),
+    data_preprocessor=dict(
+        voxel_layer=dict(point_cloud_range=[-100, -100, -5, 100, 100, 3])),
     pts_voxel_encoder=dict(
         feat_channels=[32, 64],
         point_cloud_range=[-100, -100, -5, 100, 100, 3]),
...
@@ -64,7 +64,7 @@ val_dataloader = dict(
 # model settings
 model = dict(
-    pts_voxel_layer=dict(max_num_points=20),
+    data_preprocessor=dict(voxel_layer=dict(max_num_points=20)),
     pts_voxel_encoder=dict(feat_channels=[64, 64]),
     pts_neck=dict(
         _delete_=True,
...
@@ -422,7 +422,7 @@ In MMDetection3D, for example, to change the FPN neck of PointPillars with the f
 ```python
 model = dict(
     type='MVXFasterRCNN',
-    pts_voxel_layer=dict(...),
+    data_preprocessor=dict(voxel_layer=dict(...)),
     pts_voxel_encoder=dict(...),
     pts_middle_encoder=dict(...),
     pts_backbone=dict(...),
...
@@ -423,7 +423,7 @@ gpu_ids = range(0, 1)  # IDs of the GPUs in use
 ```python
 model = dict(
     type='MVXFasterRCNN',
-    pts_voxel_layer=dict(...),
+    data_preprocessor=dict(voxel_layer=dict(...)),
     pts_voxel_encoder=dict(...),
     pts_middle_encoder=dict(...),
     pts_backbone=dict(...),
...
@@ -250,9 +250,9 @@ class Seg3DDataset(BaseDataset):
                 osp.join(self.data_prefix.get('pts_semantic_mask', ''),
                          info['pts_semantic_mask_path'])
-        # Add label_mapping to input dict for directly
-        # use it in PointSegClassMapping pipeline
-        info['label_mapping'] = self.label_mapping
+        # Only used by `PointSegClassMapping` in the pipeline
+        # to map original semantic classes to valid category ids.
+        info['seg_label_mapping'] = self.seg_label_mapping
         # 'eval_ann_info' will be updated in loading transforms
         if self.test_mode and self.load_eval_anns:
...
@@ -64,6 +64,7 @@ class KittiMetric(BaseMetric):
         self.pklfile_prefix = pklfile_prefix
         self.submission_prefix = submission_prefix
         self.pred_box_type_3d = pred_box_type_3d
+        self.default_cam_key = default_cam_key
         self.file_client_args = file_client_args
         allowed_metrics = ['bbox', 'img_bbox', 'mAP']
@@ -284,15 +285,17 @@ class KittiMetric(BaseMetric):
                 pklfile_prefix_ = osp.join(pklfile_prefix, name) + '.pkl'
             else:
                 pklfile_prefix_ = None
-            if 'pred_instances' in name and '3d' in name and name[0] != '_':
+            if 'pred_instances' in name and '3d' in name and name[
+                    0] != '_' and results[0][name]:
                 net_outputs = [result[name] for result in results]
                 result_list_ = self.bbox2result_kitti(net_outputs,
                                                       sample_id_list, classes,
                                                       pklfile_prefix_,
                                                       submission_prefix_)
                 result_dict[name] = result_list_
-            elif name == 'pred_instances' and name[0] != '_':
-                net_outputs = [info[name] for info in results]
+            elif name == 'pred_instances' and name[0] != '_' and results[0][
+                    name]:
+                net_outputs = [result[name] for result in results]
                 result_list_ = self.bbox2result_kitti2d(
                     net_outputs, sample_id_list, classes, pklfile_prefix_,
                     submission_prefix_)
...
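
In plainer terms, the metric now also requires a prediction field to be non-empty in the first result before formatting it. A condensed, hypothetical restatement of the new guard (not the literal code above):

```python
# Skip private fields and fields that carry no predictions.
for name in results[0]:
    if name.startswith('_') or not results[0][name]:
        continue
    # ... format 3D or 2D predictions as before ...
```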
 # Copyright (c) OpenMMLab. All rights reserved.
 from numbers import Number
-from typing import List, Optional, Sequence, Tuple, Union
+from typing import Dict, List, Optional, Sequence, Tuple, Union
 import numpy as np
+import torch
+from mmcv.ops import Voxelization
 from mmengine.data import BaseDataElement
 from mmengine.model import stack_batch
+from torch.nn import functional as F
 from mmdet3d.registry import MODELS
+from mmdet3d.utils import OptConfigType
 from mmdet.models import DetDataPreprocessor
 @MODELS.register_module()
 class Det3DDataPreprocessor(DetDataPreprocessor):
-    """Points (Image) pre-processor for point clouds / multi-modality 3D
-    detection tasks.
+    """Points / Image pre-processor for point clouds / vision-only / multi-
+    modality 3D detection tasks.
     It provides the data pre-processing as follows:
-    - Collate and move data to the target device.
+    - Collate and move image and point cloud data to the target device.
+    - 1) For image data:
     - Pad images in inputs to the maximum size of current batch with defined
       ``pad_value``. The padding size can be divisible by a defined
       ``pad_size_divisor``
@@ -25,8 +31,20 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
     - Convert images in inputs from bgr to rgb if the shape of input is
       (3, H, W).
     - Normalize images in inputs with defined std and mean.
+    - Do batch augmentations during training.
+    - 2) For point cloud data:
+    - If no voxelization is applied, directly return the list of point
+      cloud data.
+    - If voxelization is applied, voxelize the point cloud according to
+      ``voxel_type`` and obtain ``voxels``.
     Args:
+        voxel (bool): Whether to apply voxelization to the point cloud.
+        voxel_type (str): Voxelization type. Two voxelization types are
+            provided: 'hard' and 'dynamic', respectively for hard
+            voxelization and dynamic voxelization. Defaults to 'hard'.
+        voxel_layer (:obj:`ConfigDict`, optional): Voxelization layer
+            config. Defaults to None.
         mean (Sequence[Number], optional): The pixel mean of R, G, B channels.
             Defaults to None.
         std (Sequence[Number], optional): The pixel standard deviation of
@@ -38,9 +56,13 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
             Defaults to False.
         rgb_to_bgr (bool): Whether to convert images from RGB to BGR.
             Defaults to False.
+        batch_augments (list[dict], optional): Batch-level augmentations.
     """
     def __init__(self,
+                 voxel: bool = False,
+                 voxel_type: str = 'hard',
+                 voxel_layer: OptConfigType = None,
                  mean: Sequence[Number] = None,
                  std: Sequence[Number] = None,
                  pad_size_divisor: int = 1,
@@ -64,6 +86,10 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
             bgr_to_rgb=bgr_to_rgb,
             rgb_to_bgr=rgb_to_bgr,
             batch_augments=batch_augments)
+        self.voxel = voxel
+        self.voxel_type = voxel_type
+        if voxel:
+            self.voxel_layer = Voxelization(**voxel_layer)
     def forward(self,
                 data: List[Union[dict, List[dict]]],
@@ -152,6 +178,10 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
             'imgs': batch_imgs if imgs is not None else None
         }
+        if self.voxel:
+            voxel_dict = self.voxelize(points)
+            batch_inputs_dict['voxels'] = voxel_dict
         return batch_inputs_dict, batch_data_samples
     def collate_data(
@@ -203,3 +233,66 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
                              self.pad_size_divisor)) * self.pad_size_divisor
             batch_pad_shape.append((pad_h, pad_w))
         return batch_pad_shape
+    @torch.no_grad()
+    def voxelize(self, points: List[torch.Tensor]) -> Dict:
+        """Apply voxelization to the point cloud.
+
+        Args:
+            points (List[Tensor]): Point clouds of one data batch.
+
+        Returns:
+            dict[str, Tensor]: Voxelization information.
+
+            - voxels (Tensor): Features of voxels, shape is MxNxC for hard
+              voxelization, NxC for dynamic voxelization.
+            - coors (Tensor): Coordinates of voxels, shape is Nx(1+NDim),
+              where 1 represents the batch index.
+            - num_points (Tensor, optional): Number of points in each voxel.
+            - voxel_centers (Tensor, optional): Centers of voxels.
+        """
+        voxel_dict = dict()
+        if self.voxel_type == 'hard':
+            voxels, coors, num_points, voxel_centers = [], [], [], []
+            for res in points:
+                res_voxels, res_coors, res_num_points = self.voxel_layer(res)
+                res_voxel_centers = (
+                    res_coors[:, [2, 1, 0]] + 0.5) * res_voxels.new_tensor(
+                        self.voxel_layer.voxel_size) + res_voxels.new_tensor(
+                            self.voxel_layer.point_cloud_range[0:3])
+                voxels.append(res_voxels)
+                coors.append(res_coors)
+                num_points.append(res_num_points)
+                voxel_centers.append(res_voxel_centers)
+            voxels = torch.cat(voxels, dim=0)
+            num_points = torch.cat(num_points, dim=0)
+            voxel_centers = torch.cat(voxel_centers, dim=0)
+            coors_batch = []
+            for i, coor in enumerate(coors):
+                coor_pad = F.pad(coor, (1, 0), mode='constant', value=i)
+                coors_batch.append(coor_pad)
+            coors_batch = torch.cat(coors_batch, dim=0)
+            voxel_dict['num_points'] = num_points
+            voxel_dict['voxel_centers'] = voxel_centers
+        elif self.voxel_type == 'dynamic':
+            coors = []
+            # dynamic voxelization only provides a coors mapping
+            for res in points:
+                res_coors = self.voxel_layer(res)
+                coors.append(res_coors)
+            voxels = torch.cat(points, dim=0)
+            coors_batch = []
+            for i, coor in enumerate(coors):
+                coor_pad = F.pad(coor, (1, 0), mode='constant', value=i)
+                coors_batch.append(coor_pad)
+            coors_batch = torch.cat(coors_batch, dim=0)
+        else:
+            raise ValueError(f'Invalid voxelization type {self.voxel_type}')
+        voxel_dict['voxels'] = voxels
+        voxel_dict['coors'] = coors_batch
+        return voxel_dict
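
To make the new contract concrete, here is a minimal smoke test of the preprocessor's voxelization; this is a sketch, assuming the import path below and an mmcv build that ships the `Voxelization` op, with illustrative KITTI-style `voxel_layer` values:

```python
import torch

# Assumed import path for the class defined above.
from mmdet3d.models.data_preprocessors import Det3DDataPreprocessor

preprocessor = Det3DDataPreprocessor(
    voxel=True,
    voxel_type='hard',
    voxel_layer=dict(
        max_num_points=32,                            # points kept per voxel
        point_cloud_range=[0, -40, -3, 70.4, 40, 1],  # illustrative KITTI range
        voxel_size=[0.05, 0.05, 0.1],
        max_voxels=(16000, 40000)))

# Two samples of (x, y, z, intensity) points.
points = [torch.rand(1000, 4) * 10, torch.rand(800, 4) * 10]
voxel_dict = preprocessor.voxelize(points)

# Hard voxelization: voxels is (M, max_num_points, C); coors is (M, 1 + 3)
# with the batch index prepended by F.pad; num_points and voxel_centers
# are only present for the 'hard' branch.
print(voxel_dict['voxels'].shape, voxel_dict['coors'].shape)
```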
 # Copyright (c) OpenMMLab. All rights reserved.
 from typing import List, Optional, Union
+from mmengine import InstanceData
 from mmdet3d.registry import MODELS
 from mmdet3d.structures import Det3DDataSample
 from mmdet3d.structures.det3d_data_sample import (ForwardResults,
@@ -131,11 +133,17 @@ class Base3DDetector(BaseDetector):
             (results_list_3d is not None),\
             'please pass at least one type of results_list'
+        if results_list_2d is None:
+            results_list_2d = [
+                InstanceData() for _ in range(len(results_list_3d))
+            ]
+        if results_list_3d is None:
+            results_list_3d = [
+                InstanceData() for _ in range(len(results_list_2d))
+            ]
         for i in range(len(results_list_3d)):
             result = Det3DDataSample()
-            if results_list_3d is not None:
-                result.pred_instances_3d = results_list_3d[i]
-            if results_list_2d is not None:
-                result.pred_instances = results_list_2d[i]
+            result.pred_instances_3d = results_list_3d[i]
+            result.pred_instances = results_list_2d[i]
             data_sample_list.append(result)
         return data_sample_list
@@ -10,8 +10,6 @@ class CenterPoint(MVXTwoStageDetector):
     """Base class of Multi-modality VoxelNet.
     Args:
-        pts_voxel_layer (dict, optional): Point cloud voxelization
-            layer. Defaults to None.
         pts_voxel_encoder (dict, optional): Point voxelization
             encoder layer. Defaults to None.
         pts_middle_encoder (dict, optional): Middle encoder layer
@@ -43,7 +41,6 @@ class CenterPoint(MVXTwoStageDetector):
     """
     def __init__(self,
-                 pts_voxel_layer: Optional[dict] = None,
                  pts_voxel_encoder: Optional[dict] = None,
                  pts_middle_encoder: Optional[dict] = None,
                  pts_fusion_layer: Optional[dict] = None,
@@ -61,9 +58,8 @@ class CenterPoint(MVXTwoStageDetector):
                  **kwargs):
         super(CenterPoint,
-              self).__init__(pts_voxel_layer, pts_voxel_encoder,
-                             pts_middle_encoder, pts_fusion_layer,
-                             img_backbone, pts_backbone, img_neck, pts_neck,
-                             pts_bbox_head, img_roi_head, img_rpn_head,
-                             train_cfg, test_cfg, init_cfg, data_preprocessor,
-                             **kwargs)
+              self).__init__(pts_voxel_encoder, pts_middle_encoder,
+                             pts_fusion_layer, img_backbone, pts_backbone,
+                             img_neck, pts_neck, pts_bbox_head, img_roi_head,
+                             img_rpn_head, train_cfg, test_cfg, init_cfg,
+                             data_preprocessor, **kwargs)
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import List, Tuple
+from typing import Tuple
-import torch
-from mmcv.runner import force_fp32
 from torch import Tensor
-from torch.nn import functional as F
 from mmdet3d.registry import MODELS
 from mmdet3d.utils import ConfigType, OptConfigType, OptMultiConfig
@@ -17,7 +14,6 @@ class DynamicVoxelNet(VoxelNet):
     """
     def __init__(self,
-                 voxel_layer: ConfigType,
                  voxel_encoder: ConfigType,
                  middle_encoder: ConfigType,
                  backbone: ConfigType,
@@ -28,7 +24,6 @@ class DynamicVoxelNet(VoxelNet):
                  data_preprocessor: OptConfigType = None,
                  init_cfg: OptMultiConfig = None) -> None:
         super().__init__(
-            voxel_layer=voxel_layer,
             voxel_encoder=voxel_encoder,
             middle_encoder=middle_encoder,
             backbone=backbone,
@@ -39,37 +34,12 @@ class DynamicVoxelNet(VoxelNet):
             data_preprocessor=data_preprocessor,
             init_cfg=init_cfg)
-    @torch.no_grad()
-    @force_fp32()
-    def voxelize(self, points: List[torch.Tensor]) -> tuple:
-        """Apply dynamic voxelization to points.
-
-        Args:
-            points (list[Tensor]): Points of each sample.
-
-        Returns:
-            tuple[Tensor]: Concatenated points and coordinates.
-        """
-        coors = []
-        # dynamic voxelization only provide a coors mapping
-        for res in points:
-            res_coors = self.voxel_layer(res)
-            coors.append(res_coors)
-        points = torch.cat(points, dim=0)
-        coors_batch = []
-        for i, coor in enumerate(coors):
-            coor_pad = F.pad(coor, (1, 0), mode='constant', value=i)
-            coors_batch.append(coor_pad)
-        coors_batch = torch.cat(coors_batch, dim=0)
-        return points, coors_batch
     def extract_feat(self, batch_inputs_dict: dict) -> Tuple[Tensor]:
         """Extract features from points."""
-        # TODO: Remove voxelization to datapreprocessor
-        points = batch_inputs_dict['points']
-        voxels, coors = self.voxelize(points)
-        voxel_features, feature_coors = self.voxel_encoder(voxels, coors)
-        batch_size = coors[-1, 0].item() + 1
+        voxel_dict = batch_inputs_dict['voxels']
+        voxel_features, feature_coors = self.voxel_encoder(
+            voxel_dict['voxels'], voxel_dict['coors'])
+        batch_size = voxel_dict['coors'][-1, 0].item() + 1
         x = self.middle_encoder(voxel_features, feature_coors, batch_size)
         x = self.backbone(x)
         if self.with_neck:
...
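
Since detectors no longer voxelize for themselves, every refactored `extract_feat` reads the same structure from its inputs. A hedged sketch of the contract (key names come from the diffs; the tensors are normally produced by `Det3DDataPreprocessor`):

```python
# Sketch: batch_inputs_dict as consumed by the refactored extract_feat().
batch_inputs_dict = dict(
    points=points_list,  # raw per-sample point clouds, still available
    voxels=dict(         # produced by Det3DDataPreprocessor.voxelize()
        voxels=voxels,   # (M, max_num_points, C) hard / (N, C) dynamic
        coors=coors,     # (num_voxels, 1 + 3), batch index first
        num_points=num_points,          # hard voxelization only
        voxel_centers=voxel_centers))   # hard voxelization only
feats = detector.extract_feat(batch_inputs_dict)
```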
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import List, Optional, Sequence
+from typing import Dict, List, Optional, Sequence
-import torch
 from torch import Tensor
-from torch.nn import functional as F
 from mmdet3d.registry import MODELS
 from .mvx_two_stage import MVXTwoStageDetector
@@ -24,39 +22,18 @@ class DynamicMVXFasterRCNN(MVXTwoStageDetector):
     def __init__(self, **kwargs):
         super(DynamicMVXFasterRCNN, self).__init__(**kwargs)
-    @torch.no_grad()
-    def voxelize(self, points):
-        """Apply dynamic voxelization to points.
-
-        Args:
-            points (list[torch.Tensor]): Points of each sample.
-
-        Returns:
-            tuple[torch.Tensor]: Concatenated points and coordinates.
-        """
-        coors = []
-        # dynamic voxelization only provide a coors mapping
-        for res in points:
-            res_coors = self.pts_voxel_layer(res)
-            coors.append(res_coors)
-        points = torch.cat(points, dim=0)
-        coors_batch = []
-        for i, coor in enumerate(coors):
-            coor_pad = F.pad(coor, (1, 0), mode='constant', value=i)
-            coors_batch.append(coor_pad)
-        coors_batch = torch.cat(coors_batch, dim=0)
-        return points, coors_batch
     def extract_pts_feat(
             self,
-            points: List[Tensor],
+            voxel_dict: Dict[str, Tensor],
+            points: Optional[List[Tensor]] = None,
             img_feats: Optional[Sequence[Tensor]] = None,
             batch_input_metas: Optional[List[dict]] = None
     ) -> Sequence[Tensor]:
         """Extract features of points.
         Args:
-            points (List[tensor]): Point cloud of multiple inputs.
+            voxel_dict (Dict[str, Tensor]): Dict of voxelization infos.
+            points (List[tensor], optional): Point cloud of multiple inputs.
             img_feats (list[Tensor], tuple[tensor], optional): Features from
                 image backbone.
             batch_input_metas (list[dict], optional): The meta information
@@ -68,10 +45,10 @@ class DynamicMVXFasterRCNN(MVXTwoStageDetector):
         """
         if not self.with_pts_bbox:
             return None
-        voxels, coors = self.voxelize(points)
         voxel_features, feature_coors = self.pts_voxel_encoder(
-            voxels, coors, points, img_feats, batch_input_metas)
-        batch_size = coors[-1, 0] + 1
+            voxel_dict['voxels'], voxel_dict['coors'], points, img_feats,
+            batch_input_metas)
+        batch_size = voxel_dict['coors'][-1, 0] + 1
         x = self.pts_middle_encoder(voxel_features, feature_coors, batch_size)
         x = self.pts_backbone(x)
         if self.with_pts_neck:
...
 # Copyright (c) OpenMMLab. All rights reserved.
 import copy
-from typing import Dict, List, Optional, Sequence, Tuple
+from typing import Dict, List, Optional, Sequence
 import torch
-from mmcv.ops import Voxelization
 from mmengine import InstanceData
 from torch import Tensor
-from torch.nn import functional as F
 from mmdet3d.registry import MODELS
 from mmdet3d.structures import Det3DDataSample
@@ -18,8 +16,6 @@ class MVXTwoStageDetector(Base3DDetector):
     """Base class of Multi-modality VoxelNet.
     Args:
-        pts_voxel_layer (dict, optional): Point cloud voxelization
-            layer. Defaults to None.
         pts_voxel_encoder (dict, optional): Point voxelization
             encoder layer. Defaults to None.
         pts_middle_encoder (dict, optional): Middle encoder layer
@@ -51,7 +47,6 @@ class MVXTwoStageDetector(Base3DDetector):
     """
     def __init__(self,
-                 pts_voxel_layer: Optional[dict] = None,
                  pts_voxel_encoder: Optional[dict] = None,
                  pts_middle_encoder: Optional[dict] = None,
                  pts_fusion_layer: Optional[dict] = None,
@@ -70,8 +65,6 @@ class MVXTwoStageDetector(Base3DDetector):
         super(MVXTwoStageDetector, self).__init__(
             init_cfg=init_cfg, data_preprocessor=data_preprocessor, **kwargs)
-        if pts_voxel_layer:
-            self.pts_voxel_layer = Voxelization(**pts_voxel_layer)
         if pts_voxel_encoder:
             self.pts_voxel_encoder = MODELS.build(pts_voxel_encoder)
         if pts_middle_encoder:
@@ -192,14 +185,16 @@ class MVXTwoStageDetector(Base3DDetector):
     def extract_pts_feat(
             self,
-            points: List[Tensor],
+            voxel_dict: Dict[str, Tensor],
+            points: Optional[List[Tensor]] = None,
             img_feats: Optional[Sequence[Tensor]] = None,
             batch_input_metas: Optional[List[dict]] = None
     ) -> Sequence[Tensor]:
         """Extract features of points.
         Args:
-            points (List[tensor]): Point cloud of multiple inputs.
+            voxel_dict (Dict[str, Tensor]): Dict of voxelization infos.
+            points (List[tensor], optional): Point cloud of multiple inputs.
             img_feats (list[Tensor], tuple[tensor], optional): Features from
                 image backbone.
             batch_input_metas (list[dict], optional): The meta information
@@ -211,11 +206,13 @@ class MVXTwoStageDetector(Base3DDetector):
         """
         if not self.with_pts_bbox:
             return None
-        voxels, num_points, coors = self.voxelize(points)
-        voxel_features = self.pts_voxel_encoder(voxels, num_points, coors,
-                                                img_feats, batch_input_metas)
-        batch_size = coors[-1, 0] + 1
-        x = self.pts_middle_encoder(voxel_features, coors, batch_size)
+        voxel_features = self.pts_voxel_encoder(voxel_dict['voxels'],
+                                                voxel_dict['num_points'],
+                                                voxel_dict['coors'], img_feats,
+                                                batch_input_metas)
+        batch_size = voxel_dict['coors'][-1, 0] + 1
+        x = self.pts_middle_encoder(voxel_features, voxel_dict['coors'],
+                                    batch_size)
         x = self.pts_backbone(x)
         if self.with_pts_neck:
             x = self.pts_neck(x)
@@ -238,39 +235,17 @@ class MVXTwoStageDetector(Base3DDetector):
             tuple: Two elements in tuple arrange as
                 image features and point cloud features.
         """
-        points = batch_inputs_dict['points']
+        voxel_dict = batch_inputs_dict['voxels']
         imgs = batch_inputs_dict['imgs']
+        points = batch_inputs_dict['points']
         img_feats = self.extract_img_feat(imgs, batch_input_metas)
         pts_feats = self.extract_pts_feat(
-            points, img_feats=img_feats, batch_input_metas=batch_input_metas)
+            voxel_dict,
+            points=points,
+            img_feats=img_feats,
+            batch_input_metas=batch_input_metas)
         return (img_feats, pts_feats)
-    @torch.no_grad()
-    def voxelize(self, points: List[Tensor]) -> Tuple:
-        """Apply dynamic voxelization to points.
-
-        Args:
-            points (list[torch.Tensor]): Points of each sample.
-
-        Returns:
-            tuple[torch.Tensor]: Concatenated points, number of points
-                per voxel, and coordinates.
-        """
-        voxels, coors, num_points = [], [], []
-        for res in points:
-            res_voxels, res_coors, res_num_points = self.pts_voxel_layer(res)
-            voxels.append(res_voxels)
-            coors.append(res_coors)
-            num_points.append(res_num_points)
-        voxels = torch.cat(voxels, dim=0)
-        num_points = torch.cat(num_points, dim=0)
-        coors_batch = []
-        for i, coor in enumerate(coors):
-            coor_pad = F.pad(coor, (1, 0), mode='constant', value=i)
-            coors_batch.append(coor_pad)
-        coors_batch = torch.cat(coors_batch, dim=0)
-        return voxels, num_points, coors_batch
     def loss(self, batch_inputs_dict: Dict[List, torch.Tensor],
              batch_data_samples: List[Det3DDataSample],
              **kwargs) -> List[Det3DDataSample]:
...
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Dict, List, Optional
+from typing import Dict, Optional
-import torch
-from mmcv.ops import Voxelization
-from torch.nn import functional as F
 from mmdet3d.registry import MODELS
 from .two_stage import TwoStage3DDetector
@@ -17,7 +13,6 @@ class PartA2(TwoStage3DDetector):
     """
     def __init__(self,
-                 voxel_layer: dict,
                  voxel_encoder: dict,
                  middle_encoder: dict,
                  backbone: dict,
@@ -37,7 +32,6 @@ class PartA2(TwoStage3DDetector):
             test_cfg=test_cfg,
             init_cfg=init_cfg,
             data_preprocessor=data_preprocessor)
-        self.voxel_layer = Voxelization(**voxel_layer)
         self.voxel_encoder = MODELS.build(voxel_encoder)
         self.middle_encoder = MODELS.build(middle_encoder)
@@ -57,8 +51,7 @@ class PartA2(TwoStage3DDetector):
             and for inside 3D object detection, usually a dict containing
             features will be obtained.
         """
-        points = batch_inputs_dict['points']
-        voxel_dict = self.voxelize(points)
+        voxel_dict = batch_inputs_dict['voxels']
         voxel_features = self.voxel_encoder(voxel_dict['voxels'],
                                             voxel_dict['num_points'],
                                             voxel_dict['coors'])
@@ -71,34 +64,3 @@ class PartA2(TwoStage3DDetector):
         feats_dict.update({'neck_feats': neck_feats})
         feats_dict['voxels_dict'] = voxel_dict
         return feats_dict
-    @torch.no_grad()
-    def voxelize(self, points: List[torch.Tensor]) -> Dict:
-        """Apply hard voxelization to points."""
-        voxels, coors, num_points, voxel_centers = [], [], [], []
-        for res in points:
-            res_voxels, res_coors, res_num_points = self.voxel_layer(res)
-            res_voxel_centers = (
-                res_coors[:, [2, 1, 0]] + 0.5) * res_voxels.new_tensor(
-                    self.voxel_layer.voxel_size) + res_voxels.new_tensor(
-                        self.voxel_layer.point_cloud_range[0:3])
-            voxels.append(res_voxels)
-            coors.append(res_coors)
-            num_points.append(res_num_points)
-            voxel_centers.append(res_voxel_centers)
-        voxels = torch.cat(voxels, dim=0)
-        num_points = torch.cat(num_points, dim=0)
-        voxel_centers = torch.cat(voxel_centers, dim=0)
-        coors_batch = []
-        for i, coor in enumerate(coors):
-            coor_pad = F.pad(coor, (1, 0), mode='constant', value=i)
-            coors_batch.append(coor_pad)
-        coors_batch = torch.cat(coors_batch, dim=0)
-        voxel_dict = dict(
-            voxels=voxels,
-            num_points=num_points,
-            coors=coors_batch,
-            voxel_centers=voxel_centers)
-        return voxel_dict
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import List, Tuple
+from typing import Tuple
-import torch
-from mmcv.ops import Voxelization
-from mmcv.runner import force_fp32
 from torch import Tensor
-from torch.nn import functional as F
 from mmdet3d.registry import MODELS
 from mmdet3d.utils import ConfigType, OptConfigType, OptMultiConfig
@@ -17,7 +13,6 @@ class VoxelNet(SingleStage3DDetector):
     r"""`VoxelNet <https://arxiv.org/abs/1711.06396>`_ for 3D detection."""
     def __init__(self,
-                 voxel_layer: ConfigType,
                  voxel_encoder: ConfigType,
                  middle_encoder: ConfigType,
                  backbone: ConfigType,
@@ -35,37 +30,18 @@ class VoxelNet(SingleStage3DDetector):
             test_cfg=test_cfg,
             data_preprocessor=data_preprocessor,
             init_cfg=init_cfg)
-        self.voxel_layer = Voxelization(**voxel_layer)
         self.voxel_encoder = MODELS.build(voxel_encoder)
         self.middle_encoder = MODELS.build(middle_encoder)
-    @torch.no_grad()
-    @force_fp32()
-    def voxelize(self, points: List[torch.Tensor]) -> tuple:
-        """Apply hard voxelization to points."""
-        voxels, coors, num_points = [], [], []
-        for res in points:
-            res_voxels, res_coors, res_num_points = self.voxel_layer(res)
-            voxels.append(res_voxels)
-            coors.append(res_coors)
-            num_points.append(res_num_points)
-        voxels = torch.cat(voxels, dim=0)
-        num_points = torch.cat(num_points, dim=0)
-        coors_batch = []
-        for i, coor in enumerate(coors):
-            coor_pad = F.pad(coor, (1, 0), mode='constant', value=i)
-            coors_batch.append(coor_pad)
-        coors_batch = torch.cat(coors_batch, dim=0)
-        return voxels, num_points, coors_batch
     def extract_feat(self, batch_inputs_dict: dict) -> Tuple[Tensor]:
         """Extract features from points."""
-        # TODO: Remove voxelization to datapreprocessor
-        points = batch_inputs_dict['points']
-        voxels, num_points, coors = self.voxelize(points)
-        voxel_features = self.voxel_encoder(voxels, num_points, coors)
-        batch_size = coors[-1, 0].item() + 1
-        x = self.middle_encoder(voxel_features, coors, batch_size)
+        voxel_dict = batch_inputs_dict['voxels']
+        voxel_features = self.voxel_encoder(voxel_dict['voxels'],
+                                            voxel_dict['num_points'],
+                                            voxel_dict['coors'])
+        batch_size = voxel_dict['coors'][-1, 0].item() + 1
+        x = self.middle_encoder(voxel_features, voxel_dict['coors'],
+                                batch_size)
         x = self.backbone(x)
         if self.with_neck:
             x = self.neck(x)
...
@@ -33,10 +33,7 @@ def _generate_s3dis_seg_dataset_config():
             with_label_3d=False,
             with_mask_3d=False,
             with_seg_3d=True),
-        dict(
-            type='PointSegClassMapping',
-            valid_cat_ids=tuple(range(len(classes))),
-            max_cat_id=13),
+        dict(type='PointSegClassMapping'),
         dict(
             type='IndoorPatchPointSample',
             num_points=5,
...
@@ -55,11 +55,7 @@ def _generate_scannet_seg_dataset_config():
             with_label_3d=False,
             with_mask_3d=False,
             with_seg_3d=True),
-        dict(
-            type='PointSegClassMapping',
-            valid_cat_ids=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24,
-                           28, 33, 34, 36, 39),
-            max_cat_id=40),
+        dict(type='PointSegClassMapping'),
         dict(
             type='IndoorPatchPointSample',
             num_points=5,
...
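
These test-config hunks pair with the `Seg3DDataset` change earlier: the class mapping now travels with each sample instead of being re-declared per pipeline. A sketch of the resulting flow (attribute and key names are taken from the diffs; the loader entry is illustrative):

```python
# The dataset computes the mapping once and attaches it to every sample ...
info['seg_label_mapping'] = self.seg_label_mapping

# ... so the transform needs no valid_cat_ids / max_cat_id arguments anymore.
pipeline = [
    dict(type='LoadPointsFromFile', coord_type='DEPTH', load_dim=6, use_dim=3),
    dict(type='PointSegClassMapping'),  # reads `seg_label_mapping` from the input dict
]
```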