Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
mmdetection3d
Commits
d7067e44
Unverified
Commit
d7067e44
authored
Dec 03, 2022
by
Wenwei Zhang
Committed by
GitHub
Dec 03, 2022
Browse files
Bump version to v1.1.0rc2
Bump to v1.1.0rc2
parents
28fe73d2
fb0e57e5
Changes
360
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1102 additions
and
775 deletions
+1102
-775
mmdet3d/models/layers/dgcnn_modules/dgcnn_fp_module.py
mmdet3d/models/layers/dgcnn_modules/dgcnn_fp_module.py
+20
-15
mmdet3d/models/layers/dgcnn_modules/dgcnn_gf_module.py
mmdet3d/models/layers/dgcnn_modules/dgcnn_gf_module.py
+64
-63
mmdet3d/models/layers/edge_fusion_module.py
mmdet3d/models/layers/edge_fusion_module.py
+28
-22
mmdet3d/models/layers/fusion_layers/coord_transform.py
mmdet3d/models/layers/fusion_layers/coord_transform.py
+32
-24
mmdet3d/models/layers/fusion_layers/point_fusion.py
mmdet3d/models/layers/fusion_layers/point_fusion.py
+120
-117
mmdet3d/models/layers/fusion_layers/vote_fusion.py
mmdet3d/models/layers/fusion_layers/vote_fusion.py
+19
-10
mmdet3d/models/layers/mlp.py
mmdet3d/models/layers/mlp.py
+25
-18
mmdet3d/models/layers/norm.py
mmdet3d/models/layers/norm.py
+13
-15
mmdet3d/models/layers/paconv/paconv.py
mmdet3d/models/layers/paconv/paconv.py
+100
-90
mmdet3d/models/layers/paconv/utils.py
mmdet3d/models/layers/paconv/utils.py
+18
-14
mmdet3d/models/layers/pointnet_modules/__init__.py
mmdet3d/models/layers/pointnet_modules/__init__.py
+2
-1
mmdet3d/models/layers/pointnet_modules/builder.py
mmdet3d/models/layers/pointnet_modules/builder.py
+6
-2
mmdet3d/models/layers/pointnet_modules/paconv_sa_module.py
mmdet3d/models/layers/pointnet_modules/paconv_sa_module.py
+145
-104
mmdet3d/models/layers/pointnet_modules/point_fp_module.py
mmdet3d/models/layers/pointnet_modules/point_fp_module.py
+18
-15
mmdet3d/models/layers/pointnet_modules/point_sa_module.py
mmdet3d/models/layers/pointnet_modules/point_sa_module.py
+125
-123
mmdet3d/models/layers/pointnet_modules/stack_point_sa_module.py
...d/models/layers/pointnet_modules/stack_point_sa_module.py
+199
-0
mmdet3d/models/layers/sparse_block.py
mmdet3d/models/layers/sparse_block.py
+62
-52
mmdet3d/models/layers/spconv/overwrite_spconv/write_spconv2.py
...3d/models/layers/spconv/overwrite_spconv/write_spconv2.py
+6
-3
mmdet3d/models/layers/transformer.py
mmdet3d/models/layers/transformer.py
+43
-36
mmdet3d/models/layers/vote_module.py
mmdet3d/models/layers/vote_module.py
+57
-51
No files found.
mmdet3d/models/layers/dgcnn_modules/dgcnn_fp_module.py
View file @
d7067e44
# Copyright (c) OpenMMLab. All rights reserved.
from
typing
import
List
from
mmcv.cnn
import
ConvModule
from
mmengine.model
import
BaseModule
from
torch
import
Tensor
from
torch
import
nn
as
nn
from
mmdet3d.utils
import
ConfigType
,
OptMultiConfig
class
DGCNNFPModule
(
BaseModule
):
"""Point feature propagation module used in DGCNN.
...
...
@@ -10,21 +15,21 @@ class DGCNNFPModule(BaseModule):
Propagate the features from one set to another.
Args:
mlp_channels (
l
ist[int]): List of mlp channels.
norm_cfg (
d
ict
,
o
ptional): Type of activation method.
Defaults to dict(type='BN1d').
act_cfg (
d
ict
,
o
ptional): Type of
activation
method
.
mlp_channels (
L
ist[int]): List of mlp channels.
norm_cfg (
:obj:`ConfigD
ict
`
o
r dict): Config dict for normalization
layer.
Defaults to dict(type='BN1d').
act_cfg (
:obj:`ConfigD
ict
`
o
r dict): Config dict for
activation
layer
.
Defaults to dict(type='ReLU').
init_cfg (dict, optional): Initialization config. Defaults to None.
init_cfg (:obj:`ConfigDict` or dict or List[:obj:`Contigdict` or dict],
optional): Initialization config dict. Defaults to None.
"""
def
__init__
(
self
,
mlp_channels
,
norm_cfg
=
dict
(
type
=
'BN1d'
),
act_cfg
=
dict
(
type
=
'ReLU'
),
init_cfg
=
None
):
super
().
__init__
(
init_cfg
=
init_cfg
)
self
.
fp16_enabled
=
False
mlp_channels
:
List
[
int
],
norm_cfg
:
ConfigType
=
dict
(
type
=
'BN1d'
),
act_cfg
:
ConfigType
=
dict
(
type
=
'ReLU'
),
init_cfg
:
OptMultiConfig
=
None
)
->
None
:
super
(
DGCNNFPModule
,
self
).
__init__
(
init_cfg
=
init_cfg
)
self
.
mlps
=
nn
.
Sequential
()
for
i
in
range
(
len
(
mlp_channels
)
-
1
):
self
.
mlps
.
add_module
(
...
...
@@ -38,14 +43,14 @@ class DGCNNFPModule(BaseModule):
norm_cfg
=
norm_cfg
,
act_cfg
=
act_cfg
))
def
forward
(
self
,
points
)
:
"""
f
orward.
def
forward
(
self
,
points
:
Tensor
)
->
Tensor
:
"""
F
orward.
Args:
points (Tensor): (B, N, C)
t
ensor of the input points.
points (Tensor): (B, N, C)
T
ensor of the input points.
Returns:
Tensor: (B, N, M) M = mlp[-1]
, t
ensor of the new points.
Tensor: (B, N, M) M = mlp[-1]
. T
ensor of the new points.
"""
if
points
is
not
None
:
...
...
mmdet3d/models/layers/dgcnn_modules/dgcnn_gf_module.py
View file @
d7067e44
# Copyright (c) OpenMMLab. All rights reserved.
from
typing
import
List
,
Optional
,
Union
import
torch
from
mmcv.cnn
import
ConvModule
from
mmcv.ops.group_points
import
GroupAll
,
QueryAndGroup
,
grouping_operation
from
torch
import
Tensor
from
torch
import
nn
as
nn
from
torch.nn
import
functional
as
F
from
mmdet3d.utils
import
ConfigType
class
BaseDGCNNGFModule
(
nn
.
Module
):
"""Base module for point graph feature module used in DGCNN.
Args:
radii (
l
ist[float]): List of radius in each knn or ball query.
sample_nums (
l
ist[int]): Number of samples in each knn or ball query.
mlp_channels (
l
ist[
l
ist[int]]): Specify of the dgcnn before
the global
pooling for each graph feature module.
knn_modes (
l
ist[str]
, optional
): Type of KNN method, valid mode
['F-KNN', 'D-KNN']
,
Defaults to ['F-KNN'].
dilated_group (bool
, optional
): Whether to use dilated ball query.
radii (
L
ist[float]): List of radius in each knn or ball query.
sample_nums (
L
ist[int]): Number of samples in each knn or ball query.
mlp_channels (
L
ist[
L
ist[int]]): Specify of the dgcnn before
the global
pooling for each graph feature module.
knn_modes (
L
ist[str]): Type of KNN method, valid mode
['F-KNN', 'D-KNN']
.
Defaults to ['F-KNN'].
dilated_group (bool): Whether to use dilated ball query.
Defaults to False.
use_xyz (bool
, optional
): Whether to use xyz as point features.
use_xyz (bool): Whether to use xyz as point features.
Defaults to True.
pool_mode (str
, optional
): Type of pooling method. Defaults to 'max'.
normalize_xyz (bool
, optional
): If ball query, whether to normalize
local XYZ
with radius. Defaults to False.
grouper_return_grouped_xyz (bool
, optional
): Whether to return grouped
xyz in
`QueryAndGroup`. Defaults to False.
grouper_return_grouped_idx (bool
, optional
): Whether to return grouped
idx in
`QueryAndGroup`. Defaults to False.
pool_mode (str): Type of pooling method. Defaults to 'max'.
normalize_xyz (bool): If ball query, whether to normalize
local XYZ
with radius. Defaults to False.
grouper_return_grouped_xyz (bool): Whether to return grouped
xyz in
`QueryAndGroup`. Defaults to False.
grouper_return_grouped_idx (bool): Whether to return grouped
idx in
`QueryAndGroup`. Defaults to False.
"""
def
__init__
(
self
,
radii
,
sample_nums
,
mlp_channels
,
knn_modes
=
[
'F-KNN'
],
dilated_group
=
False
,
use_xyz
=
True
,
pool_mode
=
'max'
,
normalize_xyz
=
False
,
grouper_return_grouped_xyz
=
False
,
grouper_return_grouped_idx
=
False
)
:
radii
:
List
[
float
]
,
sample_nums
:
List
[
int
]
,
mlp_channels
:
List
[
List
[
int
]]
,
knn_modes
:
List
[
str
]
=
[
'F-KNN'
],
dilated_group
:
bool
=
False
,
use_xyz
:
bool
=
True
,
pool_mode
:
str
=
'max'
,
normalize_xyz
:
bool
=
False
,
grouper_return_grouped_xyz
:
bool
=
False
,
grouper_return_grouped_idx
:
bool
=
False
)
->
None
:
super
(
BaseDGCNNGFModule
,
self
).
__init__
()
assert
len
(
sample_nums
)
==
len
(
...
...
@@ -82,16 +87,15 @@ class BaseDGCNNGFModule(nn.Module):
grouper
=
GroupAll
(
use_xyz
)
self
.
groupers
.
append
(
grouper
)
def
_pool_features
(
self
,
features
)
:
def
_pool_features
(
self
,
features
:
Tensor
)
->
Tensor
:
"""Perform feature aggregation using pooling operation.
Args:
features (
torch.
Tensor): (B, C, N, K)
Features of locally grouped
points before pooling.
features (Tensor): (B, C, N, K)
Features of locally grouped
points before pooling.
Returns:
torch.Tensor: (B, C, N)
Pooled features aggregating local information.
Tensor: (B, C, N) Pooled features aggregating local information.
"""
if
self
.
pool_mode
==
'max'
:
# (B, C, N, 1)
...
...
@@ -106,15 +110,15 @@ class BaseDGCNNGFModule(nn.Module):
return
new_features
.
squeeze
(
-
1
).
contiguous
()
def
forward
(
self
,
points
)
:
def
forward
(
self
,
points
:
Tensor
)
->
Tensor
:
"""forward.
Args:
points (Tensor): (B, N, C)
i
nput points.
points (Tensor): (B, N, C)
I
nput points.
Returns:
List[
Tensor
]
: (B, N, C1)
n
ew points generated from each graph
feature module.
Tensor: (B, N, C1)
N
ew points generated from each graph
feature module.
"""
new_points_list
=
[
points
]
...
...
@@ -155,43 +159,40 @@ class DGCNNGFModule(BaseDGCNNGFModule):
"""Point graph feature module used in DGCNN.
Args:
mlp_channels (
l
ist[int]): Specify of the dgcnn before
the global
pooling for each graph feature module.
mlp_channels (
L
ist[int]): Specify of the dgcnn before
the global
pooling for each graph feature module.
num_sample (int, optional): Number of samples in each knn or ball
query. Defaults to None.
knn_mode (str, optional): Type of KNN method, valid mode
['F-KNN', 'D-KNN']. Defaults to 'F-KNN'.
radius (float, optional): Radius to group with.
Defaults to None.
dilated_group (bool, optional): Whether to use dilated ball query.
knn_mode (str): Type of KNN method, valid mode ['F-KNN', 'D-KNN'].
Defaults to 'F-KNN'.
radius (float, optional): Radius to group with. Defaults to None.
dilated_group (bool): Whether to use dilated ball query.
Defaults to False.
norm_cfg (
d
ict
,
o
ptional): Type of
normalization
method.
Defaults to dict(type='BN2d').
act_cfg (
d
ict
,
o
ptional): Type of
activation
method
.
norm_cfg (
:obj:`ConfigD
ict
`
o
r dict): Config dict for
normalization
layer.
Defaults to dict(type='BN2d').
act_cfg (
:obj:`ConfigD
ict
`
o
r dict): Config dict for
activation
layer
.
Defaults to dict(type='ReLU').
use_xyz (bool, optional): Whether to use xyz as point features.
Defaults to True.
pool_mode (str, optional): Type of pooling method.
Defaults to 'max'.
normalize_xyz (bool, optional): If ball query, whether to normalize
local XYZ with radius. Defaults to False.
bias (bool | str, optional): If specified as `auto`, it will be decided
by the norm_cfg. Bias will be set as True if `norm_cfg` is None,
use_xyz (bool): Whether to use xyz as point features. Defaults to True.
pool_mode (str): Type of pooling method. Defaults to 'max'.
normalize_xyz (bool): If ball query, whether to normalize local XYZ
with radius. Defaults to False.
bias (bool or str): If specified as `auto`, it will be decided by
`norm_cfg`. `bias` will be set as True if `norm_cfg` is None,
otherwise False. Defaults to 'auto'.
"""
def
__init__
(
self
,
mlp_channels
,
num_sample
=
None
,
knn_mode
=
'F-KNN'
,
radius
=
None
,
dilated_group
=
False
,
norm_cfg
=
dict
(
type
=
'BN2d'
),
act_cfg
=
dict
(
type
=
'ReLU'
),
use_xyz
=
True
,
pool_mode
=
'max'
,
normalize_xyz
=
False
,
bias
=
'auto'
)
:
mlp_channels
:
List
[
int
]
,
num_sample
:
Optional
[
int
]
=
None
,
knn_mode
:
str
=
'F-KNN'
,
radius
:
Optional
[
float
]
=
None
,
dilated_group
:
bool
=
False
,
norm_cfg
:
ConfigType
=
dict
(
type
=
'BN2d'
),
act_cfg
:
ConfigType
=
dict
(
type
=
'ReLU'
),
use_xyz
:
bool
=
True
,
pool_mode
:
str
=
'max'
,
normalize_xyz
:
bool
=
False
,
bias
:
Union
[
bool
,
str
]
=
'auto'
)
->
None
:
super
(
DGCNNGFModule
,
self
).
__init__
(
mlp_channels
=
[
mlp_channels
],
sample_nums
=
[
num_sample
],
...
...
mmdet3d/models/layers/edge_fusion_module.py
View file @
d7067e44
# Copyright (c) OpenMMLab. All rights reserved.
from
typing
import
List
from
mmcv.cnn
import
ConvModule
from
mmengine.model
import
BaseModule
from
torch
import
Tensor
from
torch
import
nn
as
nn
from
torch.nn
import
functional
as
F
from
mmdet3d.utils
import
ConfigType
class
EdgeFusionModule
(
BaseModule
):
"""Edge Fusion Module for feature map.
...
...
@@ -12,21 +17,22 @@ class EdgeFusionModule(BaseModule):
out_channels (int): The number of output channels.
feat_channels (int): The number of channels in feature map
during edge feature fusion.
kernel_size (int, optional): Kernel size of convolution.
Default: 3.
act_cfg (dict, optional): Config of activation.
Default: dict(type='ReLU').
norm_cfg (dict, optional): Config of normalization.
Default: dict(type='BN1d')).
kernel_size (int): Kernel size of convolution. Defaults to 3.
act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
Defaults to dict(type='ReLU').
norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
layer. Defaults to dict(type='BN1d').
"""
def
__init__
(
self
,
out_channels
,
feat_channels
,
kernel_size
=
3
,
act_cfg
=
dict
(
type
=
'ReLU'
),
norm_cfg
=
dict
(
type
=
'BN1d'
)):
super
().
__init__
()
def
__init__
(
self
,
out_channels
:
int
,
feat_channels
:
int
,
kernel_size
:
int
=
3
,
act_cfg
:
ConfigType
=
dict
(
type
=
'ReLU'
),
norm_cfg
:
ConfigType
=
dict
(
type
=
'BN1d'
)
)
->
None
:
super
(
EdgeFusionModule
,
self
).
__init__
()
self
.
edge_convs
=
nn
.
Sequential
(
ConvModule
(
feat_channels
,
...
...
@@ -39,22 +45,22 @@ class EdgeFusionModule(BaseModule):
nn
.
Conv1d
(
feat_channels
,
out_channels
,
kernel_size
=
1
))
self
.
feat_channels
=
feat_channels
def
forward
(
self
,
features
,
fused_features
,
edge_indices
,
edge_lens
,
output_h
,
output_w
):
def
forward
(
self
,
features
:
Tensor
,
fused_features
:
Tensor
,
edge_indices
:
Tensor
,
edge_lens
:
List
[
int
],
output_h
:
int
,
output_w
:
int
)
->
Tensor
:
"""Forward pass.
Args:
features (torch.Tensor): Different representative features
for fusion.
fused_features (torch.Tensor): Different representative
features to be fused.
edge_indices (torch.Tensor): Batch image edge indices.
edge_lens (list[int]): List of edge length of each image.
features (Tensor): Different representative features for fusion.
fused_features (Tensor): Different representative features
to be fused.
edge_indices (Tensor): Batch image edge indices.
edge_lens (List[int]): List of edge length of each image.
output_h (int): Height of output feature map.
output_w (int): Width of output feature map.
Returns:
torch.
Tensor: Fused feature maps.
Tensor: Fused feature maps.
"""
batch_size
=
features
.
shape
[
0
]
# normalize
...
...
mmdet3d/models/layers/fusion_layers/coord_transform.py
View file @
d7067e44
# Copyright (c) OpenMMLab. All rights reserved.
from
functools
import
partial
from
typing
import
Tuple
import
torch
from
torch
import
Tensor
from
mmdet3d.structures.points
import
get_points_type
def
apply_3d_transformation
(
pcd
,
coord_type
,
img_meta
,
reverse
=
False
):
def
apply_3d_transformation
(
pcd
:
Tensor
,
coord_type
:
str
,
img_meta
:
dict
,
reverse
:
bool
=
False
)
->
Tensor
:
"""Apply transformation to input point cloud.
Args:
pcd (
torch.
Tensor): The point cloud to be transformed.
pcd (Tensor): The point cloud to be transformed.
coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'.
img_meta(dict): Meta info regarding data transformation.
reverse (bool): Reversed transformation or not.
reverse (bool): Reversed transformation or not.
Defaults to False.
Note:
The elements in img_meta['transformation_3d_flow']:
"T" stands for translation;
"S" stands for scale;
"R" stands for rotation;
"HF" stands for horizontal flip;
"VF" stands for vertical flip.
- "T" stands for translation;
- "S" stands for scale;
- "R" stands for rotation;
- "HF" stands for horizontal flip;
- "VF" stands for vertical flip.
Returns:
torch.
Tensor: The transformed point cloud.
Tensor: The transformed point cloud.
"""
dtype
=
pcd
.
dtype
...
...
@@ -92,16 +98,18 @@ def apply_3d_transformation(pcd, coord_type, img_meta, reverse=False):
return
pcd
.
coord
def
extract_2d_info
(
img_meta
,
tensor
):
def
extract_2d_info
(
img_meta
:
dict
,
tensor
:
Tensor
)
->
Tuple
[
int
,
int
,
int
,
int
,
Tensor
,
bool
,
Tensor
]:
"""Extract image augmentation information from img_meta.
Args:
img_meta(dict): Meta info regarding data transformation.
tensor(
torch.
Tensor): Input tensor used to create new ones.
img_meta
(dict): Meta info regarding data transformation.
tensor
(Tensor): Input tensor used to create new ones.
Returns:
(
int, int, int, int, torch.Tensor, bool, torch.Tensor
)
:
The extracted information.
Tuple[
int, int, int, int, torch.Tensor, bool, torch.Tensor
]
:
The extracted information.
"""
img_shape
=
img_meta
[
'img_shape'
]
ori_shape
=
img_meta
[
'ori_shape'
]
...
...
@@ -120,17 +128,17 @@ def extract_2d_info(img_meta, tensor):
img_crop_offset
)
def
bbox_2d_transform
(
img_meta
,
bbox_2d
,
ori2new
):
def
bbox_2d_transform
(
img_meta
:
dict
,
bbox_2d
:
Tensor
,
ori2new
:
bool
)
->
Tensor
:
"""Transform 2d bbox according to img_meta.
Args:
img_meta(dict): Meta info regarding data transformation.
bbox_2d (torch.Tensor): Shape (..., >4)
The input 2d bboxes to transform.
img_meta (dict): Meta info regarding data transformation.
bbox_2d (Tensor): Shape (..., >4) The input 2d bboxes to transform.
ori2new (bool): Origin img coord system to new or not.
Returns:
torch.
Tensor: The transformed 2d bboxes.
Tensor: The transformed 2d bboxes.
"""
img_h
,
img_w
,
ori_h
,
ori_w
,
img_scale_factor
,
img_flip
,
\
...
...
@@ -174,17 +182,17 @@ def bbox_2d_transform(img_meta, bbox_2d, ori2new):
return
bbox_2d_new
def
coord_2d_transform
(
img_meta
,
coord_2d
,
ori2new
):
def
coord_2d_transform
(
img_meta
:
dict
,
coord_2d
:
Tensor
,
ori2new
:
bool
)
->
Tensor
:
"""Transform 2d pixel coordinates according to img_meta.
Args:
img_meta(dict): Meta info regarding data transformation.
coord_2d (torch.Tensor): Shape (..., 2)
The input 2d coords to transform.
img_meta (dict): Meta info regarding data transformation.
coord_2d (Tensor): Shape (..., 2) The input 2d coords to transform.
ori2new (bool): Origin img coord system to new or not.
Returns:
torch.
Tensor: The transformed 2d coordinates.
Tensor: The transformed 2d coordinates.
"""
img_h
,
img_w
,
ori_h
,
ori_w
,
img_scale_factor
,
img_flip
,
\
...
...
mmdet3d/models/layers/fusion_layers/point_fusion.py
View file @
d7067e44
# Copyright (c) OpenMMLab. All rights reserved.
from
typing
import
List
,
Tuple
,
Union
import
torch
from
mmcv.cnn
import
ConvModule
from
mmengine.model
import
BaseModule
from
torch
import
Tensor
from
torch
import
nn
as
nn
from
torch.nn
import
functional
as
F
from
mmdet3d.registry
import
MODELS
from
mmdet3d.structures.bbox_3d
import
(
get_proj_mat_by_coord_type
,
points_cam2img
,
points_img2cam
)
from
mmdet3d.utils
import
OptConfigType
,
OptMultiConfig
from
.
import
apply_3d_transformation
def
point_sample
(
img_meta
,
img_features
,
points
,
proj_mat
,
coord_type
,
img_scale_factor
,
img_crop_offset
,
img_flip
,
img_pad_shape
,
img_shape
,
aligned
=
True
,
padding_mode
=
'zeros'
,
align_corners
=
True
,
valid_flag
=
False
)
:
def
point_sample
(
img_meta
:
dict
,
img_features
:
Tensor
,
points
:
Tensor
,
proj_mat
:
Tensor
,
coord_type
:
str
,
img_scale_factor
:
Tensor
,
img_crop_offset
:
Tensor
,
img_flip
:
bool
,
img_pad_shape
:
Tuple
[
int
]
,
img_shape
:
Tuple
[
int
]
,
aligned
:
bool
=
True
,
padding_mode
:
str
=
'zeros'
,
align_corners
:
bool
=
True
,
valid_flag
:
bool
=
False
)
->
Tensor
:
"""Obtain image features using points.
Args:
img_meta (dict): Meta info.
img_features (
torch.
Tensor): 1 x C x H x W image features.
points (
torch.
Tensor): Nx3 point cloud in LiDAR coordinates.
proj_mat (
torch.
Tensor): 4x4 transformation matrix.
img_features (Tensor): 1 x C x H x W image features.
points (Tensor): Nx3 point cloud in LiDAR coordinates.
proj_mat (Tensor): 4x4 transformation matrix.
coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'.
img_scale_factor (
torch.
Tensor): Scale factor with shape of
img_scale_factor (Tensor): Scale factor with shape of
(w_scale, h_scale).
img_crop_offset (
torch.
Tensor): Crop offset used to crop
image during
data augmentation with shape of (w_offset, h_offset).
img_crop_offset (Tensor): Crop offset used to crop
image during
data augmentation with shape of (w_offset, h_offset).
img_flip (bool): Whether the image is flipped.
img_pad_shape (
t
uple[int]):
i
nt tuple indicates the h & w after
padding
, t
his is necessary to obtain features in feature map.
img_shape (
t
uple[int]):
i
nt tuple indicates the h & w before padding
after scaling
, t
his is necessary for flipping coordinates.
aligned (bool): Whether use bilinear interpolation when
img_pad_shape (
T
uple[int]):
I
nt tuple indicates the h & w after
padding
. T
his is necessary to obtain features in feature map.
img_shape (
T
uple[int]):
I
nt tuple indicates the h & w before padding
after scaling
. T
his is necessary for flipping coordinates.
aligned (bool): Whether
to
use bilinear interpolation when
sampling image features for each point. Defaults to True.
padding_mode (str): Padding mode when padding values for
features of out-of-image points. Defaults to 'zeros'.
align_corners (bool): Whether to align corners when
sampling image features for each point. Defaults to True.
valid_flag (bool): Whether to filter out the points that
outside the image and with depth smaller than 0. Defaults to
False.
valid_flag (bool): Whether to filter out the points that outside
the image and with depth smaller than 0. Defaults to False.
Returns:
torch.
Tensor: NxC image features sampled by point coordinates.
Tensor: NxC image features sampled by point coordinates.
"""
# apply transformation based on info in img_meta
...
...
@@ -114,55 +117,55 @@ class PointFusion(BaseModule):
"""Fuse image features from multi-scale features.
Args:
img_channels (
l
ist[int]
|
int): Channels of image features.
img_channels (
L
ist[int]
or
int): Channels of image features.
It could be a list if the input is multi-scale image features.
pts_channels (int): Channels of point features
mid_channels (int): Channels of middle layers
out_channels (int): Channels of output fused features
img_levels (int, optional): Number of image levels. Defaults to 3.
coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'.
Defaults to 'LIDAR'.
conv_cfg (dict, optional): Dict config of conv layers of middle
layers. Defaults to None.
norm_cfg (dict, optional): Dict config of norm layers of middle
layers. Defaults to None.
act_cfg (dict, optional): Dict config of activatation layers.
img_levels (List[int] or int): Number of image levels. Defaults to 3.
coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'. Defaults to 'LIDAR'.
conv_cfg (:obj:`ConfigDict` or dict): Config dict for convolution
layers of middle layers. Defaults to None.
norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
layers of middle layers. Defaults to None.
act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
Defaults to None.
activate_out (bool, optional): Whether to apply relu activation
to output features. Defaults to True.
fuse_out (bool, optional): Whether apply conv layer to the fused
features. Defaults to False.
dropout_ratio (int, float, optional): Dropout ratio of image
features to prevent overfitting. Defaults to 0.
aligned (bool, optional): Whether apply aligned feature fusion.
init_cfg (:obj:`ConfigDict` or dict or List[:obj:`Contigdict` or dict],
optional): Initialization config dict. Defaults to None.
activate_out (bool): Whether to apply relu activation to output
features. Defaults to True.
fuse_out (bool): Whether to apply conv layer to the fused features.
Defaults to False.
dropout_ratio (int or float): Dropout ratio of image features to
prevent overfitting. Defaults to 0.
aligned (bool): Whether to apply aligned feature fusion.
Defaults to True.
align_corners (bool): Whether to align corner when sampling features
according to points. Defaults to True.
padding_mode (str): Mode used to pad the features of points that do not
have corresponding image features. Defaults to 'zeros'.
lateral_conv (bool): Whether to apply lateral convs to image features.
Defaults to True.
align_corners (bool, optional): Whether to align corner when
sampling features according to points. Defaults to True.
padding_mode (str, optional): Mode used to pad the features of
points that do not have corresponding image features.
Defaults to 'zeros'.
lateral_conv (bool, optional): Whether to apply lateral convs
to image features. Defaults to True.
"""
def
__init__
(
self
,
img_channels
,
pts_channels
,
mid_channels
,
out_channels
,
img_levels
=
3
,
coord_type
=
'LIDAR'
,
conv_cfg
=
None
,
norm_cfg
=
None
,
act_cfg
=
None
,
init_cfg
=
None
,
activate_out
=
True
,
fuse_out
=
False
,
dropout_ratio
=
0
,
aligned
=
True
,
align_corners
=
True
,
padding_mode
=
'zeros'
,
lateral_conv
=
True
)
:
img_channels
:
Union
[
List
[
int
],
int
]
,
pts_channels
:
int
,
mid_channels
:
int
,
out_channels
:
int
,
img_levels
:
Union
[
List
[
int
],
int
]
=
3
,
coord_type
:
str
=
'LIDAR'
,
conv_cfg
:
OptConfigType
=
None
,
norm_cfg
:
OptConfigType
=
None
,
act_cfg
:
OptConfigType
=
None
,
init_cfg
:
OptMultiConfig
=
None
,
activate_out
:
bool
=
True
,
fuse_out
:
bool
=
False
,
dropout_ratio
:
Union
[
int
,
float
]
=
0
,
aligned
:
bool
=
True
,
align_corners
:
bool
=
True
,
padding_mode
:
str
=
'zeros'
,
lateral_conv
:
bool
=
True
)
->
None
:
super
(
PointFusion
,
self
).
__init__
(
init_cfg
=
init_cfg
)
if
isinstance
(
img_levels
,
int
):
img_levels
=
[
img_levels
]
...
...
@@ -225,18 +228,19 @@ class PointFusion(BaseModule):
dict
(
type
=
'Xavier'
,
layer
=
'Linear'
,
distribution
=
'uniform'
)
]
def
forward
(
self
,
img_feats
,
pts
,
pts_feats
,
img_metas
):
def
forward
(
self
,
img_feats
:
List
[
Tensor
],
pts
:
List
[
Tensor
],
pts_feats
:
Tensor
,
img_metas
:
List
[
dict
])
->
Tensor
:
"""Forward function.
Args:
img_feats (
l
ist[
torch.
Tensor]): Image features.
pts:
[l
ist[
torch.
Tensor]
]
: A batch of points with shape N x 3.
pts_feats (
torch.
Tensor): A tensor consist of point features of the
img_feats (
L
ist[Tensor]): Image features.
pts:
(L
ist[Tensor]
)
: A batch of points with shape N x 3.
pts_feats (Tensor): A tensor consist of point features of the
total batch.
img_metas (
l
ist[dict]): Meta information of images.
img_metas (
L
ist[dict]): Meta information of images.
Returns:
torch.
Tensor: Fused features of each point.
Tensor: Fused features of each point.
"""
img_pts
=
self
.
obtain_mlvl_feats
(
img_feats
,
pts
,
img_metas
)
img_pre_fuse
=
self
.
img_transform
(
img_pts
)
...
...
@@ -252,17 +256,18 @@ class PointFusion(BaseModule):
return
fuse_out
def
obtain_mlvl_feats
(
self
,
img_feats
,
pts
,
img_metas
):
def
obtain_mlvl_feats
(
self
,
img_feats
:
List
[
Tensor
],
pts
:
List
[
Tensor
],
img_metas
:
List
[
dict
])
->
Tensor
:
"""Obtain multi-level features for each point.
Args:
img_feats (
l
ist
(torch.
Tensor
)
): Multi-scale image features produced
img_feats (
L
ist
[
Tensor
]
): Multi-scale image features produced
by image backbone in shape (N, C, H, W).
pts (
l
ist[
torch.
Tensor]): Points of each sample.
img_metas (
l
ist[dict]): Meta information for each sample.
pts (
L
ist[Tensor]): Points of each sample.
img_metas (
L
ist[dict]): Meta information for each sample.
Returns:
torch.
Tensor: Corresponding image features of each point.
Tensor: Corresponding image features of each point.
"""
if
self
.
lateral_convs
is
not
None
:
img_ins
=
[
...
...
@@ -285,17 +290,17 @@ class PointFusion(BaseModule):
img_pts
=
torch
.
cat
(
img_feats_per_point
,
dim
=
0
)
return
img_pts
def
sample_single
(
self
,
img_feats
,
pts
,
img_meta
):
def
sample_single
(
self
,
img_feats
:
Tensor
,
pts
:
Tensor
,
img_meta
:
dict
)
->
Tensor
:
"""Sample features from single level image feature map.
Args:
img_feats (torch.Tensor): Image feature map in shape
(1, C, H, W).
pts (torch.Tensor): Points of a single sample.
img_feats (Tensor): Image feature map in shape (1, C, H, W).
pts (Tensor): Points of a single sample.
img_meta (dict): Meta information of the single sample.
Returns:
torch.
Tensor: Single level image features of each point.
Tensor: Single level image features of each point.
"""
# TODO: image transformation also extracted
img_scale_factor
=
(
...
...
@@ -324,49 +329,47 @@ class PointFusion(BaseModule):
return
img_pts
def
voxel_sample
(
voxel_features
,
voxel_range
,
voxel_size
,
depth_samples
,
proj_mat
,
downsample_factor
,
img_scale_factor
,
img_crop_offset
,
img_flip
,
img_pad_shape
,
img_shape
,
aligned
=
True
,
padding_mode
=
'zeros'
,
align_corners
=
True
)
:
def
voxel_sample
(
voxel_features
:
Tensor
,
voxel_range
:
List
[
float
]
,
voxel_size
:
List
[
float
]
,
depth_samples
:
Tensor
,
proj_mat
:
Tensor
,
downsample_factor
:
int
,
img_scale_factor
:
Tensor
,
img_crop_offset
:
Tensor
,
img_flip
:
bool
,
img_pad_shape
:
Tuple
[
int
]
,
img_shape
:
Tuple
[
int
]
,
aligned
:
bool
=
True
,
padding_mode
:
str
=
'zeros'
,
align_corners
:
bool
=
True
)
->
Tensor
:
"""Obtain image features using points.
Args:
voxel_features (torch.Tensor): 1 x C x Nx x Ny x Nz voxel features.
voxel_range (list): The range of voxel features.
voxel_size (:obj:`ConfigDict` or dict): The voxel size of voxel
features.
depth_samples (torch.Tensor): N depth samples in LiDAR coordinates.
proj_mat (torch.Tensor): ORIGINAL LiDAR2img projection matrix
for N views.
voxel_features (Tensor): 1 x C x Nx x Ny x Nz voxel features.
voxel_range (List[float]): The range of voxel features.
voxel_size (List[float]): The voxel size of voxel features.
depth_samples (Tensor): N depth samples in LiDAR coordinates.
proj_mat (Tensor): ORIGINAL LiDAR2img projection matrix for N views.
downsample_factor (int): The downsample factor in rescaling.
img_scale_factor (
tuple[torch.
Tensor
]
): Scale factor with shape of
img_scale_factor (Tensor): Scale factor with shape of
(w_scale, h_scale).
img_crop_offset (
tuple[torch.
Tensor
]
): Crop offset used to crop
image during
data augmentation with shape of (w_offset, h_offset).
img_crop_offset (Tensor): Crop offset used to crop
image during
data augmentation with shape of (w_offset, h_offset).
img_flip (bool): Whether the image is flipped.
img_pad_shape (
t
uple[int]):
i
nt tuple indicates the h & w after
padding
, t
his is necessary to obtain features in feature map.
img_shape (
t
uple[int]):
i
nt tuple indicates the h & w before padding
after scaling
, t
his is necessary for flipping coordinates.
aligned (bool
, optional
): Whether use bilinear interpolation when
img_pad_shape (
T
uple[int]):
I
nt tuple indicates the h & w after
padding
. T
his is necessary to obtain features in feature map.
img_shape (
T
uple[int]):
I
nt tuple indicates the h & w before padding
after scaling
. T
his is necessary for flipping coordinates.
aligned (bool): Whether
to
use bilinear interpolation when
sampling image features for each point. Defaults to True.
padding_mode (str
, optional
): Padding mode when padding values for
padding_mode (str): Padding mode when padding values for
features of out-of-image points. Defaults to 'zeros'.
align_corners (bool
, optional
): Whether to align corners when
align_corners (bool): Whether to align corners when
sampling image features for each point. Defaults to True.
Returns:
torch.
Tensor: 1xCxDxHxW frustum features sampled from voxel features.
Tensor: 1xCxDxHxW frustum features sampled from voxel features.
"""
# construct frustum grid
device
=
voxel_features
.
device
...
...
mmdet3d/models/layers/fusion_layers/vote_fusion.py
View file @
d7067e44
# Copyright (c) OpenMMLab. All rights reserved.
from
typing
import
List
,
Tuple
import
torch
from
torch
import
Tensor
from
torch
import
nn
as
nn
from
mmdet3d.registry
import
MODELS
...
...
@@ -14,27 +17,33 @@ class VoteFusion(nn.Module):
"""Fuse 2d features from 3d seeds.
Args:
num_classes (int):
n
umber of classes.
max_imvote_per_pixel (int):
m
ax number of imvotes.
num_classes (int):
N
umber of classes.
max_imvote_per_pixel (int):
M
ax number of imvotes.
"""
def
__init__
(
self
,
num_classes
=
10
,
max_imvote_per_pixel
=
3
):
def __init__(self,
             num_classes: int = 10,
             max_imvote_per_pixel: int = 3) -> None:
    """Initialize the 2D/3D vote fusion module.

    Args:
        num_classes (int): Number of classes. Defaults to 10.
        max_imvote_per_pixel (int): Max number of imvotes per pixel.
            Defaults to 3.
    """
    super(VoteFusion, self).__init__()
    self.max_imvote_per_pixel = max_imvote_per_pixel
    self.num_classes = num_classes
def
forward
(
self
,
imgs
,
bboxes_2d_rescaled
,
seeds_3d_depth
,
img_metas
):
def
forward
(
self
,
imgs
:
List
[
Tensor
],
bboxes_2d_rescaled
:
List
[
Tensor
],
seeds_3d_depth
:
List
[
Tensor
],
img_metas
:
List
[
dict
])
->
Tuple
[
Tensor
]:
"""Forward function.
Args:
imgs (
l
ist[
torch.
Tensor]): Image features.
bboxes_2d_rescaled (
l
ist[
torch.
Tensor]): 2D bboxes.
seeds_3d_depth (
torch.
Tensor): 3D seeds.
img_metas (
l
ist[dict]): Meta information of images.
imgs (
L
ist[Tensor]): Image features.
bboxes_2d_rescaled (
L
ist[Tensor]): 2D bboxes.
seeds_3d_depth (
List[
Tensor
]
): 3D seeds.
img_metas (
L
ist[dict]): Meta information of images.
Returns:
torch.Tensor: Concatenated cues of each point.
torch.Tensor: Validity mask of each feature.
Tuple[Tensor]:
- img_features: Concatenated cues of each point.
- masks: Validity mask of each feature.
"""
img_features
=
[]
masks
=
[]
...
...
mmdet3d/models/layers/mlp.py
View file @
d7067e44
# Copyright (c) OpenMMLab. All rights reserved.
from
typing
import
Tuple
from
mmcv.cnn
import
ConvModule
from
mmengine.model
import
BaseModule
from
torch
import
Tensor
from
torch
import
nn
as
nn
from
mmdet3d.utils
import
ConfigType
,
OptMultiConfig
class
MLP
(
BaseModule
):
"""A simple MLP module.
...
...
@@ -10,26 +15,28 @@ class MLP(BaseModule):
Pass features (B, C, N) through an MLP.
Args:
in_channels (int, optional): Number of channels of input features.
Default: 18.
conv_channels (tuple[int], optional): Out channels of the convolution.
Default: (256, 256).
conv_cfg (dict, optional): Config of convolution.
Default: dict(type='Conv1d').
norm_cfg (dict, optional): Config of normalization.
Default: dict(type='BN1d').
act_cfg (dict, optional): Config of activation.
Default: dict(type='ReLU').
in_channels (int): Number of channels of input features.
Defaults to 18.
conv_channels (Tuple[int]): Out channels of the convolution.
Defaults to (256, 256).
conv_cfg (:obj:`ConfigDict` or dict): Config dict for convolution
layer. Defaults to dict(type='Conv1d').
norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
layer. Defaults to dict(type='BN1d').
act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
Defaults to dict(type='ReLU').
init_cfg (:obj:`ConfigDict` or dict or List[:obj:`ConfigDict` or dict],
optional): Initialization config dict. Defaults to None.
"""
def
__init__
(
self
,
in_channel
=
18
,
conv_channels
=
(
256
,
256
),
conv_cfg
=
dict
(
type
=
'Conv1d'
),
norm_cfg
=
dict
(
type
=
'BN1d'
),
act_cfg
=
dict
(
type
=
'ReLU'
),
init_cfg
=
None
)
:
super
().
__init__
(
init_cfg
=
init_cfg
)
in_channel
:
int
=
18
,
conv_channels
:
Tuple
[
int
]
=
(
256
,
256
),
conv_cfg
:
ConfigType
=
dict
(
type
=
'Conv1d'
),
norm_cfg
:
ConfigType
=
dict
(
type
=
'BN1d'
),
act_cfg
:
ConfigType
=
dict
(
type
=
'ReLU'
),
init_cfg
:
OptMultiConfig
=
None
)
->
None
:
super
(
MLP
,
self
).
__init__
(
init_cfg
=
init_cfg
)
self
.
mlp
=
nn
.
Sequential
()
prev_channels
=
in_channel
for
i
,
conv_channel
in
enumerate
(
conv_channels
):
...
...
@@ -47,5 +54,5 @@ class MLP(BaseModule):
inplace
=
True
))
prev_channels
=
conv_channels
[
i
]
def
forward
(
self
,
img_features
)
:
def forward(self, img_features: Tensor) -> Tensor:
    """Run the input features through the stacked MLP layers.

    Args:
        img_features (Tensor): (B, C, N) input features.

    Returns:
        Tensor: Features produced by ``self.mlp``.
    """
    out = self.mlp(img_features)
    return out
mmdet3d/models/layers/norm.py
View file @
d7067e44
# Copyright (c) OpenMMLab. All rights reserved.
import
torch
from
mmengine.registry
import
MODELS
from
torch
import
Tensor
from
torch
import
distributed
as
dist
from
torch
import
nn
as
nn
from
torch.autograd.function
import
Function
...
...
@@ -9,7 +10,7 @@ from torch.autograd.function import Function
class
AllReduce
(
Function
):
@
staticmethod
def
forward
(
ctx
,
input
)
:
def
forward
(
ctx
,
input
:
Tensor
)
->
Tensor
:
input_list
=
[
torch
.
zeros_like
(
input
)
for
k
in
range
(
dist
.
get_world_size
())
]
...
...
@@ -19,7 +20,7 @@ class AllReduce(Function):
return
torch
.
sum
(
inputs
,
dim
=
0
)
@
staticmethod
def
backward
(
ctx
,
grad_output
)
:
def backward(ctx, grad_output: Tensor) -> Tensor:
    """Backward pass: all-reduce the gradient across all processes.

    ``dist.all_reduce`` defaults to a sum reduction, matching the sum
    performed over the gathered inputs in ``forward``.

    Args:
        ctx: Autograd context (unused here).
        grad_output (Tensor): Gradient w.r.t. the op's output.

    Returns:
        Tensor: The reduced gradient, identical on every process.
    """
    dist.all_reduce(grad_output, async_op=False)
    return grad_output
...
...
@@ -43,20 +44,18 @@ class NaiveSyncBatchNorm1d(nn.BatchNorm1d):
It is slower than `nn.SyncBatchNorm`.
"""
def
__init__
(
self
,
*
args
,
**
kwargs
):
super
().
__init__
(
*
args
,
**
kwargs
)
self
.
fp16_enabled
=
False
def __init__(self, *args, **kwargs) -> None:
    """Initialize as a plain ``nn.BatchNorm1d``.

    Note: ``*args``/``**kwargs`` are intentionally left un-annotated.
    Annotating them as ``list``/``dict`` (as before) is incorrect —
    per PEP 484 such annotations type each individual argument, not
    the packed tuple/dict.
    """
    super(NaiveSyncBatchNorm1d, self).__init__(*args, **kwargs)
def
forward
(
self
,
input
)
:
def
forward
(
self
,
input
:
Tensor
)
->
Tensor
:
"""
Args:
input (
t
ensor): Has shape (N, C) or (N, C, L), where N is
input (
T
ensor): Has shape (N, C) or (N, C, L), where N is
the batch size, C is the number of features or
channels, and L is the sequence length
Returns:
tensor: Has shape (N, C) or (N, C, L), has same shape
as input.
Tensor: Has shape (N, C) or (N, C, L), same shape as input.
"""
assert
input
.
dtype
==
torch
.
float32
,
\
f
'input should be in float32 type, got
{
input
.
dtype
}
'
...
...
@@ -112,17 +111,16 @@ class NaiveSyncBatchNorm2d(nn.BatchNorm2d):
It is slower than `nn.SyncBatchNorm`.
"""
def
__init__
(
self
,
*
args
,
**
kwargs
):
super
().
__init__
(
*
args
,
**
kwargs
)
self
.
fp16_enabled
=
False
def __init__(self, *args, **kwargs) -> None:
    """Initialize as a plain ``nn.BatchNorm2d``.

    Note: ``*args``/``**kwargs`` are intentionally left un-annotated.
    Annotating them as ``list``/``dict`` (as before) is incorrect —
    per PEP 484 such annotations type each individual argument, not
    the packed tuple/dict.
    """
    super(NaiveSyncBatchNorm2d, self).__init__(*args, **kwargs)
def
forward
(
self
,
input
)
:
def
forward
(
self
,
input
:
Tensor
)
->
Tensor
:
"""
Args:
Input (
t
ensor): Feature has shape (N, C, H, W).
Input (
T
ensor): Feature has shape (N, C, H, W).
Returns:
t
ensor: Has shape (N, C, H, W), same shape as input.
T
ensor: Has shape (N, C, H, W), same shape as input.
"""
assert
input
.
dtype
==
torch
.
float32
,
\
f
'input should be in float32 type, got
{
input
.
dtype
}
'
...
...
mmdet3d/models/layers/paconv/paconv.py
View file @
d7067e44
# Copyright (c) OpenMMLab. All rights reserved.
import
copy
from
typing
import
List
,
Tuple
,
Union
import
torch
from
mmcv.cnn
import
ConvModule
,
build_activation_layer
,
build_norm_layer
from
mmcv.ops
import
assign_score_withk
as
assign_score_cuda
from
mmengine.model
import
constant_init
from
torch
import
Tensor
from
torch
import
nn
as
nn
from
torch.nn
import
functional
as
F
from
mmdet3d.utils
import
ConfigType
from
.utils
import
assign_kernel_withoutk
,
assign_score
,
calc_euclidian_dist
...
...
@@ -17,33 +20,33 @@ class ScoreNet(nn.Module):
Args:
mlp_channels (List[int]): Hidden unit sizes of SharedMLP layers.
last_bn (bool
, optional
): Whether to use BN on the last output of mlps.
last_bn (bool): Whether to use BN on the last output of mlps.
Defaults to False.
score_norm (str
, optional
): Normalization function of output scores.
score_norm (str): Normalization function of output scores.
Can be 'softmax', 'sigmoid' or 'identity'. Defaults to 'softmax'.
temp_factor (float
, optional
): Temperature factor to scale the output
temp_factor (float): Temperature factor to scale the output
scores before softmax. Defaults to 1.0.
norm_cfg (
d
ict
,
o
ptional): Type of
normalization
method.
Defaults to dict(type='BN2d').
bias (bool
|
str
, optional
): If specified as `auto`, it will be decided
by the
norm_cfg.
B
ias will be set as True if `norm_cfg` is None,
norm_cfg (
:obj:`ConfigD
ict
`
o
r dict): Config dict for
normalization
layer.
Defaults to dict(type='BN2d').
bias (bool
or
str): If specified as `auto`, it will be decided
by
`
norm_cfg
`
.
`b
ias
`
will be set as True if `norm_cfg` is None,
otherwise False. Defaults to 'auto'.
Note:
The official code applies xavier_init to all Conv layers in ScoreNet,
see `PAConv <https://github.com/CVMI-Lab/PAConv/blob/main/scene_seg
/model/pointnet2/paconv.py#L105>`_. However in our experiments, we
did not find much difference in applying such xavier initialization
or not. So we neglect this initialization in our implementation.
see `PAConv <https://github.com/CVMI-Lab/PAConv/blob/main/scene_seg
/model/pointnet2/paconv.py#L105>`_. However in our experiments, we
did not find much difference in applying such xavier initialization
or not. So we neglect this initialization in our implementation.
"""
def
__init__
(
self
,
mlp_channels
,
last_bn
=
False
,
score_norm
=
'softmax'
,
temp_factor
=
1.0
,
norm_cfg
=
dict
(
type
=
'BN2d'
),
bias
=
'auto'
)
:
mlp_channels
:
List
[
int
]
,
last_bn
:
bool
=
False
,
score_norm
:
str
=
'softmax'
,
temp_factor
:
float
=
1.0
,
norm_cfg
:
ConfigType
=
dict
(
type
=
'BN2d'
),
bias
:
Union
[
bool
,
str
]
=
'auto'
)
->
None
:
super
(
ScoreNet
,
self
).
__init__
()
assert
score_norm
in
[
'softmax'
,
'sigmoid'
,
'identity'
],
\
...
...
@@ -79,16 +82,16 @@ class ScoreNet(nn.Module):
act_cfg
=
None
,
bias
=
bias
))
def
forward
(
self
,
xyz_features
)
:
def
forward
(
self
,
xyz_features
:
Tensor
)
->
Tensor
:
"""Forward.
Args:
xyz_features (
torch.
Tensor): (B, C, N, K)
, f
eatures constructed
from xyz
coordinates of point pairs. May contain relative
positions,
Euclidean distance, etc.
xyz_features (Tensor): (B, C, N, K)
F
eatures constructed
from xyz
coordinates of point pairs. May contain relative
positions,
Euclidean distance, etc.
Returns:
torch.
Tensor: (B, N, K, M)
, p
redicted scores for `M` kernels.
Tensor: (B, N, K, M)
P
redicted scores for `M` kernels.
"""
scores
=
self
.
mlps
(
xyz_features
)
# (B, M, N, K)
...
...
@@ -116,43 +119,49 @@ class PAConv(nn.Module):
in_channels (int): Input channels of point features.
out_channels (int): Output channels of point features.
num_kernels (int): Number of kernel weights in the weight bank.
norm_cfg (
d
ict
,
o
ptional): Type of
normalization
method.
Defaults to dict(type='BN2d', momentum=0.1).
act_cfg (
d
ict
,
o
ptional): Type of
activation
method
.
norm_cfg (
:obj:`ConfigD
ict
`
o
r dict): Config dict for
normalization
layer.
Defaults to dict(type='BN2d', momentum=0.1).
act_cfg (
:obj:`ConfigD
ict
`
o
r dict): Config dict for
activation
layer
.
Defaults to dict(type='ReLU', inplace=True).
scorenet_input (str
, optional
): Type of input to ScoreNet.
scorenet_input (str): Type of input to ScoreNet.
Can be 'identity', 'w_neighbor' or 'w_neighbor_dist'.
Defaults to 'w_neighbor_dist'.
weight_bank_init (str
, optional
): Init method of weight bank kernels.
weight_bank_init (str): Init method of weight bank kernels.
Can be 'kaiming' or 'xavier'. Defaults to 'kaiming'.
kernel_input (str
, optional
): Input features to be multiplied with
kernel
weights. Can be 'identity' or 'w_neighbor'.
kernel_input (str): Input features to be multiplied with
kernel
weights. Can be 'identity' or 'w_neighbor'.
Defaults to 'w_neighbor'.
scorenet_cfg (dict
, optional
): Config of the ScoreNet module, which
may contain
the following keys and values:
scorenet_cfg (dict): Config of the ScoreNet module, which
may contain
the following keys and values:
- mlp_channels (List[int]): Hidden units of MLPs.
- score_norm (str): Normalization function of output scores.
Can be 'softmax', 'sigmoid' or 'identity'.
Can be 'softmax', 'sigmoid' or 'identity'.
- temp_factor (float): Temperature factor to scale the output
scores before softmax.
scores before softmax.
- last_bn (bool): Whether to use BN on the last output of mlps.
Defaults to dict(mlp_channels=[16, 16, 16],
score_norm='softmax',
temp_factor=1.0,
last_bn=False).
"""
def
__init__
(
self
,
in_channels
,
out_channels
,
num_kernels
,
norm_cfg
=
dict
(
type
=
'BN2d'
,
momentum
=
0.1
),
act_cfg
=
dict
(
type
=
'ReLU'
,
inplace
=
True
),
scorenet_input
=
'w_neighbor_dist'
,
weight_bank_init
=
'kaiming'
,
kernel_input
=
'w_neighbor'
,
scorenet_cfg
=
dict
(
mlp_channels
=
[
16
,
16
,
16
],
score_norm
=
'softmax'
,
temp_factor
=
1.0
,
last_bn
=
False
)):
def
__init__
(
self
,
in_channels
:
int
,
out_channels
:
int
,
num_kernels
:
int
,
norm_cfg
:
ConfigType
=
dict
(
type
=
'BN2d'
,
momentum
=
0.1
),
act_cfg
:
ConfigType
=
dict
(
type
=
'ReLU'
,
inplace
=
True
),
scorenet_input
:
str
=
'w_neighbor_dist'
,
weight_bank_init
:
str
=
'kaiming'
,
kernel_input
:
str
=
'w_neighbor'
,
scorenet_cfg
:
dict
=
dict
(
mlp_channels
=
[
16
,
16
,
16
],
score_norm
=
'softmax'
,
temp_factor
=
1.0
,
last_bn
=
False
)
)
->
None
:
super
(
PAConv
,
self
).
__init__
()
# determine weight kernel size according to used features
...
...
@@ -218,21 +227,20 @@ class PAConv(nn.Module):
self
.
init_weights
()
def
init_weights
(
self
):
def init_weights(self) -> None:
    """Initialize weights of shared MLP layers and BN layers."""
    if self.bn is None:
        return
    # Standard BN init: unit scale, zero shift.
    constant_init(self.bn, val=1, bias=0)
def
_prepare_scorenet_input
(
self
,
points_xyz
)
:
def
_prepare_scorenet_input
(
self
,
points_xyz
:
Tensor
)
->
Tensor
:
"""Prepare input point pairs features for self.ScoreNet.
Args:
points_xyz (
torch.
Tensor): (B, 3, npoint, K)
Coordinates of the
grouped points.
points_xyz (Tensor): (B, 3, npoint, K)
Coordinates of the
grouped points.
Returns:
torch.Tensor: (B, C, npoint, K)
The generated features per point pair.
Tensor: (B, C, npoint, K) The generated features per point pair.
"""
B
,
_
,
npoint
,
K
=
points_xyz
.
size
()
center_xyz
=
points_xyz
[...,
:
1
].
repeat
(
1
,
1
,
1
,
K
)
...
...
@@ -250,22 +258,22 @@ class PAConv(nn.Module):
dim
=
1
)
return
xyz_features
def
forward
(
self
,
inputs
)
:
def
forward
(
self
,
inputs
:
Tuple
[
Tensor
])
->
Tuple
[
Tensor
]
:
"""Forward.
Args:
inputs (
t
uple
(torch.
Tensor
)
):
inputs (
T
uple
[
Tensor
]
):
- features (
torch.
Tensor): (B, in_c, npoint, K)
Features of the queried points.
- points_xyz (
torch.
Tensor): (B, 3, npoint, K)
Coordinates of the grouped points.
- features (Tensor): (B, in_c, npoint, K)
Features of the queried points.
- points_xyz (Tensor): (B, 3, npoint, K)
Coordinates of the grouped points.
Returns:
Tuple[
torch.
Tensor]:
Tuple[Tensor]:
- new_features: (B, out_c, npoint, K)
, f
eatures after PAConv.
- points_xyz:
s
ame as input.
- new_features: (B, out_c, npoint, K)
F
eatures after PAConv.
- points_xyz:
S
ame as input.
"""
features
,
points_xyz
=
inputs
B
,
_
,
npoint
,
K
=
features
.
size
()
...
...
@@ -315,20 +323,22 @@ class PAConvCUDA(PAConv):
more detailed descriptions.
"""
def
__init__
(
self
,
in_channels
,
out_channels
,
num_kernels
,
norm_cfg
=
dict
(
type
=
'BN2d'
,
momentum
=
0.1
),
act_cfg
=
dict
(
type
=
'ReLU'
,
inplace
=
True
),
scorenet_input
=
'w_neighbor_dist'
,
weight_bank_init
=
'kaiming'
,
kernel_input
=
'w_neighbor'
,
scorenet_cfg
=
dict
(
mlp_channels
=
[
8
,
16
,
16
],
score_norm
=
'softmax'
,
temp_factor
=
1.0
,
last_bn
=
False
)):
def
__init__
(
self
,
in_channels
:
int
,
out_channels
:
int
,
num_kernels
:
int
,
norm_cfg
:
ConfigType
=
dict
(
type
=
'BN2d'
,
momentum
=
0.1
),
act_cfg
:
ConfigType
=
dict
(
type
=
'ReLU'
,
inplace
=
True
),
scorenet_input
:
str
=
'w_neighbor_dist'
,
weight_bank_init
:
str
=
'kaiming'
,
kernel_input
:
str
=
'w_neighbor'
,
scorenet_cfg
:
dict
=
dict
(
mlp_channels
=
[
8
,
16
,
16
],
score_norm
=
'softmax'
,
temp_factor
=
1.0
,
last_bn
=
False
)
)
->
None
:
super
(
PAConvCUDA
,
self
).
__init__
(
in_channels
=
in_channels
,
out_channels
=
out_channels
,
...
...
@@ -343,27 +353,27 @@ class PAConvCUDA(PAConv):
assert
self
.
kernel_input
==
'w_neighbor'
,
\
'CUDA implemented PAConv only supports w_neighbor kernel_input'
def
forward
(
self
,
inputs
)
:
def
forward
(
self
,
inputs
:
Tuple
[
Tensor
])
->
Tuple
[
Tensor
]
:
"""Forward.
Args:
inputs (
t
uple
(torch.
Tensor
)
):
inputs (
T
uple
[
Tensor
]
):
- features (
torch.
Tensor): (B, in_c, N)
Features of all points in the current point cloud.
Different from non-CUDA version PAConv, here the features
are not grouped by each center to form a K dim.
- points_xyz (
torch.
Tensor): (B, 3, npoint, K)
Coordinates of the grouped points.
- points_idx (
torch.
Tensor): (B, npoint, K)
Index of the grouped points.
- features (Tensor): (B, in_c, N)
Features of all points in the current point cloud.
Different from non-CUDA version PAConv, here the features
are not grouped by each center to form a K dim.
- points_xyz (Tensor): (B, 3, npoint, K)
Coordinates of the grouped points.
- points_idx (Tensor): (B, npoint, K)
Index of the grouped points.
Returns:
Tuple[
torch.
Tensor]:
Tuple[Tensor]:
- new_features: (B, out_c, npoint, K)
, f
eatures after PAConv.
- points_xyz:
s
ame as input.
- points_idx:
s
ame as input.
- new_features: (B, out_c, npoint, K)
F
eatures after PAConv.
- points_xyz:
S
ame as input.
- points_idx:
S
ame as input.
"""
features
,
points_xyz
,
points_idx
=
inputs
...
...
mmdet3d/models/layers/paconv/utils.py
View file @
d7067e44
# Copyright (c) OpenMMLab. All rights reserved.
from
typing
import
Tuple
import
torch
from
torch
import
Tensor
def
calc_euclidian_dist
(
xyz1
,
xyz2
)
:
def calc_euclidian_dist(xyz1: Tensor, xyz2: Tensor) -> Tensor:
    """Calculate the Euclidean distance between two sets of points.

    Args:
        xyz1 (Tensor): (N, 3) The first set of points.
        xyz2 (Tensor): (N, 3) The second set of points.

    Returns:
        Tensor: (N, ) The Euclidean distance between each point pair.
    """
    assert xyz1.shape[0] == xyz2.shape[0], 'number of points are not the same'
    assert xyz1.shape[1] == xyz2.shape[1] == 3, \
        'points coordinates dimension is not 3'
    # 2-norm of the per-point difference vectors.
    diff = xyz1 - xyz2
    return diff.norm(dim=-1)
def
assign_score
(
scores
,
point_features
)
:
def
assign_score
(
scores
:
Tensor
,
point_features
:
Tensor
)
->
Tensor
:
"""Perform weighted sum to aggregate output features according to scores.
This function is used in non-CUDA version of PAConv.
Compared to the cuda op assigh_score_withk, this pytorch implementation
pre-computes output features for the neighbors of all centers, and then
performs aggregation. It consumes more GPU memories.
pre-computes output features for the neighbors of all centers, and then
performs aggregation. It consumes more GPU memories.
Args:
scores (
torch.
Tensor): (B, npoint, K, M)
, p
redicted scores to
scores (Tensor): (B, npoint, K, M)
P
redicted scores to
aggregate weight matrices in the weight bank.
`npoint` is the number of sampled centers.
`K` is the number of queried neighbors.
`M` is the number of weight matrices in the weight bank.
point_features (
torch.
Tensor): (B, npoint, K, M, out_dim)
point_features (Tensor): (B, npoint, K, M, out_dim)
Pre-computed point features to be aggregated.
Returns:
torch.
Tensor: (B, npoint, K, out_dim)
, t
he aggregated features.
Tensor: (B, npoint, K, out_dim)
T
he aggregated features.
"""
B
,
npoint
,
K
,
M
=
scores
.
size
()
scores
=
scores
.
view
(
B
,
npoint
,
K
,
1
,
M
)
...
...
@@ -44,21 +47,22 @@ def assign_score(scores, point_features):
return
output
def
assign_kernel_withoutk
(
features
,
kernels
,
M
):
def
assign_kernel_withoutk
(
features
:
Tensor
,
kernels
:
Tensor
,
M
:
int
)
->
Tuple
[
Tensor
]:
"""Pre-compute features with weight matrices in weight bank. This function
is used before cuda op assign_score_withk in CUDA version PAConv.
Args:
features (
torch.
Tensor): (B, in_dim, N)
, i
nput features of all points.
features (Tensor): (B, in_dim, N)
I
nput features of all points.
`N` is the number of points in current point cloud.
kernels (
torch.
Tensor): (2 * in_dim, M * out_dim)
, w
eight matrices in
kernels (Tensor): (2 * in_dim, M * out_dim)
W
eight matrices in
the weight bank, transformed from (M, 2 * in_dim, out_dim).
`2 * in_dim` is because the input features are concatenation of
(point_features - center_features, point_features).
M (int): Number of weight matrices in the weight bank.
Returns:
Tuple[
torch.
Tensor]:
b
oth of shape (B, N, M, out_dim)
:
Tuple[Tensor]:
B
oth of shape (B, N, M, out_dim)
.
- point_features: Pre-computed features for points.
- center_features: Pre-computed features for centers.
...
...
mmdet3d/models/layers/pointnet_modules/__init__.py
View file @
d7067e44
...
...
@@ -4,9 +4,10 @@ from .paconv_sa_module import (PAConvCUDASAModule, PAConvCUDASAModuleMSG,
PAConvSAModule
,
PAConvSAModuleMSG
)
from
.point_fp_module
import
PointFPModule
from
.point_sa_module
import
PointSAModule
,
PointSAModuleMSG
from
.stack_point_sa_module
import
StackedSAModuleMSG
__all__
=
[
'build_sa_module'
,
'PointSAModuleMSG'
,
'PointSAModule'
,
'PointFPModule'
,
'PAConvSAModule'
,
'PAConvSAModuleMSG'
,
'PAConvCUDASAModule'
,
'PAConvCUDASAModuleMSG'
'PAConvCUDASAModuleMSG'
,
'StackedSAModuleMSG'
]
mmdet3d/models/layers/pointnet_modules/builder.py
View file @
d7067e44
# Copyright (c) OpenMMLab. All rights reserved.
from
typing
import
Union
from
mmengine.registry
import
Registry
from
torch
import
nn
as
nn
SA_MODULES
=
Registry
(
'point_sa_module'
)
def
build_sa_module
(
cfg
,
*
args
,
**
kwargs
):
def
build_sa_module
(
cfg
:
Union
[
dict
,
None
]
,
*
args
,
**
kwargs
)
->
nn
.
Module
:
"""Build PointNet2 set abstraction (SA) module.
Args:
cfg (None or dict): The SA module config, which should contain:
cfg (dict or None): The SA module config, which should contain:
- type (str): Module type.
- module args: Args needed to instantiate an SA module.
args (argument list): Arguments passed to the `__init__`
...
...
mmdet3d/models/layers/pointnet_modules/paconv_sa_module.py
View file @
d7067e44
# Copyright (c) OpenMMLab. All rights reserved.
from
typing
import
List
,
Optional
,
Tuple
,
Union
import
torch
from
torch
import
Tensor
from
torch
import
nn
as
nn
from
mmdet3d.models.layers.paconv
import
PAConv
,
PAConvCUDA
from
mmdet3d.utils
import
ConfigType
from
.builder
import
SA_MODULES
from
.point_sa_module
import
BasePointSAModule
...
...
@@ -16,52 +20,81 @@ class PAConvSAModuleMSG(BasePointSAModule):
See the `paper <https://arxiv.org/abs/2103.14635>`_ for more details.
Args:
paconv_num_kernels (list[list[int]]): Number of kernel weights in the
num_point (int): Number of points.
radii (List[float]): List of radius in each ball query.
sample_nums (List[int]): Number of samples in each ball query.
mlp_channels (List[List[int]]): Specify of the pointnet before
the global pooling for each scale.
paconv_num_kernels (List[List[int]]): Number of kernel weights in the
weight banks of each layer's PAConv.
paconv_kernel_input (str, optional): Input features to be multiplied
fps_mod (List[str]): Type of FPS method, valid mod
['F-FPS', 'D-FPS', 'FS']. Defaults to ['D-FPS'].
- F-FPS: Using feature distances for FPS.
- D-FPS: Using Euclidean distances of points for FPS.
- FS: Using F-FPS and D-FPS simultaneously.
fps_sample_range_list (List[int]): Range of points to apply FPS.
Defaults to [-1].
dilated_group (bool): Whether to use dilated ball query.
Defaults to False.
norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
layer. Defaults to dict(type='BN2d', momentum=0.1).
use_xyz (bool): Whether to use xyz. Defaults to True.
pool_mod (str): Type of pooling method. Defaults to 'max'.
normalize_xyz (bool): Whether to normalize local XYZ with radius.
Defaults to False.
bias (bool or str): If specified as `auto`, it will be decided by
`norm_cfg`. `bias` will be set as True if `norm_cfg` is None,
otherwise False. Defaults to 'auto'.
paconv_kernel_input (str): Input features to be multiplied
with kernel weights. Can be 'identity' or 'w_neighbor'.
Defaults to 'w_neighbor'.
scorenet_input (str
, optional
): Type of the input to ScoreNet.
scorenet_input (str): Type of the input to ScoreNet.
Defaults to 'w_neighbor_dist'. Can be the following values:
- 'identity': Use xyz coordinates as input.
- 'w_neighbor': Use xyz coordinates and the difference with center
points as input.
points as input.
- 'w_neighbor_dist': Use xyz coordinates, the difference with
center points and the Euclidean distance as input.
scorenet_cfg (dict, optional): Config of the ScoreNet module, which
center points and the Euclidean distance as input.
scorenet_cfg (dict): Config of the ScoreNet module, which
may contain the following keys and values:
- mlp_channels (List[int]): Hidden units of MLPs.
- score_norm (str): Normalization function of output scores.
Can be 'softmax', 'sigmoid' or 'identity'.
Can be 'softmax', 'sigmoid' or 'identity'.
- temp_factor (float): Temperature factor to scale the output
scores before softmax.
scores before softmax.
- last_bn (bool): Whether to use BN on the last output of mlps.
Defaults to dict(mlp_channels=[16, 16, 16],
score_norm='softmax',
temp_factor=1.0,
last_bn=False).
"""
def
__init__
(
self
,
num_point
,
radii
,
sample_nums
,
mlp_channels
,
paconv_num_kernels
,
fps_mod
=
[
'D-FPS'
],
fps_sample_range_list
=
[
-
1
],
dilated_group
=
False
,
norm_cfg
=
dict
(
type
=
'BN2d'
,
momentum
=
0.1
),
use_xyz
=
True
,
pool_mod
=
'max'
,
normalize_xyz
=
False
,
bias
=
'auto'
,
paconv_kernel_input
=
'w_neighbor'
,
scorenet_input
=
'w_neighbor_dist'
,
scorenet_cfg
=
dict
(
mlp_channels
=
[
16
,
16
,
16
],
score_norm
=
'softmax'
,
temp_factor
=
1.0
,
last_bn
=
False
)):
def
__init__
(
self
,
num_point
:
int
,
radii
:
List
[
float
],
sample_nums
:
List
[
int
],
mlp_channels
:
List
[
List
[
int
]],
paconv_num_kernels
:
List
[
List
[
int
]],
fps_mod
:
List
[
str
]
=
[
'D-FPS'
],
fps_sample_range_list
:
List
[
int
]
=
[
-
1
],
dilated_group
:
bool
=
False
,
norm_cfg
:
ConfigType
=
dict
(
type
=
'BN2d'
,
momentum
=
0.1
),
use_xyz
:
bool
=
True
,
pool_mod
:
str
=
'max'
,
normalize_xyz
:
bool
=
False
,
bias
:
Union
[
bool
,
str
]
=
'auto'
,
paconv_kernel_input
:
str
=
'w_neighbor'
,
scorenet_input
:
str
=
'w_neighbor_dist'
,
scorenet_cfg
:
dict
=
dict
(
mlp_channels
=
[
16
,
16
,
16
],
score_norm
=
'softmax'
,
temp_factor
=
1.0
,
last_bn
=
False
)
)
->
None
:
super
(
PAConvSAModuleMSG
,
self
).
__init__
(
num_point
=
num_point
,
radii
=
radii
,
...
...
@@ -114,25 +147,27 @@ class PAConvSAModule(PAConvSAModuleMSG):
<https://arxiv.org/abs/2103.14635>`_ for more details.
"""
def
__init__
(
self
,
mlp_channels
,
paconv_num_kernels
,
num_point
=
None
,
radius
=
None
,
num_sample
=
None
,
norm_cfg
=
dict
(
type
=
'BN2d'
,
momentum
=
0.1
),
use_xyz
=
True
,
pool_mod
=
'max'
,
fps_mod
=
[
'D-FPS'
],
fps_sample_range_list
=
[
-
1
],
normalize_xyz
=
False
,
paconv_kernel_input
=
'w_neighbor'
,
scorenet_input
=
'w_neighbor_dist'
,
scorenet_cfg
=
dict
(
mlp_channels
=
[
16
,
16
,
16
],
score_norm
=
'softmax'
,
temp_factor
=
1.0
,
last_bn
=
False
)):
def
__init__
(
self
,
mlp_channels
:
List
[
int
],
paconv_num_kernels
:
List
[
int
],
num_point
:
Optional
[
int
]
=
None
,
radius
:
Optional
[
float
]
=
None
,
num_sample
:
Optional
[
int
]
=
None
,
norm_cfg
:
ConfigType
=
dict
(
type
=
'BN2d'
,
momentum
=
0.1
),
use_xyz
:
bool
=
True
,
pool_mod
:
str
=
'max'
,
fps_mod
:
List
[
str
]
=
[
'D-FPS'
],
fps_sample_range_list
:
List
[
int
]
=
[
-
1
],
normalize_xyz
:
bool
=
False
,
paconv_kernel_input
:
str
=
'w_neighbor'
,
scorenet_input
:
str
=
'w_neighbor_dist'
,
scorenet_cfg
:
dict
=
dict
(
mlp_channels
=
[
16
,
16
,
16
],
score_norm
=
'softmax'
,
temp_factor
=
1.0
,
last_bn
=
False
)
)
->
None
:
super
(
PAConvSAModule
,
self
).
__init__
(
mlp_channels
=
[
mlp_channels
],
paconv_num_kernels
=
[
paconv_num_kernels
],
...
...
@@ -160,27 +195,29 @@ class PAConvCUDASAModuleMSG(BasePointSAModule):
for more details.
"""
def
__init__
(
self
,
num_point
,
radii
,
sample_nums
,
mlp_channels
,
paconv_num_kernels
,
fps_mod
=
[
'D-FPS'
],
fps_sample_range_list
=
[
-
1
],
dilated_group
=
False
,
norm_cfg
=
dict
(
type
=
'BN2d'
,
momentum
=
0.1
),
use_xyz
=
True
,
pool_mod
=
'max'
,
normalize_xyz
=
False
,
bias
=
'auto'
,
paconv_kernel_input
=
'w_neighbor'
,
scorenet_input
=
'w_neighbor_dist'
,
scorenet_cfg
=
dict
(
mlp_channels
=
[
8
,
16
,
16
],
score_norm
=
'softmax'
,
temp_factor
=
1.0
,
last_bn
=
False
)):
def
__init__
(
self
,
num_point
:
int
,
radii
:
List
[
float
],
sample_nums
:
List
[
int
],
mlp_channels
:
List
[
List
[
int
]],
paconv_num_kernels
:
List
[
List
[
int
]],
fps_mod
:
List
[
str
]
=
[
'D-FPS'
],
fps_sample_range_list
:
List
[
int
]
=
[
-
1
],
dilated_group
:
bool
=
False
,
norm_cfg
:
ConfigType
=
dict
(
type
=
'BN2d'
,
momentum
=
0.1
),
use_xyz
:
bool
=
True
,
pool_mod
:
str
=
'max'
,
normalize_xyz
:
bool
=
False
,
bias
:
Union
[
bool
,
str
]
=
'auto'
,
paconv_kernel_input
:
str
=
'w_neighbor'
,
scorenet_input
:
str
=
'w_neighbor_dist'
,
scorenet_cfg
:
dict
=
dict
(
mlp_channels
=
[
8
,
16
,
16
],
score_norm
=
'softmax'
,
temp_factor
=
1.0
,
last_bn
=
False
)
)
->
None
:
super
(
PAConvCUDASAModuleMSG
,
self
).
__init__
(
num_point
=
num_point
,
radii
=
radii
,
...
...
@@ -230,29 +267,31 @@ class PAConvCUDASAModuleMSG(BasePointSAModule):
def
forward
(
self
,
points_xyz
,
features
=
None
,
indices
=
None
,
target_xyz
=
None
,
):
"""
f
orward.
points_xyz
:
Tensor
,
features
:
Optional
[
Tensor
]
=
None
,
indices
:
Optional
[
Tensor
]
=
None
,
target_xyz
:
Optional
[
Tensor
]
=
None
,
)
->
Tuple
[
Tensor
]
:
"""
F
orward.
Args:
points_xyz (Tensor): (B, N, 3) xyz coordinates of the features.
features (Tensor, optional): (B, C, N) features of each point.
Default
:
None.
Default
s to
None.
indices (Tensor, optional): (B, num_point) Index of the features.
Default
:
None.
Default
s to
None.
target_xyz (Tensor, optional): (B, M, 3) new coords of the outputs.
Default
:
None.
Default
s to
None.
Returns:
Tensor: (B, M, 3) where M is the number of points.
New features xyz.
Tensor: (B, M, sum_k(mlps[k][-1])) where M is the number
of points. New feature descriptors.
Tensor: (B, M) where M is the number of points.
Index of the features.
Tuple[Tensor]:
- new_xyz: (B, M, 3) where M is the number of points.
New features xyz.
- new_features: (B, M, sum_k(mlps[k][-1])) where M is the
number of points. New feature descriptors.
- indices: (B, M) where M is the number of points.
Index of the features.
"""
new_features_list
=
[]
...
...
@@ -306,25 +345,27 @@ class PAConvCUDASAModule(PAConvCUDASAModuleMSG):
for more details.
"""
def
__init__
(
self
,
mlp_channels
,
paconv_num_kernels
,
num_point
=
None
,
radius
=
None
,
num_sample
=
None
,
norm_cfg
=
dict
(
type
=
'BN2d'
,
momentum
=
0.1
),
use_xyz
=
True
,
pool_mod
=
'max'
,
fps_mod
=
[
'D-FPS'
],
fps_sample_range_list
=
[
-
1
],
normalize_xyz
=
False
,
paconv_kernel_input
=
'w_neighbor'
,
scorenet_input
=
'w_neighbor_dist'
,
scorenet_cfg
=
dict
(
mlp_channels
=
[
8
,
16
,
16
],
score_norm
=
'softmax'
,
temp_factor
=
1.0
,
last_bn
=
False
)):
def
__init__
(
self
,
mlp_channels
:
List
[
int
],
paconv_num_kernels
:
List
[
int
],
num_point
:
Optional
[
int
]
=
None
,
radius
:
Optional
[
float
]
=
None
,
num_sample
:
Optional
[
int
]
=
None
,
norm_cfg
:
ConfigType
=
dict
(
type
=
'BN2d'
,
momentum
=
0.1
),
use_xyz
:
bool
=
True
,
pool_mod
:
str
=
'max'
,
fps_mod
:
List
[
str
]
=
[
'D-FPS'
],
fps_sample_range_list
:
List
[
int
]
=
[
-
1
],
normalize_xyz
:
bool
=
False
,
paconv_kernel_input
:
str
=
'w_neighbor'
,
scorenet_input
:
str
=
'w_neighbor_dist'
,
scorenet_cfg
:
dict
=
dict
(
mlp_channels
=
[
8
,
16
,
16
],
score_norm
=
'softmax'
,
temp_factor
=
1.0
,
last_bn
=
False
)
)
->
None
:
super
(
PAConvCUDASAModule
,
self
).
__init__
(
mlp_channels
=
[
mlp_channels
],
paconv_num_kernels
=
[
paconv_num_kernels
],
...
...
mmdet3d/models/layers/pointnet_modules/point_fp_module.py
View file @
d7067e44
...
...
@@ -5,8 +5,11 @@ import torch
from
mmcv.cnn
import
ConvModule
from
mmcv.ops
import
three_interpolate
,
three_nn
from
mmengine.model
import
BaseModule
from
torch
import
Tensor
from
torch
import
nn
as
nn
from
mmdet3d.utils
import
ConfigType
,
OptMultiConfig
class
PointFPModule
(
BaseModule
):
"""Point feature propagation module used in PointNets.
...
...
@@ -15,16 +18,17 @@ class PointFPModule(BaseModule):
Args:
mlp_channels (list[int]): List of mlp channels.
norm_cfg (dict, optional): Type of normalization method.
Default: dict(type='BN2d').
norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
layer. Defaults to dict(type='BN2d').
init_cfg (:obj:`ConfigDict` or dict or List[:obj:`ConfigDict` or dict],
optional): Initialization config dict. Defaults to None.
"""
def
__init__
(
self
,
mlp_channels
:
List
[
int
],
norm_cfg
:
dict
=
dict
(
type
=
'BN2d'
),
init_cfg
=
None
):
super
().
__init__
(
init_cfg
=
init_cfg
)
self
.
fp16_enabled
=
False
norm_cfg
:
ConfigType
=
dict
(
type
=
'BN2d'
),
init_cfg
:
OptMultiConfig
=
None
)
->
None
:
super
(
PointFPModule
,
self
).
__init__
(
init_cfg
=
init_cfg
)
self
.
mlps
=
nn
.
Sequential
()
for
i
in
range
(
len
(
mlp_channels
)
-
1
):
self
.
mlps
.
add_module
(
...
...
@@ -37,23 +41,22 @@ class PointFPModule(BaseModule):
conv_cfg
=
dict
(
type
=
'Conv2d'
),
norm_cfg
=
norm_cfg
))
def
forward
(
self
,
target
:
torch
.
Tensor
,
source
:
torch
.
Tensor
,
target_feats
:
torch
.
Tensor
,
source_feats
:
torch
.
Tensor
)
->
torch
.
Tensor
:
"""forward.
def
forward
(
self
,
target
:
Tensor
,
source
:
Tensor
,
target_feats
:
Tensor
,
source_feats
:
Tensor
)
->
Tensor
:
"""Forward.
Args:
target (Tensor): (B, n, 3)
t
ensor of the xyz positions of
target (Tensor): (B, n, 3)
T
ensor of the xyz positions of
the target features.
source (Tensor): (B, m, 3)
t
ensor of the xyz positions of
source (Tensor): (B, m, 3)
T
ensor of the xyz positions of
the source features.
target_feats (Tensor): (B, C1, n)
t
ensor of the features to be
target_feats (Tensor): (B, C1, n)
T
ensor of the features to be
propagated to.
source_feats (Tensor): (B, C2, m)
t
ensor of features
source_feats (Tensor): (B, C2, m)
T
ensor of features
to be propagated.
Return:
Tensor: (B, M, N) M = mlp[-1],
t
ensor of the target features.
Tensor: (B, M, N) M = mlp[-1],
T
ensor of the target features.
"""
if
source
is
not
None
:
dist
,
idx
=
three_nn
(
target
,
source
)
...
...
mmdet3d/models/layers/pointnet_modules/point_sa_module.py
View file @
d7067e44
# Copyright (c) OpenMMLab. All rights reserved.
from
typing
import
List
,
Optional
,
Tuple
,
Union
import
torch
from
mmcv.cnn
import
ConvModule
from
mmcv.ops
import
GroupAll
from
mmcv.ops
import
PointsSampler
as
Points_Sampler
from
mmcv.ops
import
QueryAndGroup
,
gather_points
from
torch
import
Tensor
from
torch
import
nn
as
nn
from
torch.nn
import
functional
as
F
from
mmdet3d.models.layers
import
PAConv
from
mmdet3d.utils
import
ConfigType
from
.builder
import
SA_MODULES
...
...
@@ -16,44 +20,43 @@ class BasePointSAModule(nn.Module):
Args:
num_point (int): Number of points.
radii (
l
ist[float]): List of radius in each ball query.
sample_nums (
l
ist[int]): Number of samples in each ball query.
mlp_channels (
l
ist[
l
ist[int]]): Specify of the pointnet before
radii (
L
ist[float]): List of radius in each ball query.
sample_nums (
L
ist[int]): Number of samples in each ball query.
mlp_channels (
L
ist[
L
ist[int]]): Specify of the pointnet before
the global pooling for each scale.
fps_mod (list[str], optional): Type of FPS method, valid mod
['F-FPS', 'D-FPS', 'FS'], Default: ['D-FPS'].
F-FPS: using feature distances for FPS.
D-FPS: using Euclidean distances of points for FPS.
FS: using F-FPS and D-FPS simultaneously.
fps_sample_range_list (list[int], optional):
Range of points to apply FPS. Default: [-1].
dilated_group (bool, optional): Whether to use dilated ball query.
Default: False.
use_xyz (bool, optional): Whether to use xyz.
Default: True.
pool_mod (str, optional): Type of pooling method.
Default: 'max_pool'.
normalize_xyz (bool, optional): Whether to normalize local XYZ
with radius. Default: False.
grouper_return_grouped_xyz (bool, optional): Whether to return
grouped xyz in `QueryAndGroup`. Defaults to False.
grouper_return_grouped_idx (bool, optional): Whether to return
grouped idx in `QueryAndGroup`. Defaults to False.
fps_mod (List[str]): Type of FPS method, valid mod
['F-FPS', 'D-FPS', 'FS']. Defaults to ['D-FPS'].
- F-FPS: using feature distances for FPS.
- D-FPS: using Euclidean distances of points for FPS.
- FS: using F-FPS and D-FPS simultaneously.
fps_sample_range_list (List[int]): Range of points to apply FPS.
Defaults to [-1].
dilated_group (bool): Whether to use dilated ball query.
Defaults to False.
use_xyz (bool): Whether to use xyz. Defaults to True.
pool_mod (str): Type of pooling method. Defaults to 'max'.
normalize_xyz (bool): Whether to normalize local XYZ with radius.
Defaults to False.
grouper_return_grouped_xyz (bool): Whether to return grouped xyz
in `QueryAndGroup`. Defaults to False.
grouper_return_grouped_idx (bool): Whether to return grouped idx
in `QueryAndGroup`. Defaults to False.
"""
def
__init__
(
self
,
num_point
,
radii
,
sample_nums
,
mlp_channels
,
fps_mod
=
[
'D-FPS'
],
fps_sample_range_list
=
[
-
1
],
dilated_group
=
False
,
use_xyz
=
True
,
pool_mod
=
'max'
,
normalize_xyz
=
False
,
grouper_return_grouped_xyz
=
False
,
grouper_return_grouped_idx
=
False
)
:
num_point
:
int
,
radii
:
List
[
float
]
,
sample_nums
:
List
[
int
]
,
mlp_channels
:
List
[
List
[
int
]]
,
fps_mod
:
List
[
str
]
=
[
'D-FPS'
],
fps_sample_range_list
:
List
[
int
]
=
[
-
1
],
dilated_group
:
bool
=
False
,
use_xyz
:
bool
=
True
,
pool_mod
:
str
=
'max'
,
normalize_xyz
:
bool
=
False
,
grouper_return_grouped_xyz
:
bool
=
False
,
grouper_return_grouped_idx
:
bool
=
False
)
->
None
:
super
(
BasePointSAModule
,
self
).
__init__
()
assert
len
(
radii
)
==
len
(
sample_nums
)
==
len
(
mlp_channels
)
...
...
@@ -109,7 +112,8 @@ class BasePointSAModule(nn.Module):
grouper
=
GroupAll
(
use_xyz
)
self
.
groupers
.
append
(
grouper
)
def
_sample_points
(
self
,
points_xyz
,
features
,
indices
,
target_xyz
):
def
_sample_points
(
self
,
points_xyz
:
Tensor
,
features
:
Tensor
,
indices
:
Tensor
,
target_xyz
:
Tensor
)
->
Tuple
[
Tensor
]:
"""Perform point sampling based on inputs.
If `indices` is specified, directly sample corresponding points.
...
...
@@ -118,13 +122,15 @@ class BasePointSAModule(nn.Module):
Args:
points_xyz (Tensor): (B, N, 3) xyz coordinates of the features.
features (Tensor): (B, C, N)
f
eatures of each point.
features (Tensor): (B, C, N)
F
eatures of each point.
indices (Tensor): (B, num_point) Index of the features.
target_xyz (Tensor): (B, M, 3) new_xyz coordinates of the outputs.
Returns:
Tensor: (B, num_point, 3) sampled xyz coordinates of points.
Tensor: (B, num_point) sampled points' index.
Tuple[Tensor]:
- new_xyz: (B, num_point, 3) Sampled xyz coordinates of points.
- indices: (B, num_point) Sampled points' index.
"""
xyz_flipped
=
points_xyz
.
transpose
(
1
,
2
).
contiguous
()
if
indices
is
not
None
:
...
...
@@ -143,16 +149,15 @@ class BasePointSAModule(nn.Module):
return
new_xyz
,
indices
def
_pool_features
(
self
,
features
)
:
def
_pool_features
(
self
,
features
:
Tensor
)
->
Tensor
:
"""Perform feature aggregation using pooling operation.
Args:
features (
torch.
Tensor): (B, C, N, K)
Features of locally grouped
points before pooling.
features (Tensor): (B, C, N, K)
Features of locally grouped
points before pooling.
Returns:
torch.Tensor: (B, C, N)
Pooled features aggregating local information.
Tensor: (B, C, N) Pooled features aggregating local information.
"""
if
self
.
pool_mod
==
'max'
:
# (B, C, N, 1)
...
...
@@ -169,29 +174,31 @@ class BasePointSAModule(nn.Module):
def
forward
(
self
,
points_xyz
,
features
=
None
,
indices
=
None
,
target_xyz
=
None
,
):
"""
f
orward.
points_xyz
:
Tensor
,
features
:
Optional
[
Tensor
]
=
None
,
indices
:
Optional
[
Tensor
]
=
None
,
target_xyz
:
Optional
[
Tensor
]
=
None
,
)
->
Tuple
[
Tensor
]
:
"""
F
orward.
Args:
points_xyz (Tensor): (B, N, 3) xyz coordinates of the features.
features (Tensor, optional): (B, C, N)
f
eatures of each point.
Default
:
None.
features (Tensor, optional): (B, C, N)
F
eatures of each point.
Default
s to
None.
indices (Tensor, optional): (B, num_point) Index of the features.
Default
:
None.
target_xyz (Tensor, optional): (B, M, 3)
n
ew coords of the outputs.
Default
:
None.
Default
s to
None.
target_xyz (Tensor, optional): (B, M, 3)
N
ew coords of the outputs.
Default
s to
None.
Returns:
Tensor: (B, M, 3) where M is the number of points.
New features xyz.
Tensor: (B, M, sum_k(mlps[k][-1])) where M is the number
of points. New feature descriptors.
Tensor: (B, M) where M is the number of points.
Index of the features.
Tuple[Tensor]:
- new_xyz: (B, M, 3) Where M is the number of points.
New features xyz.
- new_features: (B, M, sum_k(mlps[k][-1])) Where M is the
number of points. New feature descriptors.
- indices: (B, M) Where M is the number of points.
Index of the features.
"""
new_features_list
=
[]
...
...
@@ -229,45 +236,44 @@ class PointSAModuleMSG(BasePointSAModule):
Args:
num_point (int): Number of points.
radii (
l
ist[float]): List of radius in each ball query.
sample_nums (
l
ist[int]): Number of samples in each ball query.
mlp_channels (
l
ist[
l
ist[int]]): Specify of the pointnet before
radii (
L
ist[float]): List of radius in each ball query.
sample_nums (
L
ist[int]): Number of samples in each ball query.
mlp_channels (
L
ist[
L
ist[int]]): Specify of the pointnet before
the global pooling for each scale.
fps_mod (list[str], optional): Type of FPS method, valid mod
['F-FPS', 'D-FPS', 'FS'], Default: ['D-FPS'].
F-FPS: using feature distances for FPS.
D-FPS: using Euclidean distances of points for FPS.
FS: using F-FPS and D-FPS simultaneously.
fps_sample_range_list (list[int], optional): Range of points to
apply FPS. Default: [-1].
dilated_group (bool, optional): Whether to use dilated ball query.
Default: False.
norm_cfg (dict, optional): Type of normalization method.
Default: dict(type='BN2d').
use_xyz (bool, optional): Whether to use xyz.
Default: True.
pool_mod (str, optional): Type of pooling method.
Default: 'max_pool'.
normalize_xyz (bool, optional): Whether to normalize local XYZ
with radius. Default: False.
bias (bool | str, optional): If specified as `auto`, it will be
decided by `norm_cfg`. `bias` will be set as True if
`norm_cfg` is None, otherwise False. Default: 'auto'.
fps_mod (List[str]): Type of FPS method, valid mod
['F-FPS', 'D-FPS', 'FS']. Defaults to ['D-FPS'].
- F-FPS: using feature distances for FPS.
- D-FPS: using Euclidean distances of points for FPS.
- FS: using F-FPS and D-FPS simultaneously.
fps_sample_range_list (List[int]): Range of points to apply FPS.
Defaults to [-1].
dilated_group (bool): Whether to use dilated ball query.
Defaults to False.
norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
layer. Defaults to dict(type='BN2d').
use_xyz (bool): Whether to use xyz. Defaults to True.
pool_mod (str): Type of pooling method. Defaults to 'max'.
normalize_xyz (bool): Whether to normalize local XYZ with radius.
Defaults to False.
bias (bool or str): If specified as `auto`, it will be decided by
`norm_cfg`. `bias` will be set as True if `norm_cfg` is None,
otherwise False. Defaults to 'auto'.
"""
def
__init__
(
self
,
num_point
,
radii
,
sample_nums
,
mlp_channels
,
fps_mod
=
[
'D-FPS'
],
fps_sample_range_list
=
[
-
1
],
dilated_group
=
False
,
norm_cfg
=
dict
(
type
=
'BN2d'
),
use_xyz
=
True
,
pool_mod
=
'max'
,
normalize_xyz
=
False
,
bias
=
'auto'
)
:
num_point
:
int
,
radii
:
List
[
float
]
,
sample_nums
:
List
[
int
]
,
mlp_channels
:
List
[
List
[
int
]]
,
fps_mod
:
List
[
str
]
=
[
'D-FPS'
],
fps_sample_range_list
:
List
[
int
]
=
[
-
1
],
dilated_group
:
bool
=
False
,
norm_cfg
:
ConfigType
=
dict
(
type
=
'BN2d'
),
use_xyz
:
bool
=
True
,
pool_mod
:
str
=
'max'
,
normalize_xyz
:
bool
=
False
,
bias
:
Union
[
bool
,
str
]
=
'auto'
)
->
None
:
super
(
PointSAModuleMSG
,
self
).
__init__
(
num_point
=
num_point
,
radii
=
radii
,
...
...
@@ -306,39 +312,35 @@ class PointSAModule(PointSAModuleMSG):
PointNets.
Args:
mlp_channels (
l
ist[int]): Specify of the pointnet before
mlp_channels (
L
ist[int]): Specify of the pointnet before
the global pooling for each scale.
num_point (int, optional): Number of points.
Default: None.
radius (float, optional): Radius to group with.
Default: None.
num_point (int, optional): Number of points. Defaults to None.
radius (float, optional): Radius to group with. Defaults to None.
num_sample (int, optional): Number of samples in each ball query.
Default: None.
norm_cfg (dict, optional): Type of normalization method.
Default: dict(type='BN2d').
use_xyz (bool, optional): Whether to use xyz.
Default: True.
pool_mod (str, optional): Type of pooling method.
Default: 'max_pool'.
fps_mod (list[str], optional): Type of FPS method, valid mod
['F-FPS', 'D-FPS', 'FS'], Default: ['D-FPS'].
fps_sample_range_list (list[int], optional): Range of points
to apply FPS. Default: [-1].
normalize_xyz (bool, optional): Whether to normalize local XYZ
with radius. Default: False.
Defaults to None.
norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
layer. Defaults to dict(type='BN2d').
use_xyz (bool): Whether to use xyz. Defaults to True.
pool_mod (str): Type of pooling method. Defaults to 'max'.
fps_mod (List[str]): Type of FPS method, valid mod
['F-FPS', 'D-FPS', 'FS']. Defaults to ['D-FPS'].
fps_sample_range_list (List[int]): Range of points to apply FPS.
Defaults to [-1].
normalize_xyz (bool): Whether to normalize local XYZ with radius.
Defaults to False.
"""
def
__init__
(
self
,
mlp_channels
,
num_point
=
None
,
radius
=
None
,
num_sample
=
None
,
norm_cfg
=
dict
(
type
=
'BN2d'
),
use_xyz
=
True
,
pool_mod
=
'max'
,
fps_mod
=
[
'D-FPS'
],
fps_sample_range_list
=
[
-
1
],
normalize_xyz
=
False
)
:
mlp_channels
:
List
[
int
]
,
num_point
:
Optional
[
int
]
=
None
,
radius
:
Optional
[
float
]
=
None
,
num_sample
:
Optional
[
int
]
=
None
,
norm_cfg
:
ConfigType
=
dict
(
type
=
'BN2d'
),
use_xyz
:
bool
=
True
,
pool_mod
:
str
=
'max'
,
fps_mod
:
List
[
str
]
=
[
'D-FPS'
],
fps_sample_range_list
:
List
[
int
]
=
[
-
1
],
normalize_xyz
:
bool
=
False
)
->
None
:
super
(
PointSAModule
,
self
).
__init__
(
mlp_channels
=
[
mlp_channels
],
num_point
=
num_point
,
...
...
mmdet3d/models/layers/pointnet_modules/stack_point_sa_module.py
0 → 100644
View file @
d7067e44
# Copyright (c) OpenMMLab. All rights reserved.
from
typing
import
List
,
Optional
,
Tuple
import
torch
import
torch.nn
as
nn
from
mmcv.cnn
import
ConvModule
from
mmcv.ops
import
ball_query
,
grouping_operation
from
mmengine.model
import
BaseModule
from
torch
import
Tensor
from
mmdet3d.registry
import
MODELS
class StackQueryAndGroup(BaseModule):
    """Find nearby points in spherical space.

    Args:
        radius (float): Radius of the ball query.
        sample_nums (int): Number of samples in each ball query.
        use_xyz (bool): Whether to concatenate the (local) xyz coordinates
            to the grouped features. Defaults to True.
        init_cfg (dict, optional): Initialization config of the module.
            Defaults to None.
    """

    def __init__(self,
                 radius: float,
                 sample_nums: int,
                 use_xyz: bool = True,
                 init_cfg: dict = None):
        super().__init__(init_cfg=init_cfg)
        self.radius, self.sample_nums, self.use_xyz = \
            radius, sample_nums, use_xyz

    def forward(self,
                xyz: torch.Tensor,
                xyz_batch_cnt: torch.Tensor,
                new_xyz: torch.Tensor,
                new_xyz_batch_cnt: torch.Tensor,
                features: torch.Tensor = None) -> Tuple[Tensor, Tensor]:
        """Forward.

        Args:
            xyz (Tensor): Tensor of the xyz coordinates
                of the features shape with (N1 + N2 ..., 3).
            xyz_batch_cnt (Tensor): Stacked input xyz coordinates nums in
                each batch, just like (N1, N2, ...).
            new_xyz (Tensor): New coords of the outputs shape with
                (M1 + M2 ..., 3).
            new_xyz_batch_cnt (Tensor): Stacked new xyz coordinates nums
                in each batch, just like (M1, M2, ...).
            features (Tensor, optional): Features of each point with shape
                (N1 + N2 ..., C). C is features channel number.
                Defaults to None.

        Returns:
            Tuple[Tensor, Tensor]: Grouped features of shape
            (M1 + M2 ..., C [+ 3], nsample) and the ball-query indices of
            shape (M1 + M2 ..., nsample).
        """
        # Bug fix: the original messages were plain strings, so
        # 'str(new_xyz_batch_cnt)' appeared literally in the error text,
        # and the first assert referenced the wrong batch-count tensor.
        assert xyz.shape[0] == xyz_batch_cnt.sum(), \
            f'xyz: {xyz.shape}, xyz_batch_cnt: {xyz_batch_cnt}'
        assert new_xyz.shape[0] == new_xyz_batch_cnt.sum(), \
            f'new_xyz: {new_xyz.shape}, new_xyz_batch_cnt: ' \
            f'{new_xyz_batch_cnt}'

        # idx: (M1 + M2 ..., nsample)
        idx = ball_query(0, self.radius, self.sample_nums, xyz, new_xyz,
                         xyz_batch_cnt, new_xyz_batch_cnt)
        # Balls that found no neighbor are marked with -1 in column 0;
        # remap them to index 0 and zero out their groups below.
        empty_ball_mask = (idx[:, 0] == -1)
        idx[empty_ball_mask] = 0

        grouped_xyz = grouping_operation(
            xyz, idx, xyz_batch_cnt,
            new_xyz_batch_cnt)  # (M1 + M2, 3, nsample)
        # Translate grouped coordinates into the local frame of each center.
        grouped_xyz -= new_xyz.unsqueeze(-1)
        grouped_xyz[empty_ball_mask] = 0

        if features is not None:
            grouped_features = grouping_operation(
                features, idx, xyz_batch_cnt,
                new_xyz_batch_cnt)  # (M1 + M2, C, nsample)
            grouped_features[empty_ball_mask] = 0
            if self.use_xyz:
                new_features = torch.cat(
                    [grouped_xyz, grouped_features],
                    dim=1)  # (M1 + M2 ..., C + 3, nsample)
            else:
                new_features = grouped_features
        else:
            assert self.use_xyz, 'Cannot have not features and not' \
                                 ' use xyz as a feature!'
            new_features = grouped_xyz

        return new_features, idx
@MODELS.register_module()
class StackedSAModuleMSG(BaseModule):
    """Stack point set abstraction module.

    Args:
        in_channels (int): Input channels.
        radius (list[float]): List of radius in each ball query.
        sample_nums (list[int]): Number of samples in each ball query.
        mlp_channels (list[list[int]]): Specify mlp channels of the
            pointnet before the global pooling for each scale to encode
            point features.
        use_xyz (bool): Whether to use xyz. Defaults to True.
        pool_mod (str): Type of pooling method, 'max' or 'avg'.
            Defaults to 'max'.
        norm_cfg (dict): Type of normalization method. Defaults to
            dict(type='BN2d', eps=1e-5, momentum=0.01).
        init_cfg (dict, optional): Initialization config of the module.
            Defaults to None.
    """

    def __init__(self,
                 in_channels: int,
                 radius: List[float],
                 sample_nums: List[int],
                 mlp_channels: List[List[int]],
                 use_xyz: bool = True,
                 pool_mod: str = 'max',
                 norm_cfg: dict = dict(type='BN2d', eps=1e-5, momentum=0.01),
                 init_cfg: dict = None,
                 **kwargs) -> None:
        super(StackedSAModuleMSG, self).__init__(init_cfg=init_cfg)
        assert len(radius) == len(sample_nums) == len(mlp_channels)

        self.groupers = nn.ModuleList()
        self.mlps = nn.ModuleList()
        for i in range(len(radius)):
            cin = in_channels
            if use_xyz:
                cin += 3
            cur_radius = radius[i]
            nsample = sample_nums[i]
            mlp_spec = mlp_channels[i]

            self.groupers.append(
                StackQueryAndGroup(cur_radius, nsample, use_xyz=use_xyz))

            mlp = nn.Sequential()
            # Fix: the original reused loop variable `i` here, shadowing the
            # outer scale index; use a distinct name for the layer index.
            for j, cout in enumerate(mlp_spec):
                mlp.add_module(
                    f'layer{j}',
                    ConvModule(
                        cin,
                        cout,
                        kernel_size=(1, 1),
                        stride=(1, 1),
                        conv_cfg=dict(type='Conv2d'),
                        norm_cfg=norm_cfg,
                        bias=False))
                cin = cout
            self.mlps.append(mlp)
        self.pool_mod = pool_mod

    def forward(self,
                xyz: Tensor,
                xyz_batch_cnt: Tensor,
                new_xyz: Tensor,
                new_xyz_batch_cnt: Tensor,
                features: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]:
        """Forward.

        Args:
            xyz (Tensor): Tensor of the xyz coordinates
                of the features shape with (N1 + N2 ..., 3).
            xyz_batch_cnt (Tensor): Stacked input xyz coordinates nums in
                each batch, just like (N1, N2, ...).
            new_xyz (Tensor): New coords of the outputs shape with
                (M1 + M2 ..., 3).
            new_xyz_batch_cnt (Tensor): Stacked new xyz coordinates nums
                in each batch, just like (M1, M2, ...).
            features (Tensor, optional): Features of each point with shape
                (N1 + N2 ..., C). C is features channel number.
                Defaults to None.

        Returns:
            Return new points coordinates and features:

                - new_xyz (Tensor): Target points coordinates with shape
                  (M1 + M2 ..., 3), returned unchanged.
                - new_features (Tensor): Target points features with shape
                  (M1 + M2 ..., sum_k(mlps[k][-1])).
        """
        new_features_list = []
        for k in range(len(self.groupers)):
            grouped_features, ball_idxs = self.groupers[k](
                xyz, xyz_batch_cnt, new_xyz, new_xyz_batch_cnt,
                features)  # (M1 + M2, Cin, nsample)
            # Add a fake batch dim so the Conv2d-based mlp can consume it.
            grouped_features = grouped_features.permute(1, 0, 2).unsqueeze(
                dim=0)  # (1, Cin, M1 + M2 ..., nsample)
            new_features = self.mlps[k](
                grouped_features)  # (1, Cout, M1 + M2 ..., nsample)
            if self.pool_mod == 'max':
                new_features = new_features.max(-1).values
            elif self.pool_mod == 'avg':
                new_features = new_features.mean(-1)
            else:
                raise NotImplementedError
            new_features = new_features.squeeze(dim=0).permute(1, 0)
            new_features_list.append(new_features)
        new_features = torch.cat(new_features_list, dim=1)

        return new_xyz, new_features
mmdet3d/models/layers/sparse_block.py
View file @
d7067e44
# Copyright (c) OpenMMLab. All rights reserved.
from
typing
import
Tuple
,
Union
from
mmcv.cnn
import
build_conv_layer
,
build_norm_layer
from
mmdet.models.backbones.resnet
import
BasicBlock
,
Bottleneck
from
torch
import
nn
from
mmdet
.models.backbones.resnet
import
BasicBlock
,
Bottleneck
from
mmdet
3d.utils
import
OptConfigType
from
.spconv
import
IS_SPCONV2_AVAILABLE
if
IS_SPCONV2_AVAILABLE
:
from
spconv.pytorch
import
SparseModule
,
SparseSequential
from
spconv.pytorch
import
SparseConvTensor
,
SparseModule
,
SparseSequential
else
:
from
mmcv.ops
import
SparseModule
,
SparseSequential
from
mmcv.ops
import
SparseConvTensor
,
SparseModule
,
SparseSequential
def
replace_feature
(
out
,
new_features
):
def
replace_feature
(
out
:
SparseConvTensor
,
new_features
:
SparseConvTensor
)
->
SparseConvTensor
:
if
'replace_feature'
in
out
.
__dir__
():
# spconv 2.x behaviour
return
out
.
replace_feature
(
new_features
)
...
...
@@ -26,25 +30,26 @@ class SparseBottleneck(Bottleneck, SparseModule):
Bottleneck block implemented with submanifold sparse convolution.
Args:
inplanes (int): inplanes of block.
planes (int): planes of block.
stride (int, optional): stride of the first block. Default: 1.
downsample (Module, optional): down sample module for block.
conv_cfg (dict, optional): dictionary to construct and config conv
layer. Default: None.
norm_cfg (dict, optional): dictionary to construct and config norm
layer. Default: dict(type='BN').
inplanes (int): Inplanes of block.
planes (int): Planes of block.
stride (int or Tuple[int]): Stride of the first block. Defaults to 1.
downsample (Module, optional): Down sample module for block.
Defaults to None.
conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
convolution layer. Defaults to None.
norm_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
normalization layer. Defaults to None.
"""
expansion
=
4
def
__init__
(
self
,
inplanes
,
planes
,
stride
=
1
,
downsample
=
None
,
conv_cfg
=
None
,
norm_cfg
=
None
)
:
inplanes
:
int
,
planes
:
int
,
stride
:
Union
[
int
,
Tuple
[
int
]]
=
1
,
downsample
:
nn
.
Module
=
None
,
conv_cfg
:
OptConfigType
=
None
,
norm_cfg
:
OptConfigType
=
None
)
->
None
:
SparseModule
.
__init__
(
self
)
Bottleneck
.
__init__
(
...
...
@@ -56,7 +61,7 @@ class SparseBottleneck(Bottleneck, SparseModule):
conv_cfg
=
conv_cfg
,
norm_cfg
=
norm_cfg
)
def
forward
(
self
,
x
)
:
def
forward
(
self
,
x
:
SparseConvTensor
)
->
SparseConvTensor
:
identity
=
x
.
features
out
=
self
.
conv1
(
x
)
...
...
@@ -85,25 +90,26 @@ class SparseBasicBlock(BasicBlock, SparseModule):
Sparse basic block implemented with submanifold sparse convolution.
Args:
inplanes (int): inplanes of block.
planes (int): planes of block.
stride (int, optional): stride of the first block. Default: 1.
downsample (Module, optional): down sample module for block.
conv_cfg (dict, optional): dictionary to construct and config conv
layer. Default: None.
norm_cfg (dict, optional): dictionary to construct and config norm
layer. Default: dict(type='BN').
inplanes (int): Inplanes of block.
planes (int): Planes of block.
stride (int or Tuple[int]): Stride of the first block. Defaults to 1.
downsample (Module, optional): Down sample module for block.
Defaults to None.
conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
convolution layer. Defaults to None.
norm_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
normalization layer. Defaults to None.
"""
expansion
=
1
def
__init__
(
self
,
inplanes
,
planes
,
stride
=
1
,
downsample
=
None
,
conv_cfg
=
None
,
norm_cfg
=
None
)
:
inplanes
:
int
,
planes
:
int
,
stride
:
Union
[
int
,
Tuple
[
int
]]
=
1
,
downsample
:
nn
.
Module
=
None
,
conv_cfg
:
OptConfigType
=
None
,
norm_cfg
:
OptConfigType
=
None
)
->
None
:
SparseModule
.
__init__
(
self
)
BasicBlock
.
__init__
(
self
,
...
...
@@ -114,7 +120,7 @@ class SparseBasicBlock(BasicBlock, SparseModule):
conv_cfg
=
conv_cfg
,
norm_cfg
=
norm_cfg
)
def
forward
(
self
,
x
)
:
def
forward
(
self
,
x
:
SparseConvTensor
)
->
SparseConvTensor
:
identity
=
x
.
features
assert
x
.
features
.
dim
()
==
2
,
f
'x.features.dim()=
{
x
.
features
.
dim
()
}
'
...
...
@@ -134,29 +140,33 @@ class SparseBasicBlock(BasicBlock, SparseModule):
return
out
def
make_sparse_convmodule
(
in_channels
,
out_channels
,
kernel_size
,
indice_key
,
stride
=
1
,
padding
=
0
,
conv_type
=
'SubMConv3d'
,
norm_cfg
=
None
,
order
=
(
'conv'
,
'norm'
,
'act'
)):
def
make_sparse_convmodule
(
in_channels
:
int
,
out_channels
:
int
,
kernel_size
:
Union
[
int
,
Tuple
[
int
]],
indice_key
:
str
,
stride
:
Union
[
int
,
Tuple
[
int
]]
=
1
,
padding
:
Union
[
int
,
Tuple
[
int
]]
=
0
,
conv_type
:
str
=
'SubMConv3d'
,
norm_cfg
:
OptConfigType
=
None
,
order
:
Tuple
[
str
]
=
(
'conv'
,
'norm'
,
'act'
)
)
->
SparseSequential
:
"""Make sparse convolution module.
Args:
in_channels (int): the number of input channels
out_channels (int): the number of out channels
kernel_size (int|tuple(int)): kernel size of convolution
indice_key (str): the indice key used for sparse tensor
stride (int|tuple(int)): the stride of convolution
padding (int or list[int]): the padding number of input
conv_type (str): sparse conv type in spconv
norm_cfg (dict[str]): config of normalization layer
order (tuple[str]): The order of conv/norm/activation layers. It is a
in_channels (int): The number of input channels.
out_channels (int): The number of out channels.
kernel_size (int | Tuple[int]): Kernel size of convolution.
indice_key (str): The indice key used for sparse tensor.
stride (int or tuple[int]): The stride of convolution.
padding (int or tuple[int]): The padding number of input.
conv_type (str): Sparse conv type in spconv. Defaults to 'SubMConv3d'.
norm_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
normalization layer. Defaults to None.
order (Tuple[str]): The order of conv/norm/activation layers. It is a
sequence of "conv", "norm" and "act". Common examples are
("conv", "norm", "act") and ("act", "conv", "norm").
Defaults to ('conv', 'norm', 'act').
Returns:
spconv.SparseSequential: sparse convolution module.
...
...
mmdet3d/models/layers/spconv/overwrite_spconv/write_spconv2.py
View file @
d7067e44
# Copyright (c) OpenMMLab. All rights reserved.
import
itertools
from
typing
import
List
,
OrderedDict
from
mmengine.registry
import
MODELS
from
torch.nn.parameter
import
Parameter
def
register_spconv2
():
def
register_spconv2
()
->
bool
:
"""This func registers spconv2.0 spconv ops to overwrite the default mmcv
spconv ops."""
try
:
...
...
@@ -39,8 +40,10 @@ def register_spconv2():
return
True
def
_load_from_state_dict
(
self
,
state_dict
,
prefix
,
local_metadata
,
strict
,
missing_keys
,
unexpected_keys
,
error_msgs
):
def
_load_from_state_dict
(
self
,
state_dict
:
OrderedDict
,
prefix
:
str
,
local_metadata
:
dict
,
strict
:
bool
,
missing_keys
:
List
[
str
],
unexpected_keys
:
List
[
str
],
error_msgs
:
List
[
str
])
->
None
:
"""Rewrite this func to compat the convolutional kernel weights between
spconv 1.x in MMCV and 2.x in spconv2.x.
...
...
mmdet3d/models/layers/transformer.py
View file @
d7067e44
# Copyright (c) OpenMMLab. All rights reserved.
from
typing
import
Optional
from
mmcv.cnn.bricks.transformer
import
MultiheadAttention
from
mmengine.registry
import
MODELS
from
torch
import
Tensor
from
torch
import
nn
as
nn
from
mmdet3d.utils
import
ConfigType
,
OptMultiConfig
@
MODELS
.
register_module
()
class
GroupFree3DMHA
(
MultiheadAttention
):
...
...
@@ -15,40 +20,42 @@ class GroupFree3DMHA(MultiheadAttention):
embed_dims (int): The embedding dimension.
num_heads (int): Parallel attention heads. Same as
`nn.MultiheadAttention`.
attn_drop (float
, optional
): A Dropout layer on attn_output_weights.
attn_drop (float): A Dropout layer on attn_output_weights.
Defaults to 0.0.
proj_drop (float
, optional
): A Dropout layer. Defaults to 0.0.
dropout_layer (
obj:`
Config
Dict`, optional
): The dropout_layer used
when adding
the shortcut.
init_cfg (obj:`
mmengine.
ConfigDict`
,
o
ptional): The
Con
f
ig
for
i
nitialization. Default
:
None.
batch_first (bool
, optional
): Key, Query and Value are shape of
(batch, n, embed_dim)
or (n, batch, embed_dim).
Defaults to False.
proj_drop (float): A Dropout layer. Defaults to 0.0.
dropout_layer (Config
Type
): The dropout_layer used
when adding
the shortcut.
Defaults to dict(type='DropOut', drop_prob=0.).
init_cfg (
:
obj:`ConfigDict` o
r dict or List[:obj:`
Config
Dict` or dict],
optional): I
nitialization
config dict
. Default
s to
None.
batch_first (bool): Key, Query and Value are shape of
(batch, n, embed_dim)
or (n, batch, embed_dim).
Defaults to False.
"""
def __init__(self,
             embed_dims: int,
             num_heads: int,
             attn_drop: float = 0.,
             proj_drop: float = 0.,
             dropout_layer: ConfigType = dict(
                 type='DropOut', drop_prob=0.),
             init_cfg: OptMultiConfig = None,
             batch_first: bool = False,
             **kwargs) -> None:
    """Initialize the attention module.

    All arguments are forwarded unchanged to
    ``MultiheadAttention.__init__``; see the class docstring for their
    meaning.
    """
    # Zero-argument super() is equivalent to
    # super(GroupFree3DMHA, self) in Python 3.
    super().__init__(embed_dims, num_heads, attn_drop, proj_drop,
                     dropout_layer, init_cfg, batch_first, **kwargs)
def
forward
(
self
,
query
,
key
,
value
,
identity
,
query_pos
=
None
,
key_pos
=
None
,
attn_mask
=
None
,
key_padding_mask
=
None
,
**
kwargs
):
query
:
Tensor
,
key
:
Tensor
,
value
:
Tensor
,
identity
:
Tensor
,
query_pos
:
Optional
[
Tensor
]
=
None
,
key_pos
:
Optional
[
Tensor
]
=
None
,
attn_mask
:
Optional
[
Tensor
]
=
None
,
key_padding_mask
:
Optional
[
Tensor
]
=
None
,
**
kwargs
)
->
Tensor
:
"""Forward function for `GroupFree3DMHA`.
**kwargs allow passing a more general data flow when combining
...
...
@@ -81,7 +88,7 @@ class GroupFree3DMHA(MultiheadAttention):
Defaults to None.
Returns:
Tensor:
f
orwarded results with shape [num_queries, bs, embed_dims].
Tensor:
F
orwarded results with shape [num_queries, bs, embed_dims].
"""
if
hasattr
(
self
,
'operation_name'
):
...
...
@@ -113,26 +120,26 @@ class ConvBNPositionalEncoding(nn.Module):
"""Absolute position embedding with Conv learning.
Args:
input_channel (int):
i
nput features dim.
num_pos_feats (int
, optional
):
o
utput position features dim.
input_channel (int):
I
nput features dim.
num_pos_feats (int):
O
utput position features dim.
Defaults to 288 to be consistent with seed features dim.
"""
def __init__(self, input_channel: int, num_pos_feats: int = 288) -> None:
    """Build the Conv1d-BN-ReLU-Conv1d position embedding head.

    Args:
        input_channel (int): Input features dim.
        num_pos_feats (int): Output position features dim.
            Defaults to 288 to be consistent with seed features dim.
    """
    super().__init__()
    # Two pointwise (kernel_size=1) convolutions with BN + ReLU in between.
    embedding_layers = [
        nn.Conv1d(input_channel, num_pos_feats, kernel_size=1),
        nn.BatchNorm1d(num_pos_feats),
        nn.ReLU(inplace=True),
        nn.Conv1d(num_pos_feats, num_pos_feats, kernel_size=1),
    ]
    self.position_embedding_head = nn.Sequential(*embedding_layers)
def
forward
(
self
,
xyz
)
:
def
forward
(
self
,
xyz
:
Tensor
)
->
Tensor
:
"""Forward pass.
Args:
xyz (Tensor)
:
(B, N, 3)
t
he coordinates to embed.
xyz (Tensor)
:
(B, N, 3)
T
he coordinates to embed.
Returns:
Tensor: (B, num_pos_feats, N)
t
he embedded position features.
Tensor: (B, num_pos_feats, N)
T
he embedded position features.
"""
xyz
=
xyz
.
permute
(
0
,
2
,
1
)
position_embedding
=
self
.
position_embedding_head
(
xyz
)
...
...
mmdet3d/models/layers/vote_module.py
View file @
d7067e44
# Copyright (c) OpenMMLab. All rights reserved.
from
typing
import
List
,
Tuple
import
torch
from
mmcv.cnn
import
ConvModule
from
mmengine
import
is_tuple_of
from
torch
import
Tensor
from
torch
import
nn
as
nn
from
mmdet3d.models.builder
import
build_loss
from
mmdet3d.registry
import
MODELS
from
mmdet3d.utils
import
ConfigType
,
OptConfigType
class
VoteModule
(
nn
.
Module
):
...
...
@@ -14,41 +18,41 @@ class VoteModule(nn.Module):
Args:
in_channels (int): Number of channels of seed point features.
vote_per_seed (int
, optional
): Number of votes generated from
each seed point.
Default
:
1.
gt_per_seed (int
, optional
): Number of ground truth votes generated
from each
seed point. Default
:
3.
num_points (int
, optional
): Number of points to be used for voting.
Default
:
1.
conv_channels (tuple[int]
, optional
): Out channels of vote
generating
convolution. Default
:
(16, 16).
conv_cfg (
d
ict
,
o
ptional
): Config
of
convolution
.
Default
:
dict(type='Conv1d').
norm_cfg (
d
ict
,
o
ptional
): Config
of
normalization
.
Default
:
dict(type='BN1d').
norm_feats (bool
, optional
): Whether to normalize features.
Default: True
.
with_res_feat (bool, optional): Whether to predict residual features
.
Default: True.
vote_xyz_range (list[float], optional):
The range of points translation. Default
:
N
on
e
.
vote_loss (dict, optional): Config of vote loss.
Default
:
None.
vote_per_seed (int): Number of votes generated from
each seed point.
Default
s to
1.
gt_per_seed (int): Number of ground truth votes generated
from each
seed point. Default
s to
3.
num_points (int): Number of points to be used for voting.
Default
s to
1.
conv_channels (tuple[int]): Out channels of vote
generating
convolution. Default
s to
(16, 16).
conv_cfg (
:obj:`ConfigD
ict
`
o
r dict
): Config
dict for
convolution
layer.
Default
s to
dict(type='Conv1d').
norm_cfg (
:obj:`ConfigD
ict
`
o
r dict
): Config
dict for
normalization
layer.
Default
s to
dict(type='BN1d').
norm_feats (bool): Whether to normalize features.
Defaults to True.
with_res_feat (bool): Whether to predict residual features
.
Defaults to True
.
vote_xyz_range (List[float], optional): The range of points
translation. Defaults to None.
vote_loss (:obj:`ConfigDict` or dict, optional)
:
C
on
fig of vote loss
.
Default
s to
None.
"""
def
__init__
(
self
,
in_channels
,
vote_per_seed
=
1
,
gt_per_seed
=
3
,
num_points
=
-
1
,
conv_channels
=
(
16
,
16
),
conv_cfg
=
dict
(
type
=
'Conv1d'
),
norm_cfg
=
dict
(
type
=
'BN1d'
),
act_cfg
=
dict
(
type
=
'ReLU'
),
norm_feats
=
True
,
with_res_feat
=
True
,
vote_xyz_range
=
None
,
vote_loss
=
None
)
:
super
().
__init__
()
in_channels
:
int
,
vote_per_seed
:
int
=
1
,
gt_per_seed
:
int
=
3
,
num_points
:
int
=
-
1
,
conv_channels
:
Tuple
[
int
]
=
(
16
,
16
),
conv_cfg
:
ConfigType
=
dict
(
type
=
'Conv1d'
),
norm_cfg
:
ConfigType
=
dict
(
type
=
'BN1d'
),
act_cfg
:
ConfigType
=
dict
(
type
=
'ReLU'
),
norm_feats
:
bool
=
True
,
with_res_feat
:
bool
=
True
,
vote_xyz_range
:
List
[
float
]
=
None
,
vote_loss
:
OptConfigType
=
None
)
->
None
:
super
(
VoteModule
,
self
).
__init__
()
self
.
in_channels
=
in_channels
self
.
vote_per_seed
=
vote_per_seed
self
.
gt_per_seed
=
gt_per_seed
...
...
@@ -60,7 +64,7 @@ class VoteModule(nn.Module):
self
.
vote_xyz_range
=
vote_xyz_range
if
vote_loss
is
not
None
:
self
.
vote_loss
=
build
_loss
(
vote_loss
)
self
.
vote_loss
=
MODELS
.
build
(
vote_loss
)
prev_channels
=
in_channels
vote_conv_list
=
list
()
...
...
@@ -86,23 +90,24 @@ class VoteModule(nn.Module):
out_channel
=
3
*
self
.
vote_per_seed
self
.
conv_out
=
nn
.
Conv1d
(
prev_channels
,
out_channel
,
1
)
def
forward
(
self
,
seed_points
,
seed_feats
):
"""forward.
def
forward
(
self
,
seed_points
:
Tensor
,
seed_feats
:
Tensor
)
->
Tuple
[
Tensor
]:
"""Forward.
Args:
seed_points (
torch.
Tensor): Coordinate of the seed
points in shape
(B, N, 3).
seed_feats (
torch.
Tensor): Features of the seed points in shape
seed_points (Tensor): Coordinate of the seed
points in shape
(B, N, 3).
seed_feats (Tensor): Features of the seed points in shape
(B, C, N).
Returns:
t
uple[torch.Tensor]:
T
uple[torch.Tensor]:
- vote_points: Voted xyz based on the seed points
with shape (B, M, 3), ``M=num_seed*vote_per_seed``.
with shape (B, M, 3), ``M=num_seed*vote_per_seed``.
- vote_features: Voted features based on the seed points with
shape (B, C, M) where ``M=num_seed*vote_per_seed``,
``C=vote_feature_dim``.
shape (B, C, M) where ``M=num_seed*vote_per_seed``,
``C=vote_feature_dim``.
"""
if
self
.
num_points
!=
-
1
:
assert
self
.
num_points
<
seed_points
.
shape
[
1
],
\
...
...
@@ -150,19 +155,20 @@ class VoteModule(nn.Module):
vote_feats
=
seed_feats
return
vote_points
,
vote_feats
,
offset
def
get_loss
(
self
,
seed_points
,
vote_points
,
seed_indices
,
vote_targets_mask
,
vote_targets
):
def
get_loss
(
self
,
seed_points
:
Tensor
,
vote_points
:
Tensor
,
seed_indices
:
Tensor
,
vote_targets_mask
:
Tensor
,
vote_targets
:
Tensor
)
->
Tensor
:
"""Calculate loss of voting module.
Args:
seed_points (
torch.
Tensor): Coordinate of the seed points.
vote_points (
torch.
Tensor): Coordinate of the vote points.
seed_indices (
torch.
Tensor): Indices of seed points in raw points.
vote_targets_mask (
torch.
Tensor): Mask of valid vote targets.
vote_targets (
torch.
Tensor): Targets of votes.
seed_points (Tensor): Coordinate of the seed points.
vote_points (Tensor): Coordinate of the vote points.
seed_indices (Tensor): Indices of seed points in raw points.
vote_targets_mask (Tensor): Mask of valid vote targets.
vote_targets (Tensor): Targets of votes.
Returns:
torch.
Tensor: Weighted vote loss.
Tensor: Weighted vote loss.
"""
batch_size
,
num_seed
=
seed_points
.
shape
[:
2
]
...
...
Prev
1
…
9
10
11
12
13
14
15
16
17
18
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment