You need to sign in or sign up before continuing.
Commit 6a9fd47c authored by Xiangxu-0103's avatar Xiangxu-0103 Committed by ZwwWayne
Browse files

[Enhance] Add typehint for models/layers (#2014)

* add typehints for models/layers

* Update builder.py
parent 1a47acdd
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Optional, Tuple
import numba
import numpy as np
import torch
from mmcv.ops import nms, nms_rotated
def box3d_multiclass_nms(mlvl_bboxes,
mlvl_bboxes_for_nms,
mlvl_scores,
score_thr,
max_num,
cfg,
mlvl_dir_scores=None,
mlvl_attr_scores=None,
mlvl_bboxes2d=None):
from torch import Tensor
def box3d_multiclass_nms(
mlvl_bboxes: Tensor,
mlvl_bboxes_for_nms: Tensor,
mlvl_scores: Tensor,
score_thr: float,
max_num: int,
cfg: dict,
mlvl_dir_scores: Optional[Tensor] = None,
mlvl_attr_scores: Optional[Tensor] = None,
mlvl_bboxes2d: Optional[Tensor] = None) -> Tuple[Tensor]:
"""Multi-class NMS for 3D boxes. The IoU used for NMS is defined as the 2D
IoU between BEV boxes.
Args:
mlvl_bboxes (torch.Tensor): Multi-level boxes with shape (N, M).
mlvl_bboxes (Tensor): Multi-level boxes with shape (N, M).
M is the dimensions of boxes.
mlvl_bboxes_for_nms (torch.Tensor): Multi-level boxes with shape
(N, 5) ([x1, y1, x2, y2, ry]). N is the number of boxes.
mlvl_bboxes_for_nms (Tensor): Multi-level boxes with shape (N, 5)
([x1, y1, x2, y2, ry]). N is the number of boxes.
The coordinate system of the BEV boxes is counterclockwise.
mlvl_scores (torch.Tensor): Multi-level boxes with shape
(N, C + 1). N is the number of boxes. C is the number of classes.
score_thr (float): Score threshold to filter boxes with low
confidence.
mlvl_scores (Tensor): Multi-level boxes with shape (N, C + 1).
N is the number of boxes. C is the number of classes.
score_thr (float): Score threshold to filter boxes with low confidence.
max_num (int): Maximum number of boxes will be kept.
cfg (dict): Configuration dict of NMS.
mlvl_dir_scores (torch.Tensor, optional): Multi-level scores
of direction classifier. Defaults to None.
mlvl_attr_scores (torch.Tensor, optional): Multi-level scores
of attribute classifier. Defaults to None.
mlvl_bboxes2d (torch.Tensor, optional): Multi-level 2D bounding
boxes. Defaults to None.
mlvl_dir_scores (Tensor, optional): Multi-level scores of direction
classifier. Defaults to None.
mlvl_attr_scores (Tensor, optional): Multi-level scores of attribute
classifier. Defaults to None.
mlvl_bboxes2d (Tensor, optional): Multi-level 2D bounding boxes.
Defaults to None.
Returns:
tuple[torch.Tensor]: Return results after nms, including 3D
bounding boxes, scores, labels, direction scores, attribute
scores (optional) and 2D bounding boxes (optional).
Tuple[Tensor]: Return results after nms, including 3D bounding boxes,
scores, labels, direction scores, attribute scores (optional) and
2D bounding boxes (optional).
"""
# do multi class nms
# the fg class id range: [0, num_classes-1]
......@@ -128,17 +131,18 @@ def box3d_multiclass_nms(mlvl_bboxes,
return results
def aligned_3d_nms(boxes, scores, classes, thresh):
def aligned_3d_nms(boxes: Tensor, scores: Tensor, classes: Tensor,
thresh: float) -> Tensor:
"""3D NMS for aligned boxes.
Args:
boxes (torch.Tensor): Aligned box with shape [n, 6].
scores (torch.Tensor): Scores of each box.
classes (torch.Tensor): Class of each box.
boxes (Tensor): Aligned box with shape [N, 6].
scores (Tensor): Scores of each box.
classes (Tensor): Class of each box.
thresh (float): IoU threshold for nms.
Returns:
torch.Tensor: Indices of selected boxes.
Tensor: Indices of selected boxes.
"""
x1 = boxes[:, 0]
y1 = boxes[:, 1]
......@@ -179,21 +183,20 @@ def aligned_3d_nms(boxes, scores, classes, thresh):
@numba.jit(nopython=True)
def circle_nms(dets, thresh, post_max_size=83):
def circle_nms(dets: Tensor, thresh: float, post_max_size: int = 83) -> Tensor:
"""Circular NMS.
An object is only counted as positive if no other center
with a higher confidence exists within a radius r using a
bird-eye view distance metric.
An object is only counted as positive if no other center with a higher
confidence exists within a radius r using a bird-eye view distance metric.
Args:
dets (torch.Tensor): Detection results with the shape of [N, 3].
dets (Tensor): Detection results with the shape of [N, 3].
thresh (float): Value of threshold.
post_max_size (int, optional): Max number of prediction to be kept.
post_max_size (int): Max number of prediction to be kept.
Defaults to 83.
Returns:
torch.Tensor: Indexes of the detections to be kept.
Tensor: Indexes of the detections to be kept.
"""
x1 = dets[:, 0]
y1 = dets[:, 1]
......@@ -228,24 +231,28 @@ def circle_nms(dets, thresh, post_max_size=83):
# This function duplicates functionality of mmcv.ops.iou_3d.nms_bev
# from mmcv<=1.5, but using cuda ops from mmcv.ops.nms.nms_rotated.
# Nms api will be unified in mmdetection3d one day.
def nms_bev(boxes, scores, thresh, pre_max_size=None, post_max_size=None):
def nms_bev(boxes: Tensor,
scores: Tensor,
thresh: float,
pre_max_size: Optional[int] = None,
post_max_size: Optional[int] = None) -> Tensor:
"""NMS function GPU implementation (for BEV boxes). The overlap of two
boxes for IoU calculation is defined as the exact overlapping area of the
two boxes. In this function, one can also set ``pre_max_size`` and
``post_max_size``.
Args:
boxes (torch.Tensor): Input boxes with the shape of [N, 5]
boxes (Tensor): Input boxes with the shape of [N, 5]
([x1, y1, x2, y2, ry]).
scores (torch.Tensor): Scores of boxes with the shape of [N].
scores (Tensor): Scores of boxes with the shape of [N].
thresh (float): Overlap threshold of NMS.
pre_max_size (int, optional): Max size of boxes before NMS.
Default: None.
Defaults to None.
post_max_size (int, optional): Max size of boxes after NMS.
Default: None.
Defaults to None.
Returns:
torch.Tensor: Indexes after NMS.
Tensor: Indexes after NMS.
"""
assert boxes.size(1) == 5, 'Input boxes shape should be [N, 5]'
order = scores.sort(0, descending=True)[1]
......@@ -271,18 +278,18 @@ def nms_bev(boxes, scores, thresh, pre_max_size=None, post_max_size=None):
# This function duplicates functionality of mmcv.ops.iou_3d.nms_normal_bev
# from mmcv<=1.5, but using cuda ops from mmcv.ops.nms.nms.
# Nms api will be unified in mmdetection3d one day.
def nms_normal_bev(boxes: Tensor, scores: Tensor, thresh: float) -> Tensor:
    """Normal NMS function GPU implementation (for BEV boxes).

    The overlap of two boxes for IoU calculation is defined as the exact
    overlapping area of the two boxes WITH their yaw angle set to 0.

    Args:
        boxes (Tensor): Input boxes with shape (N, 5)
            ([x1, y1, x2, y2, ry]).
        scores (Tensor): Scores of predicted boxes with shape (N).
        thresh (float): Overlap threshold of NMS.

    Returns:
        Tensor: Remaining indices with scores in descending order.
    """
    assert boxes.shape[1] == 5, 'Input boxes shape should be [N, 5]'
    # Drop the yaw column and run standard axis-aligned NMS; mmcv's nms
    # returns (dets, keep_inds), and only the kept indices are needed here.
    return nms(boxes[:, :-1], scores, thresh)[1]
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List
import torch
from mmcv.cnn import ConvModule
from mmengine.model import BaseModule
from torch import Tensor
from torch import nn as nn
from mmdet3d.utils import ConfigType, OptMultiConfig
class DGCNNFAModule(BaseModule):
"""Point feature aggregation module used in DGCNN.
......@@ -11,21 +16,21 @@ class DGCNNFAModule(BaseModule):
Aggregate all the features of points.
Args:
mlp_channels (list[int]): List of mlp channels.
norm_cfg (dict, optional): Type of normalization method.
Defaults to dict(type='BN1d').
act_cfg (dict, optional): Type of activation method.
mlp_channels (List[int]): List of mlp channels.
norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
layer. Defaults to dict(type='BN1d').
act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
Defaults to dict(type='ReLU').
init_cfg (dict, optional): Initialization config. Defaults to None.
init_cfg (:obj:`ConfigDict` or dict or List[:obj:`ConfigDict` or dict],
optional): Initialization config dict. Defaults to None.
"""
def __init__(self,
mlp_channels,
norm_cfg=dict(type='BN1d'),
act_cfg=dict(type='ReLU'),
init_cfg=None):
super().__init__(init_cfg=init_cfg)
self.fp16_enabled = False
mlp_channels: List[int],
norm_cfg: ConfigType = dict(type='BN1d'),
act_cfg: ConfigType = dict(type='ReLU'),
init_cfg: OptMultiConfig = None) -> None:
super(DGCNNFAModule, self).__init__(init_cfg=init_cfg)
self.mlps = nn.Sequential()
for i in range(len(mlp_channels) - 1):
self.mlps.add_module(
......@@ -39,14 +44,14 @@ class DGCNNFAModule(BaseModule):
norm_cfg=norm_cfg,
act_cfg=act_cfg))
def forward(self, points):
def forward(self, points: List[Tensor]) -> Tensor:
"""forward.
Args:
points (List[Tensor]): tensor of the features to be aggregated.
points (List[Tensor]): Tensor of the features to be aggregated.
Returns:
Tensor: (B, N, M) M = mlp[-1], tensor of the output points.
Tensor: (B, N, M) M = mlp[-1]. Tensor of the output points.
"""
if len(points) > 1:
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List
from mmcv.cnn import ConvModule
from mmengine.model import BaseModule
from torch import Tensor
from torch import nn as nn
from mmdet3d.utils import ConfigType, OptMultiConfig
class DGCNNFPModule(BaseModule):
"""Point feature propagation module used in DGCNN.
......@@ -10,21 +15,21 @@ class DGCNNFPModule(BaseModule):
Propagate the features from one set to another.
Args:
mlp_channels (list[int]): List of mlp channels.
norm_cfg (dict, optional): Type of activation method.
Defaults to dict(type='BN1d').
act_cfg (dict, optional): Type of activation method.
mlp_channels (List[int]): List of mlp channels.
norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
layer. Defaults to dict(type='BN1d').
act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
Defaults to dict(type='ReLU').
init_cfg (dict, optional): Initialization config. Defaults to None.
init_cfg (:obj:`ConfigDict` or dict or List[:obj:`ConfigDict` or dict],
optional): Initialization config dict. Defaults to None.
"""
def __init__(self,
mlp_channels,
norm_cfg=dict(type='BN1d'),
act_cfg=dict(type='ReLU'),
init_cfg=None):
super().__init__(init_cfg=init_cfg)
self.fp16_enabled = False
mlp_channels: List[int],
norm_cfg: ConfigType = dict(type='BN1d'),
act_cfg: ConfigType = dict(type='ReLU'),
init_cfg: OptMultiConfig = None) -> None:
super(DGCNNFPModule, self).__init__(init_cfg=init_cfg)
self.mlps = nn.Sequential()
for i in range(len(mlp_channels) - 1):
self.mlps.add_module(
......@@ -38,14 +43,14 @@ class DGCNNFPModule(BaseModule):
norm_cfg=norm_cfg,
act_cfg=act_cfg))
def forward(self, points):
"""forward.
def forward(self, points: Tensor) -> Tensor:
"""Forward.
Args:
points (Tensor): (B, N, C) tensor of the input points.
points (Tensor): (B, N, C) Tensor of the input points.
Returns:
Tensor: (B, N, M) M = mlp[-1], tensor of the new points.
Tensor: (B, N, M) M = mlp[-1]. Tensor of the new points.
"""
if points is not None:
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Optional, Union
import torch
from mmcv.cnn import ConvModule
from mmcv.ops.group_points import GroupAll, QueryAndGroup, grouping_operation
from torch import Tensor
from torch import nn as nn
from torch.nn import functional as F
from mmdet3d.utils import ConfigType
class BaseDGCNNGFModule(nn.Module):
"""Base module for point graph feature module used in DGCNN.
Args:
radii (list[float]): List of radius in each knn or ball query.
sample_nums (list[int]): Number of samples in each knn or ball query.
mlp_channels (list[list[int]]): Specify of the dgcnn before
the global pooling for each graph feature module.
knn_modes (list[str], optional): Type of KNN method, valid mode
['F-KNN', 'D-KNN'], Defaults to ['F-KNN'].
dilated_group (bool, optional): Whether to use dilated ball query.
radii (List[float]): List of radius in each knn or ball query.
sample_nums (List[int]): Number of samples in each knn or ball query.
mlp_channels (List[List[int]]): Specify of the dgcnn before the global
pooling for each graph feature module.
knn_modes (List[str]): Type of KNN method, valid mode
['F-KNN', 'D-KNN']. Defaults to ['F-KNN'].
dilated_group (bool): Whether to use dilated ball query.
Defaults to False.
use_xyz (bool, optional): Whether to use xyz as point features.
use_xyz (bool): Whether to use xyz as point features.
Defaults to True.
pool_mode (str, optional): Type of pooling method. Defaults to 'max'.
normalize_xyz (bool, optional): If ball query, whether to normalize
local XYZ with radius. Defaults to False.
grouper_return_grouped_xyz (bool, optional): Whether to return grouped
xyz in `QueryAndGroup`. Defaults to False.
grouper_return_grouped_idx (bool, optional): Whether to return grouped
idx in `QueryAndGroup`. Defaults to False.
pool_mode (str): Type of pooling method. Defaults to 'max'.
normalize_xyz (bool): If ball query, whether to normalize local XYZ
with radius. Defaults to False.
grouper_return_grouped_xyz (bool): Whether to return grouped xyz in
`QueryAndGroup`. Defaults to False.
grouper_return_grouped_idx (bool): Whether to return grouped idx in
`QueryAndGroup`. Defaults to False.
"""
def __init__(self,
radii,
sample_nums,
mlp_channels,
knn_modes=['F-KNN'],
dilated_group=False,
use_xyz=True,
pool_mode='max',
normalize_xyz=False,
grouper_return_grouped_xyz=False,
grouper_return_grouped_idx=False):
radii: List[float],
sample_nums: List[int],
mlp_channels: List[List[int]],
knn_modes: List[str] = ['F-KNN'],
dilated_group: bool = False,
use_xyz: bool = True,
pool_mode: str = 'max',
normalize_xyz: bool = False,
grouper_return_grouped_xyz: bool = False,
grouper_return_grouped_idx: bool = False) -> None:
super(BaseDGCNNGFModule, self).__init__()
assert len(sample_nums) == len(
......@@ -82,16 +87,15 @@ class BaseDGCNNGFModule(nn.Module):
grouper = GroupAll(use_xyz)
self.groupers.append(grouper)
def _pool_features(self, features):
def _pool_features(self, features: Tensor) -> Tensor:
"""Perform feature aggregation using pooling operation.
Args:
features (torch.Tensor): (B, C, N, K)
Features of locally grouped points before pooling.
features (Tensor): (B, C, N, K) Features of locally grouped
points before pooling.
Returns:
torch.Tensor: (B, C, N)
Pooled features aggregating local information.
Tensor: (B, C, N) Pooled features aggregating local information.
"""
if self.pool_mode == 'max':
# (B, C, N, 1)
......@@ -106,15 +110,15 @@ class BaseDGCNNGFModule(nn.Module):
return new_features.squeeze(-1).contiguous()
def forward(self, points):
def forward(self, points: Tensor) -> Tensor:
"""forward.
Args:
points (Tensor): (B, N, C) input points.
points (Tensor): (B, N, C) Input points.
Returns:
List[Tensor]: (B, N, C1) new points generated from each graph
feature module.
Tensor: (B, N, C1) New points generated from each graph
feature module.
"""
new_points_list = [points]
......@@ -155,43 +159,40 @@ class DGCNNGFModule(BaseDGCNNGFModule):
"""Point graph feature module used in DGCNN.
Args:
mlp_channels (list[int]): Specify of the dgcnn before
the global pooling for each graph feature module.
mlp_channels (List[int]): Specify of the dgcnn before the global
pooling for each graph feature module.
num_sample (int, optional): Number of samples in each knn or ball
query. Defaults to None.
knn_mode (str, optional): Type of KNN method, valid mode
['F-KNN', 'D-KNN']. Defaults to 'F-KNN'.
radius (float, optional): Radius to group with.
Defaults to None.
dilated_group (bool, optional): Whether to use dilated ball query.
knn_mode (str): Type of KNN method, valid mode ['F-KNN', 'D-KNN'].
Defaults to 'F-KNN'.
radius (float, optional): Radius to group with. Defaults to None.
dilated_group (bool): Whether to use dilated ball query.
Defaults to False.
norm_cfg (dict, optional): Type of normalization method.
Defaults to dict(type='BN2d').
act_cfg (dict, optional): Type of activation method.
norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
layer. Defaults to dict(type='BN2d').
act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
Defaults to dict(type='ReLU').
use_xyz (bool, optional): Whether to use xyz as point features.
Defaults to True.
pool_mode (str, optional): Type of pooling method.
Defaults to 'max'.
normalize_xyz (bool, optional): If ball query, whether to normalize
local XYZ with radius. Defaults to False.
bias (bool | str, optional): If specified as `auto`, it will be decided
by the norm_cfg. Bias will be set as True if `norm_cfg` is None,
use_xyz (bool): Whether to use xyz as point features. Defaults to True.
pool_mode (str): Type of pooling method. Defaults to 'max'.
normalize_xyz (bool): If ball query, whether to normalize local XYZ
with radius. Defaults to False.
bias (bool or str): If specified as `auto`, it will be decided by
`norm_cfg`. `bias` will be set as True if `norm_cfg` is None,
otherwise False. Defaults to 'auto'.
"""
def __init__(self,
mlp_channels,
num_sample=None,
knn_mode='F-KNN',
radius=None,
dilated_group=False,
norm_cfg=dict(type='BN2d'),
act_cfg=dict(type='ReLU'),
use_xyz=True,
pool_mode='max',
normalize_xyz=False,
bias='auto'):
mlp_channels: List[int],
num_sample: Optional[int] = None,
knn_mode: str = 'F-KNN',
radius: Optional[float] = None,
dilated_group: bool = False,
norm_cfg: ConfigType = dict(type='BN2d'),
act_cfg: ConfigType = dict(type='ReLU'),
use_xyz: bool = True,
pool_mode: str = 'max',
normalize_xyz: bool = False,
bias: Union[bool, str] = 'auto') -> None:
super(DGCNNGFModule, self).__init__(
mlp_channels=[mlp_channels],
sample_nums=[num_sample],
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List
from mmcv.cnn import ConvModule
from mmengine.model import BaseModule
from torch import Tensor
from torch import nn as nn
from torch.nn import functional as F
from mmdet3d.utils import ConfigType
class EdgeFusionModule(BaseModule):
"""Edge Fusion Module for feature map.
......@@ -12,21 +17,22 @@ class EdgeFusionModule(BaseModule):
out_channels (int): The number of output channels.
feat_channels (int): The number of channels in feature map
during edge feature fusion.
kernel_size (int, optional): Kernel size of convolution.
Default: 3.
act_cfg (dict, optional): Config of activation.
Default: dict(type='ReLU').
norm_cfg (dict, optional): Config of normalization.
Default: dict(type='BN1d')).
kernel_size (int): Kernel size of convolution. Defaults to 3.
act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
Defaults to dict(type='ReLU').
norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
layer. Defaults to dict(type='BN1d').
"""
def __init__(self,
out_channels,
feat_channels,
kernel_size=3,
act_cfg=dict(type='ReLU'),
norm_cfg=dict(type='BN1d')):
super().__init__()
def __init__(
self,
out_channels: int,
feat_channels: int,
kernel_size: int = 3,
act_cfg: ConfigType = dict(type='ReLU'),
norm_cfg: ConfigType = dict(type='BN1d')
) -> None:
super(EdgeFusionModule, self).__init__()
self.edge_convs = nn.Sequential(
ConvModule(
feat_channels,
......@@ -39,22 +45,22 @@ class EdgeFusionModule(BaseModule):
nn.Conv1d(feat_channels, out_channels, kernel_size=1))
self.feat_channels = feat_channels
def forward(self, features, fused_features, edge_indices, edge_lens,
output_h, output_w):
def forward(self, features: Tensor, fused_features: Tensor,
edge_indices: Tensor, edge_lens: List[int], output_h: int,
output_w: int) -> Tensor:
"""Forward pass.
Args:
features (torch.Tensor): Different representative features
for fusion.
fused_features (torch.Tensor): Different representative
features to be fused.
edge_indices (torch.Tensor): Batch image edge indices.
edge_lens (list[int]): List of edge length of each image.
features (Tensor): Different representative features for fusion.
fused_features (Tensor): Different representative features
to be fused.
edge_indices (Tensor): Batch image edge indices.
edge_lens (List[int]): List of edge length of each image.
output_h (int): Height of output feature map.
output_w (int): Width of output feature map.
Returns:
torch.Tensor: Fused feature maps.
Tensor: Fused feature maps.
"""
batch_size = features.shape[0]
# normalize
......
# Copyright (c) OpenMMLab. All rights reserved.
from functools import partial
from typing import Tuple
import torch
from torch import Tensor
from mmdet3d.structures.points import get_points_type
def apply_3d_transformation(pcd, coord_type, img_meta, reverse=False):
def apply_3d_transformation(pcd: Tensor,
coord_type: str,
img_meta: dict,
reverse: bool = False) -> Tensor:
"""Apply transformation to input point cloud.
Args:
pcd (torch.Tensor): The point cloud to be transformed.
pcd (Tensor): The point cloud to be transformed.
coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'.
img_meta(dict): Meta info regarding data transformation.
reverse (bool): Reversed transformation or not.
reverse (bool): Reversed transformation or not. Defaults to False.
Note:
The elements in img_meta['transformation_3d_flow']:
"T" stands for translation;
"S" stands for scale;
"R" stands for rotation;
"HF" stands for horizontal flip;
"VF" stands for vertical flip.
- "T" stands for translation;
- "S" stands for scale;
- "R" stands for rotation;
- "HF" stands for horizontal flip;
- "VF" stands for vertical flip.
Returns:
torch.Tensor: The transformed point cloud.
Tensor: The transformed point cloud.
"""
dtype = pcd.dtype
......@@ -92,16 +98,18 @@ def apply_3d_transformation(pcd, coord_type, img_meta, reverse=False):
return pcd.coord
def extract_2d_info(img_meta, tensor):
def extract_2d_info(
img_meta: dict,
tensor: Tensor) -> Tuple[int, int, int, int, Tensor, bool, Tensor]:
"""Extract image augmentation information from img_meta.
Args:
img_meta(dict): Meta info regarding data transformation.
tensor(torch.Tensor): Input tensor used to create new ones.
img_meta (dict): Meta info regarding data transformation.
tensor (Tensor): Input tensor used to create new ones.
Returns:
(int, int, int, int, torch.Tensor, bool, torch.Tensor):
The extracted information.
Tuple[int, int, int, int, torch.Tensor, bool, torch.Tensor]:
The extracted information.
"""
img_shape = img_meta['img_shape']
ori_shape = img_meta['ori_shape']
......@@ -120,17 +128,17 @@ def extract_2d_info(img_meta, tensor):
img_crop_offset)
def bbox_2d_transform(img_meta, bbox_2d, ori2new):
def bbox_2d_transform(img_meta: dict, bbox_2d: Tensor,
ori2new: bool) -> Tensor:
"""Transform 2d bbox according to img_meta.
Args:
img_meta(dict): Meta info regarding data transformation.
bbox_2d (torch.Tensor): Shape (..., >4)
The input 2d bboxes to transform.
img_meta (dict): Meta info regarding data transformation.
bbox_2d (Tensor): Shape (..., >4) The input 2d bboxes to transform.
ori2new (bool): Origin img coord system to new or not.
Returns:
torch.Tensor: The transformed 2d bboxes.
Tensor: The transformed 2d bboxes.
"""
img_h, img_w, ori_h, ori_w, img_scale_factor, img_flip, \
......@@ -174,17 +182,17 @@ def bbox_2d_transform(img_meta, bbox_2d, ori2new):
return bbox_2d_new
def coord_2d_transform(img_meta, coord_2d, ori2new):
def coord_2d_transform(img_meta: dict, coord_2d: Tensor,
ori2new: bool) -> Tensor:
"""Transform 2d pixel coordinates according to img_meta.
Args:
img_meta(dict): Meta info regarding data transformation.
coord_2d (torch.Tensor): Shape (..., 2)
The input 2d coords to transform.
img_meta (dict): Meta info regarding data transformation.
coord_2d (Tensor): Shape (..., 2) The input 2d coords to transform.
ori2new (bool): Origin img coord system to new or not.
Returns:
torch.Tensor: The transformed 2d coordinates.
Tensor: The transformed 2d coordinates.
"""
img_h, img_w, ori_h, ori_w, img_scale_factor, img_flip, \
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Tuple, Union
import torch
from mmcv.cnn import ConvModule
from mmengine.model import BaseModule
from torch import Tensor
from torch import nn as nn
from torch.nn import functional as F
from mmdet3d.registry import MODELS
from mmdet3d.structures.bbox_3d import (get_proj_mat_by_coord_type,
points_cam2img, points_img2cam)
from mmdet3d.utils import OptConfigType, OptMultiConfig
from . import apply_3d_transformation
def point_sample(img_meta,
img_features,
points,
proj_mat,
coord_type,
img_scale_factor,
img_crop_offset,
img_flip,
img_pad_shape,
img_shape,
aligned=True,
padding_mode='zeros',
align_corners=True,
valid_flag=False):
def point_sample(img_meta: dict,
img_features: Tensor,
points: Tensor,
proj_mat: Tensor,
coord_type: str,
img_scale_factor: Tensor,
img_crop_offset: Tensor,
img_flip: bool,
img_pad_shape: Tuple[int],
img_shape: Tuple[int],
aligned: bool = True,
padding_mode: str = 'zeros',
align_corners: bool = True,
valid_flag: bool = False) -> Tensor:
"""Obtain image features using points.
Args:
img_meta (dict): Meta info.
img_features (torch.Tensor): 1 x C x H x W image features.
points (torch.Tensor): Nx3 point cloud in LiDAR coordinates.
proj_mat (torch.Tensor): 4x4 transformation matrix.
img_features (Tensor): 1 x C x H x W image features.
points (Tensor): Nx3 point cloud in LiDAR coordinates.
proj_mat (Tensor): 4x4 transformation matrix.
coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'.
img_scale_factor (torch.Tensor): Scale factor with shape of
img_scale_factor (Tensor): Scale factor with shape of
(w_scale, h_scale).
img_crop_offset (torch.Tensor): Crop offset used to crop
image during data augmentation with shape of (w_offset, h_offset).
img_crop_offset (Tensor): Crop offset used to crop image during
data augmentation with shape of (w_offset, h_offset).
img_flip (bool): Whether the image is flipped.
img_pad_shape (tuple[int]): int tuple indicates the h & w after
padding, this is necessary to obtain features in feature map.
img_shape (tuple[int]): int tuple indicates the h & w before padding
after scaling, this is necessary for flipping coordinates.
aligned (bool): Whether use bilinear interpolation when
img_pad_shape (Tuple[int]): Int tuple indicates the h & w after
padding. This is necessary to obtain features in feature map.
img_shape (Tuple[int]): Int tuple indicates the h & w before padding
after scaling. This is necessary for flipping coordinates.
aligned (bool): Whether to use bilinear interpolation when
sampling image features for each point. Defaults to True.
padding_mode (str): Padding mode when padding values for
features of out-of-image points. Defaults to 'zeros'.
align_corners (bool): Whether to align corners when
sampling image features for each point. Defaults to True.
valid_flag (bool): Whether to filter out the points that
outside the image and with depth smaller than 0. Defaults to
False.
valid_flag (bool): Whether to filter out the points that outside
the image and with depth smaller than 0. Defaults to False.
Returns:
torch.Tensor: NxC image features sampled by point coordinates.
Tensor: NxC image features sampled by point coordinates.
"""
# apply transformation based on info in img_meta
......@@ -114,55 +117,55 @@ class PointFusion(BaseModule):
"""Fuse image features from multi-scale features.
Args:
img_channels (list[int] | int): Channels of image features.
img_channels (List[int] or int): Channels of image features.
It could be a list if the input is multi-scale image features.
pts_channels (int): Channels of point features
mid_channels (int): Channels of middle layers
out_channels (int): Channels of output fused features
img_levels (int, optional): Number of image levels. Defaults to 3.
coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'.
Defaults to 'LIDAR'.
conv_cfg (dict, optional): Dict config of conv layers of middle
layers. Defaults to None.
norm_cfg (dict, optional): Dict config of norm layers of middle
layers. Defaults to None.
act_cfg (dict, optional): Dict config of activatation layers.
img_levels (List[int] or int): Number of image levels. Defaults to 3.
coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'. Defaults to 'LIDAR'.
conv_cfg (:obj:`ConfigDict` or dict): Config dict for convolution
layers of middle layers. Defaults to None.
norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
layers of middle layers. Defaults to None.
act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
Defaults to None.
activate_out (bool, optional): Whether to apply relu activation
to output features. Defaults to True.
fuse_out (bool, optional): Whether apply conv layer to the fused
features. Defaults to False.
dropout_ratio (int, float, optional): Dropout ratio of image
features to prevent overfitting. Defaults to 0.
aligned (bool, optional): Whether apply aligned feature fusion.
init_cfg (:obj:`ConfigDict` or dict or List[:obj:`ConfigDict` or dict],
optional): Initialization config dict. Defaults to None.
activate_out (bool): Whether to apply relu activation to output
features. Defaults to True.
fuse_out (bool): Whether to apply conv layer to the fused features.
Defaults to False.
dropout_ratio (int or float): Dropout ratio of image features to
prevent overfitting. Defaults to 0.
aligned (bool): Whether to apply aligned feature fusion.
Defaults to True.
align_corners (bool): Whether to align corner when sampling features
according to points. Defaults to True.
padding_mode (str): Mode used to pad the features of points that do not
have corresponding image features. Defaults to 'zeros'.
lateral_conv (bool): Whether to apply lateral convs to image features.
Defaults to True.
align_corners (bool, optional): Whether to align corner when
sampling features according to points. Defaults to True.
padding_mode (str, optional): Mode used to pad the features of
points that do not have corresponding image features.
Defaults to 'zeros'.
lateral_conv (bool, optional): Whether to apply lateral convs
to image features. Defaults to True.
"""
def __init__(self,
img_channels,
pts_channels,
mid_channels,
out_channels,
img_levels=3,
coord_type='LIDAR',
conv_cfg=None,
norm_cfg=None,
act_cfg=None,
init_cfg=None,
activate_out=True,
fuse_out=False,
dropout_ratio=0,
aligned=True,
align_corners=True,
padding_mode='zeros',
lateral_conv=True):
img_channels: Union[List[int], int],
pts_channels: int,
mid_channels: int,
out_channels: int,
img_levels: Union[List[int], int] = 3,
coord_type: str = 'LIDAR',
conv_cfg: OptConfigType = None,
norm_cfg: OptConfigType = None,
act_cfg: OptConfigType = None,
init_cfg: OptMultiConfig = None,
activate_out: bool = True,
fuse_out: bool = False,
dropout_ratio: Union[int, float] = 0,
aligned: bool = True,
align_corners: bool = True,
padding_mode: str = 'zeros',
lateral_conv: bool = True) -> None:
super(PointFusion, self).__init__(init_cfg=init_cfg)
if isinstance(img_levels, int):
img_levels = [img_levels]
......@@ -225,18 +228,19 @@ class PointFusion(BaseModule):
dict(type='Xavier', layer='Linear', distribution='uniform')
]
def forward(self, img_feats, pts, pts_feats, img_metas):
def forward(self, img_feats: List[Tensor], pts: List[Tensor],
pts_feats: Tensor, img_metas: List[dict]) -> Tensor:
"""Forward function.
Args:
img_feats (list[torch.Tensor]): Image features.
pts: [list[torch.Tensor]]: A batch of points with shape N x 3.
pts_feats (torch.Tensor): A tensor consist of point features of the
img_feats (List[Tensor]): Image features.
pts (List[Tensor]): A batch of points with shape N x 3.
pts_feats (Tensor): A tensor consist of point features of the
total batch.
img_metas (list[dict]): Meta information of images.
img_metas (List[dict]): Meta information of images.
Returns:
torch.Tensor: Fused features of each point.
Tensor: Fused features of each point.
"""
img_pts = self.obtain_mlvl_feats(img_feats, pts, img_metas)
img_pre_fuse = self.img_transform(img_pts)
......@@ -252,17 +256,18 @@ class PointFusion(BaseModule):
return fuse_out
def obtain_mlvl_feats(self, img_feats, pts, img_metas):
def obtain_mlvl_feats(self, img_feats: List[Tensor], pts: List[Tensor],
img_metas: List[dict]) -> Tensor:
"""Obtain multi-level features for each point.
Args:
img_feats (list(torch.Tensor)): Multi-scale image features produced
img_feats (List[Tensor]): Multi-scale image features produced
by image backbone in shape (N, C, H, W).
pts (list[torch.Tensor]): Points of each sample.
img_metas (list[dict]): Meta information for each sample.
pts (List[Tensor]): Points of each sample.
img_metas (List[dict]): Meta information for each sample.
Returns:
torch.Tensor: Corresponding image features of each point.
Tensor: Corresponding image features of each point.
"""
if self.lateral_convs is not None:
img_ins = [
......@@ -285,17 +290,17 @@ class PointFusion(BaseModule):
img_pts = torch.cat(img_feats_per_point, dim=0)
return img_pts
def sample_single(self, img_feats, pts, img_meta):
def sample_single(self, img_feats: Tensor, pts: Tensor,
img_meta: dict) -> Tensor:
"""Sample features from single level image feature map.
Args:
img_feats (torch.Tensor): Image feature map in shape
(1, C, H, W).
pts (torch.Tensor): Points of a single sample.
img_feats (Tensor): Image feature map in shape (1, C, H, W).
pts (Tensor): Points of a single sample.
img_meta (dict): Meta information of the single sample.
Returns:
torch.Tensor: Single level image features of each point.
Tensor: Single level image features of each point.
"""
# TODO: image transformation also extracted
img_scale_factor = (
......@@ -324,49 +329,47 @@ class PointFusion(BaseModule):
return img_pts
def voxel_sample(voxel_features,
voxel_range,
voxel_size,
depth_samples,
proj_mat,
downsample_factor,
img_scale_factor,
img_crop_offset,
img_flip,
img_pad_shape,
img_shape,
aligned=True,
padding_mode='zeros',
align_corners=True):
def voxel_sample(voxel_features: Tensor,
voxel_range: List[float],
voxel_size: List[float],
depth_samples: Tensor,
proj_mat: Tensor,
downsample_factor: int,
img_scale_factor: Tensor,
img_crop_offset: Tensor,
img_flip: bool,
img_pad_shape: Tuple[int],
img_shape: Tuple[int],
aligned: bool = True,
padding_mode: str = 'zeros',
align_corners: bool = True) -> Tensor:
"""Obtain image features using points.
Args:
voxel_features (torch.Tensor): 1 x C x Nx x Ny x Nz voxel features.
voxel_range (list): The range of voxel features.
voxel_size (:obj:`ConfigDict` or dict): The voxel size of voxel
features.
depth_samples (torch.Tensor): N depth samples in LiDAR coordinates.
proj_mat (torch.Tensor): ORIGINAL LiDAR2img projection matrix
for N views.
voxel_features (Tensor): 1 x C x Nx x Ny x Nz voxel features.
voxel_range (List[float]): The range of voxel features.
voxel_size (List[float]): The voxel size of voxel features.
depth_samples (Tensor): N depth samples in LiDAR coordinates.
proj_mat (Tensor): ORIGINAL LiDAR2img projection matrix for N views.
downsample_factor (int): The downsample factor in rescaling.
img_scale_factor (tuple[torch.Tensor]): Scale factor with shape of
img_scale_factor (Tensor): Scale factor with shape of
(w_scale, h_scale).
img_crop_offset (tuple[torch.Tensor]): Crop offset used to crop
image during data augmentation with shape of (w_offset, h_offset).
img_crop_offset (Tensor): Crop offset used to crop image during
data augmentation with shape of (w_offset, h_offset).
img_flip (bool): Whether the image is flipped.
img_pad_shape (tuple[int]): int tuple indicates the h & w after
padding, this is necessary to obtain features in feature map.
img_shape (tuple[int]): int tuple indicates the h & w before padding
after scaling, this is necessary for flipping coordinates.
aligned (bool, optional): Whether use bilinear interpolation when
img_pad_shape (Tuple[int]): Int tuple indicates the h & w after
padding. This is necessary to obtain features in feature map.
img_shape (Tuple[int]): Int tuple indicates the h & w before padding
after scaling. This is necessary for flipping coordinates.
aligned (bool): Whether to use bilinear interpolation when
sampling image features for each point. Defaults to True.
padding_mode (str, optional): Padding mode when padding values for
padding_mode (str): Padding mode when padding values for
features of out-of-image points. Defaults to 'zeros'.
align_corners (bool, optional): Whether to align corners when
align_corners (bool): Whether to align corners when
sampling image features for each point. Defaults to True.
Returns:
torch.Tensor: 1xCxDxHxW frustum features sampled from voxel features.
Tensor: 1xCxDxHxW frustum features sampled from voxel features.
"""
# construct frustum grid
device = voxel_features.device
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Tuple
import torch
from torch import Tensor
from torch import nn as nn
from mmdet3d.registry import MODELS
......@@ -14,27 +17,33 @@ class VoteFusion(nn.Module):
"""Fuse 2d features from 3d seeds.
Args:
num_classes (int): number of classes.
max_imvote_per_pixel (int): max number of imvotes.
num_classes (int): Number of classes.
max_imvote_per_pixel (int): Max number of imvotes.
"""
def __init__(self,
             num_classes: int = 10,
             max_imvote_per_pixel: int = 3) -> None:
    """Initialize the 2D/3D cue fusion module.

    Args:
        num_classes (int): Number of classes. Defaults to 10.
        max_imvote_per_pixel (int): Max number of image votes kept per
            pixel. Defaults to 3.
    """
    super(VoteFusion, self).__init__()
    # Stored for use by ``forward`` when gathering per-pixel image votes.
    self.num_classes = num_classes
    self.max_imvote_per_pixel = max_imvote_per_pixel
def forward(self, imgs, bboxes_2d_rescaled, seeds_3d_depth, img_metas):
def forward(self, imgs: List[Tensor], bboxes_2d_rescaled: List[Tensor],
seeds_3d_depth: List[Tensor],
img_metas: List[dict]) -> Tuple[Tensor]:
"""Forward function.
Args:
imgs (list[torch.Tensor]): Image features.
bboxes_2d_rescaled (list[torch.Tensor]): 2D bboxes.
seeds_3d_depth (torch.Tensor): 3D seeds.
img_metas (list[dict]): Meta information of images.
imgs (List[Tensor]): Image features.
bboxes_2d_rescaled (List[Tensor]): 2D bboxes.
seeds_3d_depth (List[Tensor]): 3D seeds.
img_metas (List[dict]): Meta information of images.
Returns:
torch.Tensor: Concatenated cues of each point.
torch.Tensor: Validity mask of each feature.
Tuple[Tensor]:
- img_features: Concatenated cues of each point.
- masks: Validity mask of each feature.
"""
img_features = []
masks = []
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple
from mmcv.cnn import ConvModule
from mmengine.model import BaseModule
from torch import Tensor
from torch import nn as nn
from mmdet3d.utils import ConfigType, OptMultiConfig
class MLP(BaseModule):
"""A simple MLP module.
......@@ -10,26 +15,28 @@ class MLP(BaseModule):
Pass features (B, C, N) through an MLP.
Args:
in_channels (int, optional): Number of channels of input features.
Default: 18.
conv_channels (tuple[int], optional): Out channels of the convolution.
Default: (256, 256).
conv_cfg (dict, optional): Config of convolution.
Default: dict(type='Conv1d').
norm_cfg (dict, optional): Config of normalization.
Default: dict(type='BN1d').
act_cfg (dict, optional): Config of activation.
Default: dict(type='ReLU').
in_channels (int): Number of channels of input features.
Defaults to 18.
conv_channels (Tuple[int]): Out channels of the convolution.
Defaults to (256, 256).
conv_cfg (:obj:`ConfigDict` or dict): Config dict for convolution
layer. Defaults to dict(type='Conv1d').
norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
layer. Defaults to dict(type='BN1d').
act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
Defaults to dict(type='ReLU').
init_cfg (:obj:`ConfigDict` or dict or List[:obj:`Contigdict` or dict],
optional): Initialization config dict. Defaults to None.
"""
def __init__(self,
in_channel=18,
conv_channels=(256, 256),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
act_cfg=dict(type='ReLU'),
init_cfg=None):
super().__init__(init_cfg=init_cfg)
in_channel: int = 18,
conv_channels: Tuple[int] = (256, 256),
conv_cfg: ConfigType = dict(type='Conv1d'),
norm_cfg: ConfigType = dict(type='BN1d'),
act_cfg: ConfigType = dict(type='ReLU'),
init_cfg: OptMultiConfig = None) -> None:
super(MLP, self).__init__(init_cfg=init_cfg)
self.mlp = nn.Sequential()
prev_channels = in_channel
for i, conv_channel in enumerate(conv_channels):
......@@ -47,5 +54,5 @@ class MLP(BaseModule):
inplace=True))
prev_channels = conv_channels[i]
def forward(self, img_features: Tensor) -> Tensor:
    """Forward pass of the MLP.

    Args:
        img_features (Tensor): Input features of shape (B, C, N)
            (see the class docstring).

    Returns:
        Tensor: Features produced by passing the input through the
        stacked ``ConvModule`` layers in ``self.mlp``.
    """
    return self.mlp(img_features)
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from mmengine.registry import MODELS
from torch import Tensor
from torch import distributed as dist
from torch import nn as nn
from torch.autograd.function import Function
......@@ -9,7 +10,7 @@ from torch.autograd.function import Function
class AllReduce(Function):
@staticmethod
def forward(ctx, input):
def forward(ctx, input: Tensor) -> Tensor:
input_list = [
torch.zeros_like(input) for k in range(dist.get_world_size())
]
......@@ -19,7 +20,7 @@ class AllReduce(Function):
return torch.sum(inputs, dim=0)
@staticmethod
def backward(ctx, grad_output: Tensor) -> Tensor:
    """Sum the output gradient across all distributed processes.

    Args:
        grad_output (Tensor): Gradient w.r.t. the output of ``forward``.

    Returns:
        Tensor: The gradient after a synchronous (blocking) all-reduce,
        so every rank holds the summed gradient.
    """
    # ``async_op=False`` makes the reduction blocking; ``grad_output``
    # is reduced in place before being returned.
    dist.all_reduce(grad_output, async_op=False)
    return grad_output
......@@ -43,20 +44,18 @@ class NaiveSyncBatchNorm1d(nn.BatchNorm1d):
It is slower than `nn.SyncBatchNorm`.
"""
def __init__(self, *args, **kwargs) -> None:
    """Initialize the naive synced BN layer.

    All arguments are forwarded unchanged to ``nn.BatchNorm1d``.
    """
    super(NaiveSyncBatchNorm1d, self).__init__(*args, **kwargs)
def forward(self, input):
def forward(self, input: Tensor) -> Tensor:
"""
Args:
input (tensor): Has shape (N, C) or (N, C, L), where N is
input (Tensor): Has shape (N, C) or (N, C, L), where N is
the batch size, C is the number of features or
channels, and L is the sequence length
Returns:
tensor: Has shape (N, C) or (N, C, L), has same shape
as input.
Tensor: Has shape (N, C) or (N, C, L), same shape as input.
"""
assert input.dtype == torch.float32, \
f'input should be in float32 type, got {input.dtype}'
......@@ -112,17 +111,16 @@ class NaiveSyncBatchNorm2d(nn.BatchNorm2d):
It is slower than `nn.SyncBatchNorm`.
"""
def __init__(self, *args, **kwargs) -> None:
    """Initialize the naive synced BN layer.

    All arguments are forwarded unchanged to ``nn.BatchNorm2d``.
    """
    super(NaiveSyncBatchNorm2d, self).__init__(*args, **kwargs)
def forward(self, input):
def forward(self, input: Tensor) -> Tensor:
"""
Args:
Input (tensor): Feature has shape (N, C, H, W).
Input (Tensor): Feature has shape (N, C, H, W).
Returns:
tensor: Has shape (N, C, H, W), same shape as input.
Tensor: Has shape (N, C, H, W), same shape as input.
"""
assert input.dtype == torch.float32, \
f'input should be in float32 type, got {input.dtype}'
......
# Copyright (c) OpenMMLab. All rights reserved.
import copy
from typing import List, Tuple, Union
import torch
from mmcv.cnn import ConvModule, build_activation_layer, build_norm_layer
from mmcv.ops import assign_score_withk as assign_score_cuda
from mmengine.model import constant_init
from torch import Tensor
from torch import nn as nn
from torch.nn import functional as F
from mmdet3d.utils import ConfigType
from .utils import assign_kernel_withoutk, assign_score, calc_euclidian_dist
......@@ -17,33 +20,33 @@ class ScoreNet(nn.Module):
Args:
mlp_channels (List[int]): Hidden unit sizes of SharedMLP layers.
last_bn (bool, optional): Whether to use BN on the last output of mlps.
last_bn (bool): Whether to use BN on the last output of mlps.
Defaults to False.
score_norm (str, optional): Normalization function of output scores.
score_norm (str): Normalization function of output scores.
Can be 'softmax', 'sigmoid' or 'identity'. Defaults to 'softmax'.
temp_factor (float, optional): Temperature factor to scale the output
temp_factor (float): Temperature factor to scale the output
scores before softmax. Defaults to 1.0.
norm_cfg (dict, optional): Type of normalization method.
Defaults to dict(type='BN2d').
bias (bool | str, optional): If specified as `auto`, it will be decided
by the norm_cfg. Bias will be set as True if `norm_cfg` is None,
norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
layer. Defaults to dict(type='BN2d').
bias (bool or str): If specified as `auto`, it will be decided by
`norm_cfg`. `bias` will be set as True if `norm_cfg` is None,
otherwise False. Defaults to 'auto'.
Note:
The official code applies xavier_init to all Conv layers in ScoreNet,
see `PAConv <https://github.com/CVMI-Lab/PAConv/blob/main/scene_seg
/model/pointnet2/paconv.py#L105>`_. However in our experiments, we
did not find much difference in applying such xavier initialization
or not. So we neglect this initialization in our implementation.
see `PAConv <https://github.com/CVMI-Lab/PAConv/blob/main/scene_seg
/model/pointnet2/paconv.py#L105>`_. However in our experiments, we
did not find much difference in applying such xavier initialization
or not. So we neglect this initialization in our implementation.
"""
def __init__(self,
mlp_channels,
last_bn=False,
score_norm='softmax',
temp_factor=1.0,
norm_cfg=dict(type='BN2d'),
bias='auto'):
mlp_channels: List[int],
last_bn: bool = False,
score_norm: str = 'softmax',
temp_factor: float = 1.0,
norm_cfg: ConfigType = dict(type='BN2d'),
bias: Union[bool, str] = 'auto') -> None:
super(ScoreNet, self).__init__()
assert score_norm in ['softmax', 'sigmoid', 'identity'], \
......@@ -79,16 +82,16 @@ class ScoreNet(nn.Module):
act_cfg=None,
bias=bias))
def forward(self, xyz_features):
def forward(self, xyz_features: Tensor) -> Tensor:
"""Forward.
Args:
xyz_features (torch.Tensor): (B, C, N, K), features constructed
from xyz coordinates of point pairs. May contain relative
positions, Euclidean distance, etc.
xyz_features (Tensor): (B, C, N, K) Features constructed from xyz
coordinates of point pairs. May contain relative positions,
Euclidean distance, etc.
Returns:
torch.Tensor: (B, N, K, M), predicted scores for `M` kernels.
Tensor: (B, N, K, M) Predicted scores for `M` kernels.
"""
scores = self.mlps(xyz_features) # (B, M, N, K)
......@@ -116,43 +119,49 @@ class PAConv(nn.Module):
in_channels (int): Input channels of point features.
out_channels (int): Output channels of point features.
num_kernels (int): Number of kernel weights in the weight bank.
norm_cfg (dict, optional): Type of normalization method.
Defaults to dict(type='BN2d', momentum=0.1).
act_cfg (dict, optional): Type of activation method.
norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
layer. Defaults to dict(type='BN2d', momentum=0.1).
act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
Defaults to dict(type='ReLU', inplace=True).
scorenet_input (str, optional): Type of input to ScoreNet.
scorenet_input (str): Type of input to ScoreNet.
Can be 'identity', 'w_neighbor' or 'w_neighbor_dist'.
Defaults to 'w_neighbor_dist'.
weight_bank_init (str, optional): Init method of weight bank kernels.
weight_bank_init (str): Init method of weight bank kernels.
Can be 'kaiming' or 'xavier'. Defaults to 'kaiming'.
kernel_input (str, optional): Input features to be multiplied with
kernel weights. Can be 'identity' or 'w_neighbor'.
kernel_input (str): Input features to be multiplied with kernel
weights. Can be 'identity' or 'w_neighbor'.
Defaults to 'w_neighbor'.
scorenet_cfg (dict, optional): Config of the ScoreNet module, which
may contain the following keys and values:
scorenet_cfg (dict): Config of the ScoreNet module, which may contain
the following keys and values:
- mlp_channels (List[int]): Hidden units of MLPs.
- score_norm (str): Normalization function of output scores.
Can be 'softmax', 'sigmoid' or 'identity'.
Can be 'softmax', 'sigmoid' or 'identity'.
- temp_factor (float): Temperature factor to scale the output
scores before softmax.
scores before softmax.
- last_bn (bool): Whether to use BN on the last output of mlps.
Defaults to dict(mlp_channels=[16, 16, 16],
score_norm='softmax',
temp_factor=1.0,
last_bn=False).
"""
def __init__(self,
in_channels,
out_channels,
num_kernels,
norm_cfg=dict(type='BN2d', momentum=0.1),
act_cfg=dict(type='ReLU', inplace=True),
scorenet_input='w_neighbor_dist',
weight_bank_init='kaiming',
kernel_input='w_neighbor',
scorenet_cfg=dict(
mlp_channels=[16, 16, 16],
score_norm='softmax',
temp_factor=1.0,
last_bn=False)):
def __init__(
self,
in_channels: int,
out_channels: int,
num_kernels: int,
norm_cfg: ConfigType = dict(type='BN2d', momentum=0.1),
act_cfg: ConfigType = dict(type='ReLU', inplace=True),
scorenet_input: str = 'w_neighbor_dist',
weight_bank_init: str = 'kaiming',
kernel_input: str = 'w_neighbor',
scorenet_cfg: dict = dict(
mlp_channels=[16, 16, 16],
score_norm='softmax',
temp_factor=1.0,
last_bn=False)
) -> None:
super(PAConv, self).__init__()
# determine weight kernel size according to used features
......@@ -218,21 +227,20 @@ class PAConv(nn.Module):
self.init_weights()
def init_weights(self):
def init_weights(self) -> None:
"""Initialize weights of shared MLP layers and BN layers."""
if self.bn is not None:
constant_init(self.bn, val=1, bias=0)
def _prepare_scorenet_input(self, points_xyz):
def _prepare_scorenet_input(self, points_xyz: Tensor) -> Tensor:
"""Prepare input point pairs features for self.ScoreNet.
Args:
points_xyz (torch.Tensor): (B, 3, npoint, K)
Coordinates of the grouped points.
points_xyz (Tensor): (B, 3, npoint, K) Coordinates of the
grouped points.
Returns:
torch.Tensor: (B, C, npoint, K)
The generated features per point pair.
Tensor: (B, C, npoint, K) The generated features per point pair.
"""
B, _, npoint, K = points_xyz.size()
center_xyz = points_xyz[..., :1].repeat(1, 1, 1, K)
......@@ -250,22 +258,22 @@ class PAConv(nn.Module):
dim=1)
return xyz_features
def forward(self, inputs):
def forward(self, inputs: Tuple[Tensor]) -> Tuple[Tensor]:
"""Forward.
Args:
inputs (tuple(torch.Tensor)):
inputs (Tuple[Tensor]):
- features (torch.Tensor): (B, in_c, npoint, K)
Features of the queried points.
- points_xyz (torch.Tensor): (B, 3, npoint, K)
Coordinates of the grouped points.
- features (Tensor): (B, in_c, npoint, K)
Features of the queried points.
- points_xyz (Tensor): (B, 3, npoint, K)
Coordinates of the grouped points.
Returns:
Tuple[torch.Tensor]:
Tuple[Tensor]:
- new_features: (B, out_c, npoint, K), features after PAConv.
- points_xyz: same as input.
- new_features: (B, out_c, npoint, K) Features after PAConv.
- points_xyz: Same as input.
"""
features, points_xyz = inputs
B, _, npoint, K = features.size()
......@@ -315,20 +323,22 @@ class PAConvCUDA(PAConv):
more detailed descriptions.
"""
def __init__(self,
in_channels,
out_channels,
num_kernels,
norm_cfg=dict(type='BN2d', momentum=0.1),
act_cfg=dict(type='ReLU', inplace=True),
scorenet_input='w_neighbor_dist',
weight_bank_init='kaiming',
kernel_input='w_neighbor',
scorenet_cfg=dict(
mlp_channels=[8, 16, 16],
score_norm='softmax',
temp_factor=1.0,
last_bn=False)):
def __init__(
self,
in_channels: int,
out_channels: int,
num_kernels: int,
norm_cfg: ConfigType = dict(type='BN2d', momentum=0.1),
act_cfg: ConfigType = dict(type='ReLU', inplace=True),
scorenet_input: str = 'w_neighbor_dist',
weight_bank_init: str = 'kaiming',
kernel_input: str = 'w_neighbor',
scorenet_cfg: dict = dict(
mlp_channels=[8, 16, 16],
score_norm='softmax',
temp_factor=1.0,
last_bn=False)
) -> None:
super(PAConvCUDA, self).__init__(
in_channels=in_channels,
out_channels=out_channels,
......@@ -343,27 +353,27 @@ class PAConvCUDA(PAConv):
assert self.kernel_input == 'w_neighbor', \
'CUDA implemented PAConv only supports w_neighbor kernel_input'
def forward(self, inputs):
def forward(self, inputs: Tuple[Tensor]) -> Tuple[Tensor]:
"""Forward.
Args:
inputs (tuple(torch.Tensor)):
inputs (Tuple[Tensor]):
- features (torch.Tensor): (B, in_c, N)
Features of all points in the current point cloud.
Different from non-CUDA version PAConv, here the features
are not grouped by each center to form a K dim.
- points_xyz (torch.Tensor): (B, 3, npoint, K)
Coordinates of the grouped points.
- points_idx (torch.Tensor): (B, npoint, K)
Index of the grouped points.
- features (Tensor): (B, in_c, N)
Features of all points in the current point cloud.
Different from non-CUDA version PAConv, here the features
are not grouped by each center to form a K dim.
- points_xyz (Tensor): (B, 3, npoint, K)
Coordinates of the grouped points.
- points_idx (Tensor): (B, npoint, K)
Index of the grouped points.
Returns:
Tuple[torch.Tensor]:
Tuple[Tensor]:
- new_features: (B, out_c, npoint, K), features after PAConv.
- points_xyz: same as input.
- points_idx: same as input.
- new_features: (B, out_c, npoint, K) Features after PAConv.
- points_xyz: Same as input.
- points_idx: Same as input.
"""
features, points_xyz, points_idx = inputs
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple
import torch
from torch import Tensor
def calc_euclidian_dist(xyz1: Tensor, xyz2: Tensor) -> Tensor:
    """Calculate the Euclidean distance between two sets of points.

    Args:
        xyz1 (Tensor): (N, 3) The first set of points.
        xyz2 (Tensor): (N, 3) The second set of points.

    Returns:
        Tensor: (N, ) The Euclidean distance between each point pair.
    """
    assert xyz1.shape[0] == xyz2.shape[0], 'number of points are not the same'
    # NOTE(review): the original assertion message was truncated in the
    # available source; the text below is a reconstruction.
    assert xyz1.shape[1] == xyz2.shape[1] == 3, \
        'points coordinates dimension is not 3'
    return torch.norm(xyz1 - xyz2, dim=-1)
def assign_score(scores, point_features):
def assign_score(scores: Tensor, point_features: Tensor) -> Tensor:
"""Perform weighted sum to aggregate output features according to scores.
This function is used in non-CUDA version of PAConv.
Compared to the cuda op assigh_score_withk, this pytorch implementation
pre-computes output features for the neighbors of all centers, and then
performs aggregation. It consumes more GPU memories.
pre-computes output features for the neighbors of all centers, and then
performs aggregation. It consumes more GPU memories.
Args:
scores (torch.Tensor): (B, npoint, K, M), predicted scores to
scores (Tensor): (B, npoint, K, M) Predicted scores to
aggregate weight matrices in the weight bank.
`npoint` is the number of sampled centers.
`K` is the number of queried neighbors.
`M` is the number of weight matrices in the weight bank.
point_features (torch.Tensor): (B, npoint, K, M, out_dim)
point_features (Tensor): (B, npoint, K, M, out_dim)
Pre-computed point features to be aggregated.
Returns:
torch.Tensor: (B, npoint, K, out_dim), the aggregated features.
Tensor: (B, npoint, K, out_dim) The aggregated features.
"""
B, npoint, K, M = scores.size()
scores = scores.view(B, npoint, K, 1, M)
......@@ -44,21 +47,22 @@ def assign_score(scores, point_features):
return output
def assign_kernel_withoutk(features, kernels, M):
def assign_kernel_withoutk(features: Tensor, kernels: Tensor,
M: int) -> Tuple[Tensor]:
"""Pre-compute features with weight matrices in weight bank. This function
is used before cuda op assign_score_withk in CUDA version PAConv.
Args:
features (torch.Tensor): (B, in_dim, N), input features of all points.
features (Tensor): (B, in_dim, N) Input features of all points.
`N` is the number of points in current point cloud.
kernels (torch.Tensor): (2 * in_dim, M * out_dim), weight matrices in
kernels (Tensor): (2 * in_dim, M * out_dim) Weight matrices in
the weight bank, transformed from (M, 2 * in_dim, out_dim).
`2 * in_dim` is because the input features are concatenation of
(point_features - center_features, point_features).
M (int): Number of weight matrices in the weight bank.
Returns:
Tuple[torch.Tensor]: both of shape (B, N, M, out_dim):
Tuple[Tensor]: Both of shape (B, N, M, out_dim).
- point_features: Pre-computed features for points.
- center_features: Pre-computed features for centers.
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Union
from mmengine.registry import Registry
from torch import nn as nn
# Registry collecting PointNet2 set abstraction (SA) module classes;
# entries are meant to be looked up by ``build_sa_module`` below.
SA_MODULES = Registry('point_sa_module')
def build_sa_module(cfg, *args, **kwargs):
def build_sa_module(cfg: Union[dict, None], *args, **kwargs) -> nn.Module:
"""Build PointNet2 set abstraction (SA) module.
Args:
cfg (None or dict): The SA module config, which should contain:
cfg (dict or None): The SA module config, which should contain:
- type (str): Module type.
- module args: Args needed to instantiate an SA module.
args (argument list): Arguments passed to the `__init__`
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Optional, Tuple, Union
import torch
from torch import Tensor
from torch import nn as nn
from mmdet3d.models.layers.paconv import PAConv, PAConvCUDA
from mmdet3d.utils import ConfigType
from .builder import SA_MODULES
from .point_sa_module import BasePointSAModule
......@@ -16,52 +20,81 @@ class PAConvSAModuleMSG(BasePointSAModule):
See the `paper <https://arxiv.org/abs/2103.14635>`_ for more details.
Args:
paconv_num_kernels (list[list[int]]): Number of kernel weights in the
num_point (int): Number of points.
radii (List[float]): List of radius in each ball query.
sample_nums (List[int]): Number of samples in each ball query.
mlp_channels (List[List[int]]): Specify of the pointnet before
the global pooling for each scale.
paconv_num_kernels (List[List[int]]): Number of kernel weights in the
weight banks of each layer's PAConv.
paconv_kernel_input (str, optional): Input features to be multiplied
fps_mod (List[str]): Type of FPS method, valid mod
['F-FPS', 'D-FPS', 'FS']. Defaults to ['D-FPS'].
- F-FPS: Using feature distances for FPS.
- D-FPS: Using Euclidean distances of points for FPS.
- FS: Using F-FPS and D-FPS simultaneously.
fps_sample_range_list (List[int]): Range of points to apply FPS.
Defaults to [-1].
dilated_group (bool): Whether to use dilated ball query.
Defaults to False.
norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
layer. Defaults to dict(type='BN2d', momentum=0.1).
use_xyz (bool): Whether to use xyz. Defaults to True.
pool_mod (str): Type of pooling method. Defaults to 'max'.
normalize_xyz (bool): Whether to normalize local XYZ with radius.
Defaults to False.
bias (bool or str): If specified as `auto`, it will be decided by
`norm_cfg`. `bias` will be set as True if `norm_cfg` is None,
otherwise False. Defaults to 'auto'.
paconv_kernel_input (str): Input features to be multiplied
with kernel weights. Can be 'identity' or 'w_neighbor'.
Defaults to 'w_neighbor'.
scorenet_input (str, optional): Type of the input to ScoreNet.
scorenet_input (str): Type of the input to ScoreNet.
Defaults to 'w_neighbor_dist'. Can be the following values:
- 'identity': Use xyz coordinates as input.
- 'w_neighbor': Use xyz coordinates and the difference with center
points as input.
points as input.
- 'w_neighbor_dist': Use xyz coordinates, the difference with
center points and the Euclidean distance as input.
scorenet_cfg (dict, optional): Config of the ScoreNet module, which
center points and the Euclidean distance as input.
scorenet_cfg (dict): Config of the ScoreNet module, which
may contain the following keys and values:
- mlp_channels (List[int]): Hidden units of MLPs.
- score_norm (str): Normalization function of output scores.
Can be 'softmax', 'sigmoid' or 'identity'.
Can be 'softmax', 'sigmoid' or 'identity'.
- temp_factor (float): Temperature factor to scale the output
scores before softmax.
scores before softmax.
- last_bn (bool): Whether to use BN on the last output of mlps.
Defaults to dict(mlp_channels=[16, 16, 16],
score_norm='softmax',
temp_factor=1.0,
last_bn=False).
"""
def __init__(self,
num_point,
radii,
sample_nums,
mlp_channels,
paconv_num_kernels,
fps_mod=['D-FPS'],
fps_sample_range_list=[-1],
dilated_group=False,
norm_cfg=dict(type='BN2d', momentum=0.1),
use_xyz=True,
pool_mod='max',
normalize_xyz=False,
bias='auto',
paconv_kernel_input='w_neighbor',
scorenet_input='w_neighbor_dist',
scorenet_cfg=dict(
mlp_channels=[16, 16, 16],
score_norm='softmax',
temp_factor=1.0,
last_bn=False)):
def __init__(
self,
num_point: int,
radii: List[float],
sample_nums: List[int],
mlp_channels: List[List[int]],
paconv_num_kernels: List[List[int]],
fps_mod: List[str] = ['D-FPS'],
fps_sample_range_list: List[int] = [-1],
dilated_group: bool = False,
norm_cfg: ConfigType = dict(type='BN2d', momentum=0.1),
use_xyz: bool = True,
pool_mod: str = 'max',
normalize_xyz: bool = False,
bias: Union[bool, str] = 'auto',
paconv_kernel_input: str = 'w_neighbor',
scorenet_input: str = 'w_neighbor_dist',
scorenet_cfg: dict = dict(
mlp_channels=[16, 16, 16],
score_norm='softmax',
temp_factor=1.0,
last_bn=False)
) -> None:
super(PAConvSAModuleMSG, self).__init__(
num_point=num_point,
radii=radii,
......@@ -114,25 +147,27 @@ class PAConvSAModule(PAConvSAModuleMSG):
<https://arxiv.org/abs/2103.14635>`_ for more details.
"""
def __init__(self,
mlp_channels,
paconv_num_kernels,
num_point=None,
radius=None,
num_sample=None,
norm_cfg=dict(type='BN2d', momentum=0.1),
use_xyz=True,
pool_mod='max',
fps_mod=['D-FPS'],
fps_sample_range_list=[-1],
normalize_xyz=False,
paconv_kernel_input='w_neighbor',
scorenet_input='w_neighbor_dist',
scorenet_cfg=dict(
mlp_channels=[16, 16, 16],
score_norm='softmax',
temp_factor=1.0,
last_bn=False)):
def __init__(
self,
mlp_channels: List[int],
paconv_num_kernels: List[int],
num_point: Optional[int] = None,
radius: Optional[float] = None,
num_sample: Optional[int] = None,
norm_cfg: ConfigType = dict(type='BN2d', momentum=0.1),
use_xyz: bool = True,
pool_mod: str = 'max',
fps_mod: List[str] = ['D-FPS'],
fps_sample_range_list: List[int] = [-1],
normalize_xyz: bool = False,
paconv_kernel_input: str = 'w_neighbor',
scorenet_input: str = 'w_neighbor_dist',
scorenet_cfg: dict = dict(
mlp_channels=[16, 16, 16],
score_norm='softmax',
temp_factor=1.0,
last_bn=False)
) -> None:
super(PAConvSAModule, self).__init__(
mlp_channels=[mlp_channels],
paconv_num_kernels=[paconv_num_kernels],
......@@ -160,27 +195,29 @@ class PAConvCUDASAModuleMSG(BasePointSAModule):
for more details.
"""
def __init__(self,
num_point,
radii,
sample_nums,
mlp_channels,
paconv_num_kernels,
fps_mod=['D-FPS'],
fps_sample_range_list=[-1],
dilated_group=False,
norm_cfg=dict(type='BN2d', momentum=0.1),
use_xyz=True,
pool_mod='max',
normalize_xyz=False,
bias='auto',
paconv_kernel_input='w_neighbor',
scorenet_input='w_neighbor_dist',
scorenet_cfg=dict(
mlp_channels=[8, 16, 16],
score_norm='softmax',
temp_factor=1.0,
last_bn=False)):
def __init__(
self,
num_point: int,
radii: List[float],
sample_nums: List[int],
mlp_channels: List[List[int]],
paconv_num_kernels: List[List[int]],
fps_mod: List[str] = ['D-FPS'],
fps_sample_range_list: List[int] = [-1],
dilated_group: bool = False,
norm_cfg: ConfigType = dict(type='BN2d', momentum=0.1),
use_xyz: bool = True,
pool_mod: str = 'max',
normalize_xyz: bool = False,
bias: Union[bool, str] = 'auto',
paconv_kernel_input: str = 'w_neighbor',
scorenet_input: str = 'w_neighbor_dist',
scorenet_cfg: dict = dict(
mlp_channels=[8, 16, 16],
score_norm='softmax',
temp_factor=1.0,
last_bn=False)
) -> None:
super(PAConvCUDASAModuleMSG, self).__init__(
num_point=num_point,
radii=radii,
......@@ -230,29 +267,31 @@ class PAConvCUDASAModuleMSG(BasePointSAModule):
def forward(
self,
points_xyz,
features=None,
indices=None,
target_xyz=None,
):
"""forward.
points_xyz: Tensor,
features: Optional[Tensor] = None,
indices: Optional[Tensor] = None,
target_xyz: Optional[Tensor] = None,
) -> Tuple[Tensor]:
"""Forward.
Args:
points_xyz (Tensor): (B, N, 3) xyz coordinates of the features.
features (Tensor, optional): (B, C, N) features of each point.
Default: None.
Defaults to None.
indices (Tensor, optional): (B, num_point) Index of the features.
Default: None.
Defaults to None.
target_xyz (Tensor, optional): (B, M, 3) new coords of the outputs.
Default: None.
Defaults to None.
Returns:
Tensor: (B, M, 3) where M is the number of points.
New features xyz.
Tensor: (B, M, sum_k(mlps[k][-1])) where M is the number
of points. New feature descriptors.
Tensor: (B, M) where M is the number of points.
Index of the features.
Tuple[Tensor]:
- new_xyz: (B, M, 3) where M is the number of points.
New features xyz.
- new_features: (B, M, sum_k(mlps[k][-1])) where M is the
number of points. New feature descriptors.
- indices: (B, M) where M is the number of points.
Index of the features.
"""
new_features_list = []
......@@ -306,25 +345,27 @@ class PAConvCUDASAModule(PAConvCUDASAModuleMSG):
for more details.
"""
def __init__(self,
mlp_channels,
paconv_num_kernels,
num_point=None,
radius=None,
num_sample=None,
norm_cfg=dict(type='BN2d', momentum=0.1),
use_xyz=True,
pool_mod='max',
fps_mod=['D-FPS'],
fps_sample_range_list=[-1],
normalize_xyz=False,
paconv_kernel_input='w_neighbor',
scorenet_input='w_neighbor_dist',
scorenet_cfg=dict(
mlp_channels=[8, 16, 16],
score_norm='softmax',
temp_factor=1.0,
last_bn=False)):
def __init__(
self,
mlp_channels: List[int],
paconv_num_kernels: List[int],
num_point: Optional[int] = None,
radius: Optional[float] = None,
num_sample: Optional[int] = None,
norm_cfg: ConfigType = dict(type='BN2d', momentum=0.1),
use_xyz: bool = True,
pool_mod: str = 'max',
fps_mod: List[str] = ['D-FPS'],
fps_sample_range_list: List[int] = [-1],
normalize_xyz: bool = False,
paconv_kernel_input: str = 'w_neighbor',
scorenet_input: str = 'w_neighbor_dist',
scorenet_cfg: dict = dict(
mlp_channels=[8, 16, 16],
score_norm='softmax',
temp_factor=1.0,
last_bn=False)
) -> None:
super(PAConvCUDASAModule, self).__init__(
mlp_channels=[mlp_channels],
paconv_num_kernels=[paconv_num_kernels],
......
......@@ -5,8 +5,11 @@ import torch
from mmcv.cnn import ConvModule
from mmcv.ops import three_interpolate, three_nn
from mmengine.model import BaseModule
from torch import Tensor
from torch import nn as nn
from mmdet3d.utils import ConfigType, OptMultiConfig
class PointFPModule(BaseModule):
"""Point feature propagation module used in PointNets.
......@@ -15,16 +18,17 @@ class PointFPModule(BaseModule):
Args:
mlp_channels (list[int]): List of mlp channels.
norm_cfg (dict, optional): Type of normalization method.
Default: dict(type='BN2d').
norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
layer. Defaults to dict(type='BN2d').
init_cfg (:obj:`ConfigDict` or dict or List[:obj:`Contigdict` or dict],
optional): Initialization config dict. Defaults to None.
"""
def __init__(self,
mlp_channels: List[int],
norm_cfg: dict = dict(type='BN2d'),
init_cfg=None):
super().__init__(init_cfg=init_cfg)
self.fp16_enabled = False
norm_cfg: ConfigType = dict(type='BN2d'),
init_cfg: OptMultiConfig = None) -> None:
super(PointFPModule, self).__init__(init_cfg=init_cfg)
self.mlps = nn.Sequential()
for i in range(len(mlp_channels) - 1):
self.mlps.add_module(
......@@ -37,23 +41,22 @@ class PointFPModule(BaseModule):
conv_cfg=dict(type='Conv2d'),
norm_cfg=norm_cfg))
def forward(self, target: torch.Tensor, source: torch.Tensor,
target_feats: torch.Tensor,
source_feats: torch.Tensor) -> torch.Tensor:
"""forward.
def forward(self, target: Tensor, source: Tensor, target_feats: Tensor,
source_feats: Tensor) -> Tensor:
"""Forward.
Args:
target (Tensor): (B, n, 3) tensor of the xyz positions of
target (Tensor): (B, n, 3) Tensor of the xyz positions of
the target features.
source (Tensor): (B, m, 3) tensor of the xyz positions of
source (Tensor): (B, m, 3) Tensor of the xyz positions of
the source features.
target_feats (Tensor): (B, C1, n) tensor of the features to be
target_feats (Tensor): (B, C1, n) Tensor of the features to be
propagated to.
source_feats (Tensor): (B, C2, m) tensor of features
source_feats (Tensor): (B, C2, m) Tensor of features
to be propagated.
Return:
Tensor: (B, M, N) M = mlp[-1], tensor of the target features.
Tensor: (B, M, N) M = mlp[-1], Tensor of the target features.
"""
if source is not None:
dist, idx = three_nn(target, source)
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Optional, Tuple, Union
import torch
from mmcv.cnn import ConvModule
from mmcv.ops import GroupAll
from mmcv.ops import PointsSampler as Points_Sampler
from mmcv.ops import QueryAndGroup, gather_points
from torch import Tensor
from torch import nn as nn
from torch.nn import functional as F
from mmdet3d.models.layers import PAConv
from mmdet3d.utils import ConfigType
from .builder import SA_MODULES
......@@ -16,44 +20,43 @@ class BasePointSAModule(nn.Module):
Args:
num_point (int): Number of points.
radii (list[float]): List of radius in each ball query.
sample_nums (list[int]): Number of samples in each ball query.
mlp_channels (list[list[int]]): Specify of the pointnet before
radii (List[float]): List of radius in each ball query.
sample_nums (List[int]): Number of samples in each ball query.
mlp_channels (List[List[int]]): Specify of the pointnet before
the global pooling for each scale.
fps_mod (list[str], optional): Type of FPS method, valid mod
['F-FPS', 'D-FPS', 'FS'], Default: ['D-FPS'].
F-FPS: using feature distances for FPS.
D-FPS: using Euclidean distances of points for FPS.
FS: using F-FPS and D-FPS simultaneously.
fps_sample_range_list (list[int], optional):
Range of points to apply FPS. Default: [-1].
dilated_group (bool, optional): Whether to use dilated ball query.
Default: False.
use_xyz (bool, optional): Whether to use xyz.
Default: True.
pool_mod (str, optional): Type of pooling method.
Default: 'max_pool'.
normalize_xyz (bool, optional): Whether to normalize local XYZ
with radius. Default: False.
grouper_return_grouped_xyz (bool, optional): Whether to return
grouped xyz in `QueryAndGroup`. Defaults to False.
grouper_return_grouped_idx (bool, optional): Whether to return
grouped idx in `QueryAndGroup`. Defaults to False.
fps_mod (List[str]): Type of FPS method, valid mod
['F-FPS', 'D-FPS', 'FS']. Defaults to ['D-FPS'].
- F-FPS: using feature distances for FPS.
- D-FPS: using Euclidean distances of points for FPS.
- FS: using F-FPS and D-FPS simultaneously.
fps_sample_range_list (List[int]): Range of points to apply FPS.
Defaults to [-1].
dilated_group (bool): Whether to use dilated ball query.
Defaults to False.
use_xyz (bool): Whether to use xyz. Defaults to True.
pool_mod (str): Type of pooling method. Defaults to 'max'.
normalize_xyz (bool): Whether to normalize local XYZ with radius.
Defaults to False.
grouper_return_grouped_xyz (bool): Whether to return grouped xyz
in `QueryAndGroup`. Defaults to False.
grouper_return_grouped_idx (bool): Whether to return grouped idx
in `QueryAndGroup`. Defaults to False.
"""
def __init__(self,
num_point,
radii,
sample_nums,
mlp_channels,
fps_mod=['D-FPS'],
fps_sample_range_list=[-1],
dilated_group=False,
use_xyz=True,
pool_mod='max',
normalize_xyz=False,
grouper_return_grouped_xyz=False,
grouper_return_grouped_idx=False):
num_point: int,
radii: List[float],
sample_nums: List[int],
mlp_channels: List[List[int]],
fps_mod: List[str] = ['D-FPS'],
fps_sample_range_list: List[int] = [-1],
dilated_group: bool = False,
use_xyz: bool = True,
pool_mod: str = 'max',
normalize_xyz: bool = False,
grouper_return_grouped_xyz: bool = False,
grouper_return_grouped_idx: bool = False) -> None:
super(BasePointSAModule, self).__init__()
assert len(radii) == len(sample_nums) == len(mlp_channels)
......@@ -109,7 +112,8 @@ class BasePointSAModule(nn.Module):
grouper = GroupAll(use_xyz)
self.groupers.append(grouper)
def _sample_points(self, points_xyz, features, indices, target_xyz):
def _sample_points(self, points_xyz: Tensor, features: Tensor,
indices: Tensor, target_xyz: Tensor) -> Tuple[Tensor]:
"""Perform point sampling based on inputs.
If `indices` is specified, directly sample corresponding points.
......@@ -118,13 +122,15 @@ class BasePointSAModule(nn.Module):
Args:
points_xyz (Tensor): (B, N, 3) xyz coordinates of the features.
features (Tensor): (B, C, N) features of each point.
features (Tensor): (B, C, N) Features of each point.
indices (Tensor): (B, num_point) Index of the features.
target_xyz (Tensor): (B, M, 3) new_xyz coordinates of the outputs.
Returns:
Tensor: (B, num_point, 3) sampled xyz coordinates of points.
Tensor: (B, num_point) sampled points' index.
Tuple[Tensor]:
- new_xyz: (B, num_point, 3) Sampled xyz coordinates of points.
- indices: (B, num_point) Sampled points' index.
"""
xyz_flipped = points_xyz.transpose(1, 2).contiguous()
if indices is not None:
......@@ -143,16 +149,15 @@ class BasePointSAModule(nn.Module):
return new_xyz, indices
def _pool_features(self, features):
def _pool_features(self, features: Tensor) -> Tensor:
"""Perform feature aggregation using pooling operation.
Args:
features (torch.Tensor): (B, C, N, K)
Features of locally grouped points before pooling.
features (Tensor): (B, C, N, K) Features of locally grouped
points before pooling.
Returns:
torch.Tensor: (B, C, N)
Pooled features aggregating local information.
Tensor: (B, C, N) Pooled features aggregating local information.
"""
if self.pool_mod == 'max':
# (B, C, N, 1)
......@@ -169,29 +174,31 @@ class BasePointSAModule(nn.Module):
def forward(
self,
points_xyz,
features=None,
indices=None,
target_xyz=None,
):
"""forward.
points_xyz: Tensor,
features: Optional[Tensor] = None,
indices: Optional[Tensor] = None,
target_xyz: Optional[Tensor] = None,
) -> Tuple[Tensor]:
"""Forward.
Args:
points_xyz (Tensor): (B, N, 3) xyz coordinates of the features.
features (Tensor, optional): (B, C, N) features of each point.
Default: None.
features (Tensor, optional): (B, C, N) Features of each point.
Defaults to None.
indices (Tensor, optional): (B, num_point) Index of the features.
Default: None.
target_xyz (Tensor, optional): (B, M, 3) new coords of the outputs.
Default: None.
Defaults to None.
target_xyz (Tensor, optional): (B, M, 3) New coords of the outputs.
Defaults to None.
Returns:
Tensor: (B, M, 3) where M is the number of points.
New features xyz.
Tensor: (B, M, sum_k(mlps[k][-1])) where M is the number
of points. New feature descriptors.
Tensor: (B, M) where M is the number of points.
Index of the features.
Tuple[Tensor]:
- new_xyz: (B, M, 3) Where M is the number of points.
New features xyz.
- new_features: (B, M, sum_k(mlps[k][-1])) Where M is the
number of points. New feature descriptors.
- indices: (B, M) Where M is the number of points.
Index of the features.
"""
new_features_list = []
......@@ -229,45 +236,44 @@ class PointSAModuleMSG(BasePointSAModule):
Args:
num_point (int): Number of points.
radii (list[float]): List of radius in each ball query.
sample_nums (list[int]): Number of samples in each ball query.
mlp_channels (list[list[int]]): Specify of the pointnet before
radii (List[float]): List of radius in each ball query.
sample_nums (List[int]): Number of samples in each ball query.
mlp_channels (List[List[int]]): Specify of the pointnet before
the global pooling for each scale.
fps_mod (list[str], optional): Type of FPS method, valid mod
['F-FPS', 'D-FPS', 'FS'], Default: ['D-FPS'].
F-FPS: using feature distances for FPS.
D-FPS: using Euclidean distances of points for FPS.
FS: using F-FPS and D-FPS simultaneously.
fps_sample_range_list (list[int], optional): Range of points to
apply FPS. Default: [-1].
dilated_group (bool, optional): Whether to use dilated ball query.
Default: False.
norm_cfg (dict, optional): Type of normalization method.
Default: dict(type='BN2d').
use_xyz (bool, optional): Whether to use xyz.
Default: True.
pool_mod (str, optional): Type of pooling method.
Default: 'max_pool'.
normalize_xyz (bool, optional): Whether to normalize local XYZ
with radius. Default: False.
bias (bool | str, optional): If specified as `auto`, it will be
decided by `norm_cfg`. `bias` will be set as True if
`norm_cfg` is None, otherwise False. Default: 'auto'.
fps_mod (List[str]): Type of FPS method, valid mod
['F-FPS', 'D-FPS', 'FS']. Defaults to ['D-FPS'].
- F-FPS: using feature distances for FPS.
- D-FPS: using Euclidean distances of points for FPS.
- FS: using F-FPS and D-FPS simultaneously.
fps_sample_range_list (List[int]): Range of points to apply FPS.
Defaults to [-1].
dilated_group (bool): Whether to use dilated ball query.
Defaults to False.
norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
layer. Defaults to dict(type='BN2d').
use_xyz (bool): Whether to use xyz. Defaults to True.
pool_mod (str): Type of pooling method. Defaults to 'max'.
normalize_xyz (bool): Whether to normalize local XYZ with radius.
Defaults to False.
bias (bool or str): If specified as `auto`, it will be decided by
`norm_cfg`. `bias` will be set as True if `norm_cfg` is None,
otherwise False. Defaults to 'auto'.
"""
def __init__(self,
num_point,
radii,
sample_nums,
mlp_channels,
fps_mod=['D-FPS'],
fps_sample_range_list=[-1],
dilated_group=False,
norm_cfg=dict(type='BN2d'),
use_xyz=True,
pool_mod='max',
normalize_xyz=False,
bias='auto'):
num_point: int,
radii: List[float],
sample_nums: List[int],
mlp_channels: List[List[int]],
fps_mod: List[str] = ['D-FPS'],
fps_sample_range_list: List[int] = [-1],
dilated_group: bool = False,
norm_cfg: ConfigType = dict(type='BN2d'),
use_xyz: bool = True,
pool_mod: str = 'max',
normalize_xyz: bool = False,
bias: Union[bool, str] = 'auto') -> None:
super(PointSAModuleMSG, self).__init__(
num_point=num_point,
radii=radii,
......@@ -306,39 +312,35 @@ class PointSAModule(PointSAModuleMSG):
PointNets.
Args:
mlp_channels (list[int]): Specify of the pointnet before
mlp_channels (List[int]): Specify of the pointnet before
the global pooling for each scale.
num_point (int, optional): Number of points.
Default: None.
radius (float, optional): Radius to group with.
Default: None.
num_point (int, optional): Number of points. Defaults to None.
radius (float, optional): Radius to group with. Defaults to None.
num_sample (int, optional): Number of samples in each ball query.
Default: None.
norm_cfg (dict, optional): Type of normalization method.
Default: dict(type='BN2d').
use_xyz (bool, optional): Whether to use xyz.
Default: True.
pool_mod (str, optional): Type of pooling method.
Default: 'max_pool'.
fps_mod (list[str], optional): Type of FPS method, valid mod
['F-FPS', 'D-FPS', 'FS'], Default: ['D-FPS'].
fps_sample_range_list (list[int], optional): Range of points
to apply FPS. Default: [-1].
normalize_xyz (bool, optional): Whether to normalize local XYZ
with radius. Default: False.
Defaults to None.
norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
layer. Default to dict(type='BN2d').
use_xyz (bool): Whether to use xyz. Defaults to True.
pool_mod (str): Type of pooling method. Defaults to 'max'.
fps_mod (List[str]): Type of FPS method, valid mod
['F-FPS', 'D-FPS', 'FS']. Defaults to ['D-FPS'].
fps_sample_range_list (List[int]): Range of points to apply FPS.
Defaults to [-1].
normalize_xyz (bool): Whether to normalize local XYZ with radius.
Defaults to False.
"""
def __init__(self,
mlp_channels,
num_point=None,
radius=None,
num_sample=None,
norm_cfg=dict(type='BN2d'),
use_xyz=True,
pool_mod='max',
fps_mod=['D-FPS'],
fps_sample_range_list=[-1],
normalize_xyz=False):
mlp_channels: List[int],
num_point: Optional[int] = None,
radius: Optional[float] = None,
num_sample: Optional[int] = None,
norm_cfg: ConfigType = dict(type='BN2d'),
use_xyz: bool = True,
pool_mod: str = 'max',
fps_mod: List[str] = ['D-FPS'],
fps_sample_range_list: List[int] = [-1],
normalize_xyz: bool = False) -> None:
super(PointSAModule, self).__init__(
mlp_channels=[mlp_channels],
num_point=num_point,
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple, Union
from mmcv.cnn import build_conv_layer, build_norm_layer
from mmdet.models.backbones.resnet import BasicBlock, Bottleneck
from torch import nn
from mmdet3d.utils import OptConfigType
from .spconv import IS_SPCONV2_AVAILABLE
if IS_SPCONV2_AVAILABLE:
from spconv.pytorch import SparseModule, SparseSequential
from spconv.pytorch import SparseConvTensor, SparseModule, SparseSequential
else:
from mmcv.ops import SparseModule, SparseSequential
from mmcv.ops import SparseConvTensor, SparseModule, SparseSequential
def replace_feature(out, new_features):
def replace_feature(out: SparseConvTensor,
new_features: SparseConvTensor) -> SparseConvTensor:
if 'replace_feature' in out.__dir__():
# spconv 2.x behaviour
return out.replace_feature(new_features)
......@@ -26,25 +30,26 @@ class SparseBottleneck(Bottleneck, SparseModule):
Bottleneck block implemented with submanifold sparse convolution.
Args:
inplanes (int): inplanes of block.
planes (int): planes of block.
stride (int, optional): stride of the first block. Default: 1.
downsample (Module, optional): down sample module for block.
conv_cfg (dict, optional): dictionary to construct and config conv
layer. Default: None.
norm_cfg (dict, optional): dictionary to construct and config norm
layer. Default: dict(type='BN').
inplanes (int): Inplanes of block.
planes (int): Planes of block.
stride (int or Tuple[int]): Stride of the first block. Defaults to 1.
downsample (Module, optional): Down sample module for block.
Defaults to None.
conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
convolution layer. Defaults to None.
norm_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
normalization layer. Defaults to None.
"""
expansion = 4
def __init__(self,
inplanes,
planes,
stride=1,
downsample=None,
conv_cfg=None,
norm_cfg=None):
inplanes: int,
planes: int,
stride: Union[int, Tuple[int]] = 1,
downsample: nn.Module = None,
conv_cfg: OptConfigType = None,
norm_cfg: OptConfigType = None) -> None:
SparseModule.__init__(self)
Bottleneck.__init__(
......@@ -56,7 +61,7 @@ class SparseBottleneck(Bottleneck, SparseModule):
conv_cfg=conv_cfg,
norm_cfg=norm_cfg)
def forward(self, x):
def forward(self, x: SparseConvTensor) -> SparseConvTensor:
identity = x.features
out = self.conv1(x)
......@@ -85,25 +90,26 @@ class SparseBasicBlock(BasicBlock, SparseModule):
Sparse basic block implemented with submanifold sparse convolution.
Args:
inplanes (int): inplanes of block.
planes (int): planes of block.
stride (int, optional): stride of the first block. Default: 1.
downsample (Module, optional): down sample module for block.
conv_cfg (dict, optional): dictionary to construct and config conv
layer. Default: None.
norm_cfg (dict, optional): dictionary to construct and config norm
layer. Default: dict(type='BN').
inplanes (int): Inplanes of block.
planes (int): Planes of block.
stride (int or Tuple[int]): Stride of the first block. Defaults to 1.
downsample (Module, optional): Down sample module for block.
Defaults to None.
conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
convolution layer. Defaults to None.
norm_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
normalization layer. Defaults to None.
"""
expansion = 1
def __init__(self,
inplanes,
planes,
stride=1,
downsample=None,
conv_cfg=None,
norm_cfg=None):
inplanes: int,
planes: int,
stride: Union[int, Tuple[int]] = 1,
downsample: nn.Module = None,
conv_cfg: OptConfigType = None,
norm_cfg: OptConfigType = None) -> None:
SparseModule.__init__(self)
BasicBlock.__init__(
self,
......@@ -114,7 +120,7 @@ class SparseBasicBlock(BasicBlock, SparseModule):
conv_cfg=conv_cfg,
norm_cfg=norm_cfg)
def forward(self, x):
def forward(self, x: SparseConvTensor) -> SparseConvTensor:
identity = x.features
assert x.features.dim() == 2, f'x.features.dim()={x.features.dim()}'
......@@ -134,29 +140,33 @@ class SparseBasicBlock(BasicBlock, SparseModule):
return out
def make_sparse_convmodule(in_channels,
out_channels,
kernel_size,
indice_key,
stride=1,
padding=0,
conv_type='SubMConv3d',
norm_cfg=None,
order=('conv', 'norm', 'act')):
def make_sparse_convmodule(
in_channels: int,
out_channels: int,
kernel_size: Union[int, Tuple[int]],
indice_key: str,
stride: Union[int, Tuple[int]] = 1,
padding: Union[int, Tuple[int]] = 0,
conv_type: str = 'SubMConv3d',
norm_cfg: OptConfigType = None,
order: Tuple[str] = ('conv', 'norm', 'act')
) -> SparseSequential:
"""Make sparse convolution module.
Args:
in_channels (int): the number of input channels
out_channels (int): the number of out channels
kernel_size (int|tuple(int)): kernel size of convolution
indice_key (str): the indice key used for sparse tensor
stride (int|tuple(int)): the stride of convolution
padding (int or list[int]): the padding number of input
conv_type (str): sparse conv type in spconv
norm_cfg (dict[str]): config of normalization layer
order (tuple[str]): The order of conv/norm/activation layers. It is a
in_channels (int): The number of input channels.
out_channels (int): The number of out channels.
kernel_size (int | Tuple[int]): Kernel size of convolution.
indice_key (str): The indice key used for sparse tensor.
stride (int or tuple[int]): The stride of convolution.
padding (int or tuple[int]): The padding number of input.
conv_type (str): Sparse conv type in spconv. Defaults to 'SubMConv3d'.
norm_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
normalization layer. Defaults to None.
order (Tuple[str]): The order of conv/norm/activation layers. It is a
sequence of "conv", "norm" and "act". Common examples are
("conv", "norm", "act") and ("act", "conv", "norm").
Defaults to ('conv', 'norm', 'act').
Returns:
spconv.SparseSequential: sparse convolution module.
......
# Copyright (c) OpenMMLab. All rights reserved.
import itertools
from typing import List, OrderedDict
from mmengine.registry import MODELS
from torch.nn.parameter import Parameter
def register_spconv2():
def register_spconv2() -> bool:
"""This func registers spconv2.0 spconv ops to overwrite the default mmcv
spconv ops."""
try:
......@@ -39,8 +40,10 @@ def register_spconv2():
return True
def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
missing_keys, unexpected_keys, error_msgs):
def _load_from_state_dict(self, state_dict: OrderedDict, prefix: str,
local_metadata: dict, strict: bool,
missing_keys: List[str], unexpected_keys: List[str],
error_msgs: List[str]) -> None:
"""Rewrite this func to compat the convolutional kernel weights between
spconv 1.x in MMCV and 2.x in spconv2.x.
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Optional
from mmcv.cnn.bricks.transformer import MultiheadAttention
from mmengine.registry import MODELS
from torch import Tensor
from torch import nn as nn
from mmdet3d.utils import ConfigType, OptMultiConfig
@MODELS.register_module()
class GroupFree3DMHA(MultiheadAttention):
......@@ -15,40 +20,42 @@ class GroupFree3DMHA(MultiheadAttention):
embed_dims (int): The embedding dimension.
num_heads (int): Parallel attention heads. Same as
`nn.MultiheadAttention`.
attn_drop (float, optional): A Dropout layer on attn_output_weights.
attn_drop (float): A Dropout layer on attn_output_weights.
Defaults to 0.0.
proj_drop (float, optional): A Dropout layer. Defaults to 0.0.
dropout_layer (obj:`ConfigDict`, optional): The dropout_layer used
when adding the shortcut.
init_cfg (obj:`mmengine.ConfigDict`, optional): The Config for
initialization. Default: None.
batch_first (bool, optional): Key, Query and Value are shape of
(batch, n, embed_dim)
or (n, batch, embed_dim). Defaults to False.
proj_drop (float): A Dropout layer. Defaults to 0.0.
dropout_layer (ConfigType): The dropout_layer used when adding
the shortcut. Defaults to dict(type='DropOut', drop_prob=0.).
init_cfg (:obj:`ConfigDict` or dict or List[:obj:`Contigdict` or dict],
optional): Initialization config dict. Defaults to None.
batch_first (bool): Key, Query and Value are shape of
(batch, n, embed_dim) or (n, batch, embed_dim).
Defaults to False.
"""
def __init__(self,
embed_dims,
num_heads,
attn_drop=0.,
proj_drop=0.,
dropout_layer=dict(type='DropOut', drop_prob=0.),
init_cfg=None,
batch_first=False,
**kwargs):
super().__init__(embed_dims, num_heads, attn_drop, proj_drop,
dropout_layer, init_cfg, batch_first, **kwargs)
embed_dims: int,
num_heads: int,
attn_drop: float = 0.,
proj_drop: float = 0.,
dropout_layer: ConfigType = dict(
type='DropOut', drop_prob=0.),
init_cfg: OptMultiConfig = None,
batch_first: bool = False,
**kwargs) -> None:
super(GroupFree3DMHA,
self).__init__(embed_dims, num_heads, attn_drop, proj_drop,
dropout_layer, init_cfg, batch_first, **kwargs)
def forward(self,
query,
key,
value,
identity,
query_pos=None,
key_pos=None,
attn_mask=None,
key_padding_mask=None,
**kwargs):
query: Tensor,
key: Tensor,
value: Tensor,
identity: Tensor,
query_pos: Optional[Tensor] = None,
key_pos: Optional[Tensor] = None,
attn_mask: Optional[Tensor] = None,
key_padding_mask: Optional[Tensor] = None,
**kwargs) -> Tensor:
"""Forward function for `GroupFree3DMHA`.
**kwargs allow passing a more general data flow when combining
......@@ -81,7 +88,7 @@ class GroupFree3DMHA(MultiheadAttention):
Defaults to None.
Returns:
Tensor: forwarded results with shape [num_queries, bs, embed_dims].
Tensor: Forwarded results with shape [num_queries, bs, embed_dims].
"""
if hasattr(self, 'operation_name'):
......@@ -113,26 +120,26 @@ class ConvBNPositionalEncoding(nn.Module):
"""Absolute position embedding with Conv learning.
Args:
input_channel (int): input features dim.
num_pos_feats (int, optional): output position features dim.
input_channel (int): Input features dim.
num_pos_feats (int): Output position features dim.
Defaults to 288 to be consistent with seed features dim.
"""
def __init__(self, input_channel, num_pos_feats=288):
super().__init__()
def __init__(self, input_channel: int, num_pos_feats: int = 288) -> None:
super(ConvBNPositionalEncoding, self).__init__()
self.position_embedding_head = nn.Sequential(
nn.Conv1d(input_channel, num_pos_feats, kernel_size=1),
nn.BatchNorm1d(num_pos_feats), nn.ReLU(inplace=True),
nn.Conv1d(num_pos_feats, num_pos_feats, kernel_size=1))
def forward(self, xyz):
def forward(self, xyz: Tensor) -> Tensor:
"""Forward pass.
Args:
xyz (Tensor) (B, N, 3) the coordinates to embed.
xyz (Tensor): (B, N, 3) The coordinates to embed.
Returns:
Tensor: (B, num_pos_feats, N) the embedded position features.
Tensor: (B, num_pos_feats, N) The embedded position features.
"""
xyz = xyz.permute(0, 2, 1)
position_embedding = self.position_embedding_head(xyz)
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Tuple
import torch
from mmcv.cnn import ConvModule
from mmengine import is_tuple_of
from torch import Tensor
from torch import nn as nn
from mmdet3d.models.builder import build_loss
from mmdet3d.registry import MODELS
from mmdet3d.utils import ConfigType, OptConfigType
class VoteModule(nn.Module):
......@@ -14,41 +18,41 @@ class VoteModule(nn.Module):
Args:
in_channels (int): Number of channels of seed point features.
vote_per_seed (int, optional): Number of votes generated from
each seed point. Default: 1.
gt_per_seed (int, optional): Number of ground truth votes generated
from each seed point. Default: 3.
num_points (int, optional): Number of points to be used for voting.
Default: 1.
conv_channels (tuple[int], optional): Out channels of vote
generating convolution. Default: (16, 16).
conv_cfg (dict, optional): Config of convolution.
Default: dict(type='Conv1d').
norm_cfg (dict, optional): Config of normalization.
Default: dict(type='BN1d').
norm_feats (bool, optional): Whether to normalize features.
Default: True.
with_res_feat (bool, optional): Whether to predict residual features.
Default: True.
vote_xyz_range (list[float], optional):
The range of points translation. Default: None.
vote_loss (dict, optional): Config of vote loss. Default: None.
vote_per_seed (int): Number of votes generated from each seed point.
Defaults to 1.
gt_per_seed (int): Number of ground truth votes generated from each
seed point. Defaults to 3.
num_points (int): Number of points to be used for voting.
Defaults to 1.
conv_channels (tuple[int]): Out channels of vote generating
convolution. Defaults to (16, 16).
conv_cfg (:obj:`ConfigDict` or dict): Config dict for convolution
layer. Defaults to dict(type='Conv1d').
norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
layer. Defaults to dict(type='BN1d').
norm_feats (bool): Whether to normalize features. Default to True.
with_res_feat (bool): Whether to predict residual features.
Defaults to True.
vote_xyz_range (List[float], optional): The range of points
translation. Defaults to None.
vote_loss (:obj:`ConfigDict` or dict, optional): Config of vote loss.
Defaults to None.
"""
def __init__(self,
in_channels,
vote_per_seed=1,
gt_per_seed=3,
num_points=-1,
conv_channels=(16, 16),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
act_cfg=dict(type='ReLU'),
norm_feats=True,
with_res_feat=True,
vote_xyz_range=None,
vote_loss=None):
super().__init__()
in_channels: int,
vote_per_seed: int = 1,
gt_per_seed: int = 3,
num_points: int = -1,
conv_channels: Tuple[int] = (16, 16),
conv_cfg: ConfigType = dict(type='Conv1d'),
norm_cfg: ConfigType = dict(type='BN1d'),
act_cfg: ConfigType = dict(type='ReLU'),
norm_feats: bool = True,
with_res_feat: bool = True,
vote_xyz_range: List[float] = None,
vote_loss: OptConfigType = None) -> None:
super(VoteModule, self).__init__()
self.in_channels = in_channels
self.vote_per_seed = vote_per_seed
self.gt_per_seed = gt_per_seed
......@@ -60,7 +64,7 @@ class VoteModule(nn.Module):
self.vote_xyz_range = vote_xyz_range
if vote_loss is not None:
self.vote_loss = build_loss(vote_loss)
self.vote_loss = MODELS.build(vote_loss)
prev_channels = in_channels
vote_conv_list = list()
......@@ -86,23 +90,24 @@ class VoteModule(nn.Module):
out_channel = 3 * self.vote_per_seed
self.conv_out = nn.Conv1d(prev_channels, out_channel, 1)
def forward(self, seed_points, seed_feats):
"""forward.
def forward(self, seed_points: Tensor,
seed_feats: Tensor) -> Tuple[Tensor]:
"""Forward.
Args:
seed_points (torch.Tensor): Coordinate of the seed
points in shape (B, N, 3).
seed_feats (torch.Tensor): Features of the seed points in shape
seed_points (Tensor): Coordinate of the seed points in shape
(B, N, 3).
seed_feats (Tensor): Features of the seed points in shape
(B, C, N).
Returns:
tuple[torch.Tensor]:
Tuple[torch.Tensor]:
- vote_points: Voted xyz based on the seed points
with shape (B, M, 3), ``M=num_seed*vote_per_seed``.
with shape (B, M, 3), ``M=num_seed*vote_per_seed``.
- vote_features: Voted features based on the seed points with
shape (B, C, M) where ``M=num_seed*vote_per_seed``,
``C=vote_feature_dim``.
shape (B, C, M) where ``M=num_seed*vote_per_seed``,
``C=vote_feature_dim``.
"""
if self.num_points != -1:
assert self.num_points < seed_points.shape[1], \
......@@ -150,19 +155,20 @@ class VoteModule(nn.Module):
vote_feats = seed_feats
return vote_points, vote_feats, offset
def get_loss(self, seed_points, vote_points, seed_indices,
vote_targets_mask, vote_targets):
def get_loss(self, seed_points: Tensor, vote_points: Tensor,
seed_indices: Tensor, vote_targets_mask: Tensor,
vote_targets: Tensor) -> Tensor:
"""Calculate loss of voting module.
Args:
seed_points (torch.Tensor): Coordinate of the seed points.
vote_points (torch.Tensor): Coordinate of the vote points.
seed_indices (torch.Tensor): Indices of seed points in raw points.
vote_targets_mask (torch.Tensor): Mask of valid vote targets.
vote_targets (torch.Tensor): Targets of votes.
seed_points (Tensor): Coordinate of the seed points.
vote_points (Tensor): Coordinate of the vote points.
seed_indices (Tensor): Indices of seed points in raw points.
vote_targets_mask (Tensor): Mask of valid vote targets.
vote_targets (Tensor): Targets of votes.
Returns:
torch.Tensor: Weighted vote loss.
Tensor: Weighted vote loss.
"""
batch_size, num_seed = seed_points.shape[:2]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment