[Enhance] refine docstring and typehint in `segmentor` (#2254)

* refactor segmentor * replace meta info during inference * Update decode_head.py * update * update docs

[Enhance] refine docstring and typehint in `segmentor` (#2254)
* refactor segmentor * replace meta info during inference * Update decode_head.py * update * update docs
1587d48f · Xiang Xu · GitHub · 68102441 · 1587d48f · 1587d48f
Unverified Commit 1587d48f authored Feb 14, 2023 by Xiang Xu Committed by GitHub Feb 14, 2023
6 changed files
--- a/mmdet3d/models/decode_heads/decode_head.py
+++ b/mmdet3d/models/decode_heads/decode_head.py
 # Copyright (c) OpenMMLab. All rights reserved.
 from abc import ABCMeta, abstractmethod
-from typing import List
+from typing import Dict, List

 import torch
 from mmengine.model import BaseModule, normal_init
@@ -9,7 +9,7 @@ from torch import nn as nn

 from mmdet3d.registry import MODELS
 from mmdet3d.structures.det3d_data_sample import SampleList
-from mmdet3d.utils.typing_utils import ConfigType
+from mmdet3d.utils.typing_utils import ConfigType, OptMultiConfig


 class Base3DDecodeHead(BaseModule, metaclass=ABCMeta):
@@ -42,35 +42,35 @@ class Base3DDecodeHead(BaseModule, metaclass=ABCMeta):
        channels (int): Channels after modules, before conv_seg.
        num_classes (int): Number of classes.
        dropout_ratio (float): Ratio of dropout layer. Defaults to 0.5.
-        conv_cfg (dict): Config of conv layers.
+        conv_cfg (dict or :obj:`ConfigDict`): Config of conv layers.
            Defaults to dict(type='Conv1d').
-        norm_cfg (dict): Config of norm layers.
+        norm_cfg (dict or :obj:`ConfigDict`): Config of norm layers.
            Defaults to dict(type='BN1d').
-        act_cfg (dict): Config of activation layers.
+        act_cfg (dict or :obj:`ConfigDict`): Config of activation layers.
            Defaults to dict(type='ReLU').
-        loss_decode (dict): Config of decode loss.
-            Defaults to dict(type='CrossEntropyLoss').
-        ignore_index (int): The label index to be ignored.
-            When using masked BCE loss, ignore_index should be set to None.
-            Defaults to 255.
-        init_cfg (dict or list[dict], optional): Initialization config dict.
-            Defaults to None.
+        loss_decode (dict or :obj:`ConfigDict`): Config of decode loss.
+            Defaults to dict(type='mmdet.CrossEntropyLoss', use_sigmoid=False,
+            class_weight=None, loss_weight=1.0).
+        ignore_index (int): The label index to be ignored. When using masked
+            BCE loss, ignore_index should be set to None. Defaults to 255.
+        init_cfg (dict or :obj:`ConfigDict` or list[dict or :obj:`ConfigDict`],
+            optional): Initialization config dict. Defaults to None.
    """

    def __init__(self,
-                 channels,
-                 num_classes,
-                 dropout_ratio=0.5,
-                 conv_cfg=dict(type='Conv1d'),
-                 norm_cfg=dict(type='BN1d'),
-                 act_cfg=dict(type='ReLU'),
-                 loss_decode=dict(
+                 channels: int,
+                 num_classes: int,
+                 dropout_ratio: float = 0.5,
+                 conv_cfg: ConfigType = dict(type='Conv1d'),
+                 norm_cfg: ConfigType = dict(type='BN1d'),
+                 act_cfg: ConfigType = dict(type='ReLU'),
+                 loss_decode: ConfigType = dict(
                     type='mmdet.CrossEntropyLoss',
                     use_sigmoid=False,
                     class_weight=None,
                     loss_weight=1.0),
-                 ignore_index=255,
-                 init_cfg=None) -> None:
+                 ignore_index: int = 255,
+                 init_cfg: OptMultiConfig = None) -> None:
        super(Base3DDecodeHead, self).__init__(init_cfg=init_cfg)
        self.channels = channels
        self.num_classes = num_classes
@@ -87,13 +87,13 @@ class Base3DDecodeHead(BaseModule, metaclass=ABCMeta):
        else:
            self.dropout = None

-    def init_weights(self):
+    def init_weights(self) -> None:
        """Initialize weights of classification layer."""
        super().init_weights()
        normal_init(self.conv_seg, mean=0, std=0.01)

    @abstractmethod
-    def forward(self, feats_dict: dict):
+    def forward(self, feats_dict: dict) -> Tensor:
        """Placeholder of forward function."""
        pass

@@ -104,34 +104,33 @@ class Base3DDecodeHead(BaseModule, metaclass=ABCMeta):
        output = self.conv_seg(feat)
        return output

-    def loss(self, inputs: List[Tensor], batch_data_samples: SampleList,
-             train_cfg: ConfigType) -> dict:
+    def loss(self, inputs: dict, batch_data_samples: SampleList,
+             train_cfg: ConfigType) -> Dict[str, Tensor]:
        """Forward function for training.

        Args:
-            inputs (list[torch.Tensor]): List of multi-level point features.
-            batch_data_samples (List[:obj:`Det3DDataSample`]): The seg
-                data samples. It usually includes information such
-                as `metainfo` and `gt_pts_seg`.
-            train_cfg (dict): The training config.
+            inputs (dict): Feature dict from backbone.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The seg data
+                samples. It usually includes information such as `metainfo` and
+                `gt_pts_seg`.
+            train_cfg (dict or :obj:`ConfigDict`): The training config.

        Returns:
-            dict[str, Tensor]: a dictionary of loss components
+            Dict[str, Tensor]: A dictionary of loss components.
        """
        seg_logits = self.forward(inputs)
        losses = self.loss_by_feat(seg_logits, batch_data_samples)
        return losses

-    def predict(self, inputs: List[Tensor], batch_input_metas: List[dict],
-                test_cfg: ConfigType) -> List[Tensor]:
+    def predict(self, inputs: dict, batch_input_metas: List[dict],
+                test_cfg: ConfigType) -> Tensor:
        """Forward function for testing.

        Args:
-            inputs (list[Tensor]): List of multi-level point features.
-            batch_data_samples (List[:obj:`Det3DDataSample`]): The seg
-                data samples. It usually includes information such
-                as `metainfo` and `gt_pts_seg`.
-            test_cfg (dict): The testing config.
+            inputs (dict): Feature dict from backbone.
+            batch_input_metas (List[dict]): Meta information of a batch of
+                samples.
+            test_cfg (dict or :obj:`ConfigDict`): The testing config.

        Returns:
            Tensor: Output segmentation map.
@@ -148,15 +147,18 @@ class Base3DDecodeHead(BaseModule, metaclass=ABCMeta):
        return torch.stack(gt_semantic_segs, dim=0)

    def loss_by_feat(self, seg_logit: Tensor,
-                     batch_data_samples: SampleList) -> dict:
+                     batch_data_samples: SampleList) -> Dict[str, Tensor]:
        """Compute semantic segmentation loss.

        Args:
-            seg_logit (torch.Tensor): Predicted per-point segmentation logits
-                of shape [B, num_classes, N].
-            batch_data_samples (List[:obj:`Det3DDataSample`]): The seg
-                data samples. It usually includes information such
-                as `metainfo` and `gt_pts_seg`.
+            seg_logit (Tensor): Predicted per-point segmentation logits of
+                shape [B, num_classes, N].
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The seg data
+                samples. It usually includes information such as `metainfo` and
+                `gt_pts_seg`.
+
+        Returns:
+            Dict[str, Tensor]: A dictionary of loss components.
        """
        seg_label = self._stack_batch_gt(batch_data_samples)
        loss = dict()

--- a/mmdet3d/models/decode_heads/dgcnn_head.py
+++ b/mmdet3d/models/decode_heads/dgcnn_head.py
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Tuple
+from typing import Sequence

 from mmcv.cnn.bricks import ConvModule
 from torch import Tensor
@@ -18,11 +18,12 @@ class DGCNNHead(Base3DDecodeHead):
    `reimplementation code <https://github.com/AnTao97/dgcnn.pytorch>`_.

    Args:
-        fp_channels (tuple[int], optional): Tuple of mlp channels in feature
+        fp_channels (Sequence[int]): Tuple of mlp channels in feature
            propagation (FP) modules. Defaults to (1216, 512).
    """

-    def __init__(self, fp_channels: Tuple = (1216, 512), **kwargs) -> None:
+    def __init__(self, fp_channels: Sequence[int] = (1216, 512),
+                 **kwargs) -> None:
        super(DGCNNHead, self).__init__(**kwargs)

        self.FP_module = DGCNNFPModule(
@@ -45,7 +46,7 @@ class DGCNNHead(Base3DDecodeHead):
            feat_dict (dict): Feature dict from backbone.

        Returns:
-            torch.Tensor: points for decoder.
+            torch.Tensor: Points for decoder.
        """
        fa_points = feat_dict['fa_points']

@@ -58,7 +59,7 @@ class DGCNNHead(Base3DDecodeHead):
            feat_dict (dict): Feature dict from backbone.

        Returns:
-            torch.Tensor: Segmentation map of shape [B, num_classes, N].
+            Tensor: Segmentation map of shape [B, num_classes, N].
        """
        fa_points = self._extract_input(feat_dict)


--- a/mmdet3d/models/decode_heads/paconv_head.py
+++ b/mmdet3d/models/decode_heads/paconv_head.py
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Tuple
+from typing import Sequence

 from mmcv.cnn.bricks import ConvModule
 from torch import Tensor
@@ -17,16 +17,19 @@ class PAConvHead(PointNet2Head):
    Refer to the `official code <https://github.com/CVMI-Lab/PAConv>`_.

    Args:
-        fp_channels (tuple[tuple[int]]): Tuple of mlp channels in FP modules.
-        fp_norm_cfg (dict): Config of norm layers used in FP modules.
-            Default: dict(type='BN2d').
+        fp_channels (Sequence[Sequence[int]]): Tuple of mlp channels in FP
+            modules. Defaults to ((768, 256, 256), (384, 256, 256),
+            (320, 256, 128), (128 + 6, 128, 128, 128)).
+        fp_norm_cfg (dict or :obj:`ConfigDict`): Config of norm layers used in
+            FP modules. Defaults to dict(type='BN2d').
    """

    def __init__(self,
-                 fp_channels: Tuple[Tuple[int]] = ((768, 256, 256),
-                                                   (384, 256, 256), (320, 256,
-                                                                     128),
-                                                   (128 + 6, 128, 128, 128)),
+                 fp_channels: Sequence[Sequence[int]] = ((768, 256, 256),
+                                                         (384, 256, 256),
+                                                         (320, 256,
+                                                          128), (128 + 6, 128,
+                                                                 128, 128)),
                 fp_norm_cfg: ConfigType = dict(type='BN2d'),
                 **kwargs) -> None:
        super(PAConvHead, self).__init__(

--- a/mmdet3d/models/decode_heads/pointnet2_head.py
+++ b/mmdet3d/models/decode_heads/pointnet2_head.py
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Tuple
+from typing import List, Sequence, Tuple

 from mmcv.cnn.bricks import ConvModule
 from torch import Tensor
@@ -19,15 +19,17 @@ class PointNet2Head(Base3DDecodeHead):
    Refer to the `official code <https://github.com/charlesq34/pointnet2>`_.

    Args:
-        fp_channels (tuple[tuple[int]]): Tuple of mlp channels in FP modules.
-        fp_norm_cfg (dict): Config of norm layers used in FP modules.
-            Default: dict(type='BN2d').
+        fp_channels (Sequence[Sequence[int]]): Tuple of mlp channels in FP
+            modules. Defaults to ((768, 256, 256), (384, 256, 256),
+            (320, 256, 128), (128, 128, 128, 128)).
+        fp_norm_cfg (dict or :obj:`ConfigDict`): Config of norm layers used
+            in FP modules. Defaults to dict(type='BN2d').
    """

    def __init__(self,
-                 fp_channels: Tuple[Tuple[int]] = ((768, 256, 256),
-                                                   (384, 256, 256), (320, 256,
-                                                                     128),
+                 fp_channels: Sequence[Sequence[int]] = ((768, 256, 256),
+                                                         (384, 256, 256),
+                                                         (320, 256, 128),
                                                         (128, 128, 128, 128)),
                 fp_norm_cfg: ConfigType = dict(type='BN2d'),
                 **kwargs) -> None:
@@ -49,15 +51,16 @@ class PointNet2Head(Base3DDecodeHead):
            norm_cfg=self.norm_cfg,
            act_cfg=self.act_cfg)

-    def _extract_input(self, feat_dict: dict) -> Tensor:
+    def _extract_input(self,
+                       feat_dict: dict) -> Tuple[List[Tensor], List[Tensor]]:
        """Extract inputs from features dictionary.

        Args:
            feat_dict (dict): Feature dict from backbone.

        Returns:
-            list[torch.Tensor]: Coordinates of multiple levels of points.
-            list[torch.Tensor]: Features of multiple levels of points.
+            Tuple[List[Tensor], List[Tensor]]: Coordinates and features of
+            multiple levels of points.
        """
        sa_xyz = feat_dict['sa_xyz']
        sa_features = feat_dict['sa_features']
@@ -72,7 +75,7 @@ class PointNet2Head(Base3DDecodeHead):
            feat_dict (dict): Feature dict from backbone.

        Returns:
-            torch.Tensor: Segmentation map of shape [B, num_classes, N].
+            Tensor: Segmentation map of shape [B, num_classes, N].
        """
        sa_xyz, sa_features = self._extract_input(feat_dict)


--- a/mmdet3d/models/segmentors/base.py
+++ b/mmdet3d/models/segmentors/base.py
 # Copyright (c) OpenMMLab. All rights reserved.
 from abc import ABCMeta, abstractmethod
-from typing import List, Tuple, Union
+from typing import Dict, List, Union

 from mmengine.model import BaseModel
 from torch import Tensor

-from mmdet3d.structures import Det3DDataSample, PointData
+from mmdet3d.structures import PointData
 from mmdet3d.structures.det3d_data_sample import (ForwardResults,
                                                  OptSampleList, SampleList)
 from mmdet3d.utils import OptConfigType, OptMultiConfig
@@ -15,12 +15,12 @@ class Base3DSegmentor(BaseModel, metaclass=ABCMeta):
    """Base class for 3D segmentors.

    Args:
-        data_preprocessor (dict, optional): Model preprocessing config
-            for processing the input data. it usually includes
-            ``to_rgb``, ``pad_size_divisor``, ``pad_val``,
-            ``mean`` and ``std``. Default to None.
-       init_cfg (dict, optional): the config to control the
-           initialization. Default to None.
+        data_preprocessor (dict or ConfigDict, optional): Model preprocessing
+            config for processing the input data. It usually includes
+            ``to_rgb``, ``pad_size_divisor``, ``pad_val``, ``mean`` and
+            ``std``. Defaults to None.
+        init_cfg (dict or ConfigDict, optional): The config to control the
+            initialization. Defaults to None.
    """

    def __init__(self,
@@ -31,34 +31,34 @@ class Base3DSegmentor(BaseModel, metaclass=ABCMeta):

    @property
    def with_neck(self) -> bool:
-        """bool: whether the segmentor has neck"""
+        """bool: Whether the segmentor has neck."""
        return hasattr(self, 'neck') and self.neck is not None

    @property
    def with_auxiliary_head(self) -> bool:
-        """bool: whether the segmentor has auxiliary head"""
+        """bool: Whether the segmentor has auxiliary head."""
        return hasattr(self,
                       'auxiliary_head') and self.auxiliary_head is not None

    @property
    def with_decode_head(self) -> bool:
-        """bool: whether the segmentor has decode head"""
+        """bool: Whether the segmentor has decode head."""
        return hasattr(self, 'decode_head') and self.decode_head is not None

    @property
    def with_regularization_loss(self) -> bool:
-        """bool: whether the segmentor has regularization loss for weight"""
+        """bool: Whether the segmentor has regularization loss for weight."""
        return hasattr(self, 'loss_regularization') and \
            self.loss_regularization is not None

    @abstractmethod
-    def extract_feat(self, batch_inputs: Tensor) -> bool:
+    def extract_feat(self, batch_inputs: Tensor) -> dict:
        """Placeholder for extracting features from images."""
        pass

    @abstractmethod
    def encode_decode(self, batch_inputs: Tensor,
-                      batch_data_samples: SampleList):
+                      batch_data_samples: SampleList) -> Tensor:
        """Placeholder for encode images with backbone and decode into a
        semantic segmentation map of the same size as input."""
        pass
@@ -82,12 +82,12 @@ class Base3DSegmentor(BaseModel, metaclass=ABCMeta):
        optimizer updating, which are done in the :meth:`train_step`.

        Args:
-            inputs (dict | List[dict]): Input sample dict which
-                includes 'points' and 'imgs' keys.
+            inputs (dict or List[dict]): Input sample dict which includes
+                'points' and 'imgs' keys.

-                - points (list[torch.Tensor]): Point cloud of each sample.
-                - imgs (torch.Tensor): Image tensor has shape (B, C, H, W).
-            data_samples (list[:obj:`Det3DDataSample`], optional):
+                - points (List[Tensor]): Point cloud of each sample.
+                - imgs (Tensor): Image tensor has shape (B, C, H, W).
+            data_samples (List[:obj:`Det3DDataSample`], optional):
                The annotation data of every sample. Defaults to None.
            mode (str): Return what kind of value. Defaults to 'tensor'.

@@ -109,23 +109,22 @@ class Base3DSegmentor(BaseModel, metaclass=ABCMeta):
                               'Only supports loss, predict and tensor mode')

    @abstractmethod
-    def loss(self, batch_inputs: Tensor,
-             batch_data_samples: SampleList) -> dict:
+    def loss(self, batch_inputs: dict,
+             batch_data_samples: SampleList) -> Dict[str, Tensor]:
        """Calculate losses from a batch of inputs and data samples."""
        pass

    @abstractmethod
-    def predict(self, batch_inputs: Tensor,
+    def predict(self, batch_inputs: dict,
                batch_data_samples: SampleList) -> SampleList:
        """Predict results from a batch of inputs and data samples with post-
        processing."""
        pass

    @abstractmethod
-    def _forward(
-            self,
-            batch_inputs: Tensor,
-            batch_data_samples: OptSampleList = None) -> Tuple[List[Tensor]]:
+    def _forward(self,
+                 batch_inputs: dict,
+                 batch_data_samples: OptSampleList = None) -> Tensor:
        """Network forward process.

        Usually includes backbone, neck and head forward without any post-
@@ -134,33 +133,31 @@ class Base3DSegmentor(BaseModel, metaclass=ABCMeta):
        pass

    @abstractmethod
-    def aug_test(self, batch_inputs, batch_img_metas):
+    def aug_test(self, batch_inputs, batch_data_samples):
        """Placeholder for augmentation test."""
        pass

    def postprocess_result(self, seg_pred_list: List[dict],
-                           batch_img_metas: List[dict]) -> list:
+                           batch_data_samples: SampleList) -> SampleList:
        """Convert results list to `Det3DDataSample`.

        Args:
            seg_pred_list (List[dict]): List of segmentation results,
                seg_logits from model of each input point clouds sample.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The det3d data
+                samples. It usually includes information such as `metainfo` and
+                `gt_pts_seg`.

        Returns:
-            list[:obj:`Det3DDataSample`]: Segmentation results of the
-            input images. Each Det3DDataSample usually contain:
+            List[:obj:`Det3DDataSample`]: Segmentation results of the input
+            points. Each Det3DDataSample usually contains:

-            - ``pred_pts_seg``(PixelData): Prediction of 3D
-                semantic segmentation.
+            - ``pred_pts_seg`` (PointData): Prediction of 3D semantic
+              segmentation.
        """
-        predictions = []

        for i in range(len(seg_pred_list)):
-            img_meta = batch_img_metas[i]
            seg_pred = seg_pred_list[i]
-            prediction = Det3DDataSample(**{'metainfo': img_meta.metainfo})
-            prediction.set_data({'eval_ann_info': img_meta.eval_ann_info})
-            prediction.set_data(
+            batch_data_samples[i].set_data(
                {'pred_pts_seg': PointData(**{'pts_semantic_mask': seg_pred})})
-            predictions.append(prediction)
-        return predictions
+        return batch_data_samples
--- a/mmdet3d/models/segmentors/encoder_decoder.py
+++ b/mmdet3d/models/segmentors/encoder_decoder.py
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import List, Tuple
+from typing import Dict, List, Tuple

 import numpy as np
 import torch
@@ -36,7 +36,7 @@ class EncoderDecoder3D(Base3DSegmentor):
    2. The ``predict`` method is used to predict segmentation results,
    which includes two steps: (1) Run inference function to obtain the list of
    seg_logits (2) Call post-processing function to obtain list of
-    ``SegDataSampel`` including ``pred_sem_seg`` and ``seg_logits``.
+    ``Det3DDataSample`` including ``pred_pts_seg``.

    .. code:: text

@@ -47,36 +47,43 @@ class EncoderDecoder3D(Base3DSegmentor):

    4. The ``_forward`` method is used to output the tensor by running the model,
    which includes two steps: (1) Extracts features to obtain the feature maps
-    (2)Call the decode head forward function to forward decode head model.
+    (2) Call the decode head forward function to forward decode head model.

    .. code:: text

    _forward(): extract_feat() -> _decode_head.forward()

    Args:
-
-        backbone (ConfigType): The config for the backnone of segmentor.
-        decode_head (ConfigType): The config for the decode head of segmentor.
-        neck (OptConfigType): The config for the neck of segmentor.
-            Defaults to None.
-        auxiliary_head (OptConfigType): The config for the auxiliary head of
+        backbone (dict or :obj:`ConfigDict`): The config for the backbone of
+            segmentor.
+        decode_head (dict or :obj:`ConfigDict`): The config for the decode
+            head of segmentor.
+        neck (dict or :obj:`ConfigDict`, optional): The config for the neck of
+            segmentor. Defaults to None.
+        auxiliary_head (dict or :obj:`ConfigDict` or List[dict or
+            :obj:`ConfigDict`], optional): The config for the auxiliary head of
            segmentor. Defaults to None.
-        loss_regularization (OptiConfigType): The config for the regularization
+        loss_regularization (dict or :obj:`ConfigDict` or List[dict or
+            :obj:`ConfigDict`], optional): The config for the regularization
            loss. Defaults to None.
-        train_cfg (OptConfigType): The config for training. Defaults to None.
-        test_cfg (OptConfigType): The config for testing. Defaults to None.
-        data_preprocessor (OptConfigType): The pre-process config of
-            :class:`BaseDataPreprocessor`. Defaults to None.
-        init_cfg (OptMultiConfig): The weight initialized config for
-            :class:`BaseModule`. Defaults to None.
+        train_cfg (dict or :obj:`ConfigDict`, optional): The config for
+            training. Defaults to None.
+        test_cfg (dict or :obj:`ConfigDict`, optional): The config for testing.
+            Defaults to None.
+        data_preprocessor (dict or :obj:`ConfigDict`, optional): The
+            pre-process config of :class:`BaseDataPreprocessor`.
+            Defaults to None.
+        init_cfg (dict or :obj:`ConfigDict` or List[dict or :obj:`ConfigDict`],
+            optional): The weight initialized config for :class:`BaseModule`.
+            Defaults to None.
    """  # noqa: E501

    def __init__(self,
                 backbone: ConfigType,
                 decode_head: ConfigType,
                 neck: OptConfigType = None,
-                 auxiliary_head: OptConfigType = None,
-                 loss_regularization: OptConfigType = None,
+                 auxiliary_head: OptMultiConfig = None,
+                 loss_regularization: OptMultiConfig = None,
                 train_cfg: OptConfigType = None,
                 test_cfg: OptConfigType = None,
                 data_preprocessor: OptConfigType = None,
@@ -97,12 +104,13 @@ class EncoderDecoder3D(Base3DSegmentor):
            '3D EncoderDecoder Segmentor should have a decode_head'

    def _init_decode_head(self, decode_head: ConfigType) -> None:
-        """Initialize ``decode_head``"""
+        """Initialize ``decode_head``."""
        self.decode_head = MODELS.build(decode_head)
        self.num_classes = self.decode_head.num_classes

-    def _init_auxiliary_head(self, auxiliary_head: ConfigType) -> None:
-        """Initialize ``auxiliary_head``"""
+    def _init_auxiliary_head(self,
+                             auxiliary_head: OptMultiConfig = None) -> None:
+        """Initialize ``auxiliary_head``."""
        if auxiliary_head is not None:
            if isinstance(auxiliary_head, list):
                self.auxiliary_head = nn.ModuleList()
@@ -112,8 +120,9 @@ class EncoderDecoder3D(Base3DSegmentor):
                self.auxiliary_head = MODELS.build(auxiliary_head)

    def _init_loss_regularization(self,
-                                  loss_regularization: ConfigType) -> None:
-        """Initialize ``loss_regularization``"""
+                                  loss_regularization: OptMultiConfig = None
+                                  ) -> None:
+        """Initialize ``loss_regularization``."""
        if loss_regularization is not None:
            if isinstance(loss_regularization, list):
                self.loss_regularization = nn.ModuleList()
@@ -122,7 +131,7 @@ class EncoderDecoder3D(Base3DSegmentor):
            else:
                self.loss_regularization = MODELS.build(loss_regularization)

-    def extract_feat(self, batch_inputs: Tensor) -> Tensor:
+    def extract_feat(self, batch_inputs: Tensor) -> dict:
        """Extract features from points."""
        x = self.backbone(batch_inputs)
        if self.with_neck:
@@ -135,21 +144,32 @@ class EncoderDecoder3D(Base3DSegmentor):
        map of the same size as input.

        Args:
-            batch_input (torch.Tensor): Input point cloud sample
-            batch_input_metas (list[dict]): Meta information of each sample.
+            batch_inputs (Tensor): Input point cloud sample.
+            batch_input_metas (List[dict]): Meta information of a batch of
+                samples.

        Returns:
-            torch.Tensor: Segmentation logits of shape [B, num_classes, N].
+            Tensor: Segmentation logits of shape [B, num_classes, N].
        """
        x = self.extract_feat(batch_inputs)
        seg_logits = self.decode_head.predict(x, batch_input_metas,
                                              self.test_cfg)
        return seg_logits

-    def _decode_head_forward_train(self, batch_inputs_dict: dict,
-                                   batch_data_samples: SampleList) -> dict:
-        """Run forward function and calculate loss for decode head in
-        training."""
+    def _decode_head_forward_train(
+            self, batch_inputs_dict: dict,
+            batch_data_samples: SampleList) -> Dict[str, Tensor]:
+        """Run forward function and calculate loss for decode head in training.
+
+        Args:
+            batch_inputs_dict (dict): Input dict passed to the decode head.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The det3d data
+                samples. It usually includes information such as `metainfo` and
+                `gt_pts_seg`.
+
+        Returns:
+            Dict[str, Tensor]: A dictionary of loss components for decode head.
+        """
        losses = dict()
        loss_decode = self.decode_head.loss(batch_inputs_dict,
                                            batch_data_samples, self.train_cfg)
@@ -161,9 +181,20 @@ class EncoderDecoder3D(Base3DSegmentor):
        self,
        batch_inputs_dict: dict,
        batch_data_samples: SampleList,
-    ) -> dict:
+    ) -> Dict[str, Tensor]:
        """Run forward function and calculate loss for auxiliary head in
-        training."""
+        training.
+
+        Args:
+            batch_inputs_dict (dict): Input dict for the auxiliary head.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The det3d data
+                samples. It usually includes information such as `metainfo` and
+                `gt_pts_seg`.
+
+        Returns:
+            Dict[str, Tensor]: A dictionary of loss components for auxiliary
+            head.
+        """
        losses = dict()
        if isinstance(self.auxiliary_head, nn.ModuleList):
            for idx, aux_head in enumerate(self.auxiliary_head):
@@ -178,7 +209,7 @@ class EncoderDecoder3D(Base3DSegmentor):

        return losses

-    def _loss_regularization_forward_train(self) -> dict:
+    def _loss_regularization_forward_train(self) -> Dict[str, Tensor]:
        """Calculate regularization loss for model weight in training."""
        losses = dict()
        if isinstance(self.loss_regularization, nn.ModuleList):
@@ -194,22 +225,21 @@ class EncoderDecoder3D(Base3DSegmentor):
        return losses

    def loss(self, batch_inputs_dict: dict,
-             batch_data_samples: SampleList) -> dict:
+             batch_data_samples: SampleList) -> Dict[str, Tensor]:
        """Calculate losses from a batch of inputs and data samples.

        Args:
            batch_inputs_dict (dict): Input sample dict which
                includes 'points' and 'imgs' keys.

-                - points (list[torch.Tensor]): Point cloud of each sample.
-                - imgs (torch.Tensor, optional): Image tensor has shape
-                  (B, C, H, W).
-            batch_data_samples (list[:obj:`Det3DDataSample`]): The det3d
-                data samples. It usually includes information such
-                as `metainfo` and `gt_pts_sem_seg`.
+                - points (List[Tensor]): Point cloud of each sample.
+                - imgs (Tensor, optional): Image tensor has shape (B, C, H, W).
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The det3d data
+                samples. It usually includes information such as `metainfo` and
+                `gt_pts_seg`.

        Returns:
-            dict[str, Tensor]: a dictionary of loss components.
+            Dict[str, Tensor]: A dictionary of loss components.
        """

        # extract features using backbone
@@ -244,15 +274,15 @@ class EncoderDecoder3D(Base3DSegmentor):
        features. Currently support colors and normalized xyz as features.

        Args:
-            coords (torch.Tensor): Sampled 3D point coordinate of shape [S, 3].
-            patch_center (torch.Tensor): Center coordinate of the patch.
-            coord_max (torch.Tensor): Max coordinate of all 3D points.
-            feats (torch.Tensor): Features of sampled points of shape [S, C].
-            use_normalized_coord (bool, optional): Whether to use normalized
-                xyz as additional features. Defaults to False.
+            coords (Tensor): Sampled 3D point coordinates of shape [S, 3].
+            patch_center (Tensor): Center coordinate of the patch.
+            coord_max (Tensor): Max coordinate of all 3D points.
+            feats (Tensor): Features of sampled points of shape [S, C].
+            use_normalized_coord (bool): Whether to use normalized xyz as
+                additional features. Defaults to False.

        Returns:
-            torch.Tensor: The generated input data of shape [S, 3+C'].
+            Tensor: The generated input data of shape [S, 3+C'].
        """
        # subtract patch center, the z dimension is not centered
        centered_coords = coords.clone()
@@ -281,23 +311,22 @@ class EncoderDecoder3D(Base3DSegmentor):
        Then sample points in each patch to batch points of a certain number.

        Args:
-            points (torch.Tensor): Input points of shape [N, 3+C].
+            points (Tensor): Input points of shape [N, 3+C].
            num_points (int): Number of points to be sampled in each patch.
-            block_size (float, optional): Size of a patch to sample.
-            sample_rate (float, optional): Stride used in sliding patch.
-                Defaults to 0.5.
-            use_normalized_coord (bool, optional): Whether to use normalized
-                xyz as additional features. Defaults to False.
-            eps (float, optional): A value added to patch boundary to guarantee
-                points coverage. Defaults to 1e-3.
+            block_size (float): Size of a patch to sample.
+            sample_rate (float): Stride used in sliding patch. Defaults to 0.5.
+            use_normalized_coord (bool): Whether to use normalized xyz as
+                additional features. Defaults to False.
+            eps (float): A value added to patch boundary to guarantee points
+                coverage. Defaults to 1e-3.

        Returns:
-            tuple:
+            Tuple[Tensor, Tensor]:

-                - patch_points (torch.Tensor): Points of different patches of
-                  shape [K, N, 3+C].
-                - patch_idxs (torch.Tensor): Index of each point in
-                  `patch_points`, of shape [K, N].
+            - patch_points (Tensor): Points of different patches of shape
+              [K, N, 3+C].
+            - patch_idxs (Tensor): Index of each point in `patch_points` of
+              shape [K, N].
        """
        device = points.device
        # we assume the first three dims are points' 3D coordinates
@@ -372,13 +401,13 @@ class EncoderDecoder3D(Base3DSegmentor):

        return patch_points, patch_idxs

-    def slide_inference(self, point: Tensor, img_meta: List[dict],
+    def slide_inference(self, point: Tensor, input_meta: dict,
                        rescale: bool) -> Tensor:
        """Inference by sliding-window with overlap.

        Args:
-            point (torch.Tensor): Input points of shape [N, 3+C].
-            img_meta (dict): Meta information of input sample.
+            point (Tensor): Input points of shape [N, 3+C].
+            input_meta (dict): Meta information of input sample.
            rescale (bool): Whether transform to original number of points.
                Will be used for voxelization based segmentors.

@@ -401,7 +430,8 @@ class EncoderDecoder3D(Base3DSegmentor):
            batch_points = patch_points[batch_idx:batch_idx + batch_size]
            batch_points = batch_points.view(-1, num_points, feats_dim)
            # batch_seg_logit is of shape [B, num_classes, N]
-            batch_seg_logit = self.encode_decode(batch_points, img_meta)
+            batch_seg_logit = self.encode_decode(batch_points,
+                                                 [input_meta] * batch_size)
            batch_seg_logit = batch_seg_logit.transpose(1, 2).contiguous()
            seg_logits.append(batch_seg_logit.view(-1, self.num_classes))

@@ -417,20 +447,21 @@ class EncoderDecoder3D(Base3DSegmentor):

        return preds.transpose(0, 1)  # to [num_classes, K*N]

-    def whole_inference(self, points: Tensor, input_metas: List[dict],
+    def whole_inference(self, points: Tensor, batch_input_metas: List[dict],
                        rescale: bool) -> Tensor:
        """Inference with full scene (one forward pass without sliding)."""
-        seg_logit = self.encode_decode(points, input_metas)
+        seg_logit = self.encode_decode(points, batch_input_metas)
        # TODO: if rescale and voxelization segmentor
        return seg_logit

-    def inference(self, points: Tensor, input_metas: List[dict],
+    def inference(self, points: Tensor, batch_input_metas: List[dict],
                  rescale: bool) -> Tensor:
        """Inference with slide/whole style.

        Args:
-            points (torch.Tensor): Input points of shape [B, N, 3+C].
-            input_metas (list[dict]): Meta information of each sample.
+            points (Tensor): Input points of shape [B, N, 3+C].
+            batch_input_metas (List[dict]): Meta information of a batch of
+                samples.
            rescale (bool): Whether transform to original number of points.
                Will be used for voxelization based segmentors.

@@ -440,11 +471,12 @@ class EncoderDecoder3D(Base3DSegmentor):
        assert self.test_cfg.mode in ['slide', 'whole']
        if self.test_cfg.mode == 'slide':
            seg_logit = torch.stack([
-                self.slide_inference(point, img_meta, rescale)
-                for point, img_meta in zip(points, input_metas)
+                self.slide_inference(point, input_meta, rescale)
+                for point, input_meta in zip(points, batch_input_metas)
            ], 0)
        else:
-            seg_logit = self.whole_inference(points, input_metas, rescale)
+            seg_logit = self.whole_inference(points, batch_input_metas,
+                                             rescale)
        output = F.softmax(seg_logit, dim=1)
        return output

@@ -455,23 +487,24 @@ class EncoderDecoder3D(Base3DSegmentor):
        """Simple test with single scene.

        Args:
-            batch_inputs_dict (dict): Input sample dict which
-                includes 'points' and 'imgs' keys.
-
-                - points (list[torch.Tensor]): Point cloud of each sample.
-                - imgs (torch.Tensor, optional): Image tensor has shape
-                    (B, C, H, W).
-            batch_data_samples (list[:obj:`Det3DDataSample`]): The det3d
-                data samples. It usually includes information such
-                as `metainfo` and `gt_pts_sem_seg`.
+            batch_inputs_dict (dict): Input sample dict which includes 'points'
+                and 'imgs' keys.
+
+                - points (List[Tensor]): Point cloud of each sample.
+                - imgs (Tensor, optional): Image tensor of shape (B, C, H, W).
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The det3d data
+                samples. It usually includes information such as `metainfo` and
+                `gt_pts_seg`.
            rescale (bool): Whether transform to original number of points.
                Will be used for voxelization based segmentors.
                Defaults to True.

        Returns:
-            list[dict]: The output prediction result with following keys:
+            List[:obj:`Det3DDataSample`]: Segmentation results of the input
+            points. Each Det3DDataSample usually contains:

-                - semantic_mask (Tensor): Segmentation mask of shape [N].
+            - ``pred_pts_seg`` (PixelData): Prediction of 3D semantic
+              segmentation.
        """
        # 3D segmentation requires per-point prediction, so it's impossible
        # to use down-sampling to get a batch of scenes with same num_points
@@ -498,15 +531,14 @@ class EncoderDecoder3D(Base3DSegmentor):
        """Network forward process.

        Args:
-            batch_inputs_dict (dict): Input sample dict which
-                includes 'points' and 'imgs' keys.
-
-                - points (list[torch.Tensor]): Point cloud of each sample.
-                - imgs (torch.Tensor, optional): Image tensor has shape
-                  (B, C, H, W).
-            batch_data_samples (List[:obj:`Det3DDataSample`]): The seg
-                data samples. It usually includes information such
-                as `metainfo` and `gt_pts_sem_seg`.
+            batch_inputs_dict (dict): Input sample dict which includes 'points'
+                and 'imgs' keys.
+
+                - points (List[Tensor]): Point cloud of each sample.
+                - imgs (Tensor, optional): Image tensor of shape (B, C, H, W).
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The det3d data
+                samples. It usually includes information such as `metainfo` and
+                `gt_pts_seg`.

        Returns:
            Tensor: Forward output of model without any post-processes.