[Enhance] refine docstring and typehint in `segmentor` (#2254)

* refactor segmentor * replace meta info during inference * Update decode_head.py * update * update docs

[Enhance] refine docstring and typehint in `segmentor` (#2254)
* refactor segmentor * replace meta info during inference * Update decode_head.py * update * update docs
1587d48f · Xiang Xu · GitHub · 68102441 · 1587d48f · 1587d48f
Unverified Commit 1587d48f authored Feb 14, 2023 by Xiang Xu Committed by GitHub Feb 14, 2023
6 changed files
--- a/mmdet3d/models/decode_heads/decode_head.py
+++ b/mmdet3d/models/decode_heads/decode_head.py
 # Copyright (c) OpenMMLab. All rights reserved.
 from abc import ABCMeta, abstractmethod
-from typing import List
+from typing import Dict, List
 import torch
 from mmengine.model import BaseModule, normal_init
@@ -9,7 +9,7 @@ from torch import nn as nn
 from mmdet3d.registry import MODELS
 from mmdet3d.structures.det3d_data_sample import SampleList
-from mmdet3d.utils.typing_utils import ConfigType
+from mmdet3d.utils.typing_utils import ConfigType, OptMultiConfig
 class Base3DDecodeHead(BaseModule, metaclass=ABCMeta):
@@ -42,35 +42,35 @@ class Base3DDecodeHead(BaseModule, metaclass=ABCMeta):
        channels (int): Channels after modules, before conv_seg.
        num_classes (int): Number of classes.
        dropout_ratio (float): Ratio of dropout layer. Defaults to 0.5.
-        conv_cfg (dict): Config of conv layers.
+        conv_cfg (dict or :obj:`ConfigDict`): Config of conv layers.
            Defaults to dict(type='Conv1d').
-        norm_cfg (dict): Config of norm layers.
+        norm_cfg (dict or :obj:`ConfigDict`): Config of norm layers.
            Defaults to dict(type='BN1d').
-        act_cfg (dict): Config of activation layers.
+        act_cfg (dict or :obj:`ConfigDict`): Config of activation layers.
            Defaults to dict(type='ReLU').
-        loss_decode (dict): Config of decode loss.
+        loss_decode (dict or :obj:`ConfigDict`): Config of decode loss.
-            Defaults to dict(type='CrossEntropyLoss').
+            Defaults to dict(type='mmdet.CrossEntropyLoss', use_sigmoid=False,
-        ignore_index (int): The label index to be ignored.
+            class_weight=None, loss_weight=1.0).
-            When using masked BCE loss, ignore_index should be set to None.
+        ignore_index (int): The label index to be ignored. When using masked
-            Defaults to 255.
+            BCE loss, ignore_index should be set to None. Defaults to 255.
-        init_cfg (dict or list[dict], optional): Initialization config dict.
+        init_cfg (dict or :obj:`ConfigDict` or list[dict or :obj:`ConfigDict`],
-            Defaults to None.
+            optional): Initialization config dict. Defaults to None.
    """
    def __init__(self,
-                 channels,
+                 channels: int,
-                 num_classes,
+                 num_classes: int,
-                 dropout_ratio=0.5,
+                 dropout_ratio: float = 0.5,
-                 conv_cfg=dict(type='Conv1d'),
+                 conv_cfg: ConfigType = dict(type='Conv1d'),
-                 norm_cfg=dict(type='BN1d'),
+                 norm_cfg: ConfigType = dict(type='BN1d'),
-                 act_cfg=dict(type='ReLU'),
+                 act_cfg: ConfigType = dict(type='ReLU'),
-                 loss_decode=dict(
+                 loss_decode: ConfigType = dict(
                     type='mmdet.CrossEntropyLoss',
                     use_sigmoid=False,
                     class_weight=None,
                     loss_weight=1.0),
-                 ignore_index=255,
+                 ignore_index: int = 255,
-                 init_cfg=None) -> None:
+                 init_cfg: OptMultiConfig = None) -> None:
        super(Base3DDecodeHead, self).__init__(init_cfg=init_cfg)
        self.channels = channels
        self.num_classes = num_classes
@@ -87,13 +87,13 @@ class Base3DDecodeHead(BaseModule, metaclass=ABCMeta):
        else:
            self.dropout = None
-    def init_weights(self):
+    def init_weights(self) -> None:
        """Initialize weights of classification layer."""
        super().init_weights()
        normal_init(self.conv_seg, mean=0, std=0.01)
    @abstractmethod
-    def forward(self, feats_dict: dict):
+    def forward(self, feats_dict: dict) -> Tensor:
        """Placeholder of forward function."""
        pass
@@ -104,34 +104,33 @@ class Base3DDecodeHead(BaseModule, metaclass=ABCMeta):
        output = self.conv_seg(feat)
        return output
-    def loss(self, inputs: List[Tensor], batch_data_samples: SampleList,
+    def loss(self, inputs: dict, batch_data_samples: SampleList,
-             train_cfg: ConfigType) -> dict:
+             train_cfg: ConfigType) -> Dict[str, Tensor]:
        """Forward function for training.
        Args:
-            inputs (list[torch.Tensor]): List of multi-level point features.
+            inputs (dict): Feature dict from backbone.
-            batch_data_samples (List[:obj:`Det3DDataSample`]): The seg
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The seg data
-                data samples. It usually includes information such
+                samples. It usually includes information such as `metainfo` and
-                as `metainfo` and `gt_pts_seg`.
+                `gt_pts_seg`.
-            train_cfg (dict): The training config.
+            train_cfg (dict or :obj:`ConfigDict`): The training config.
        Returns:
-            dict[str, Tensor]: a dictionary of loss components
+            Dict[str, Tensor]: A dictionary of loss components.
        """
        seg_logits = self.forward(inputs)
        losses = self.loss_by_feat(seg_logits, batch_data_samples)
        return losses
-    def predict(self, inputs: List[Tensor], batch_input_metas: List[dict],
+    def predict(self, inputs: dict, batch_input_metas: List[dict],
-                test_cfg: ConfigType) -> List[Tensor]:
+                test_cfg: ConfigType) -> Tensor:
        """Forward function for testing.
        Args:
-            inputs (list[Tensor]): List of multi-level point features.
+            inputs (dict): Feature dict from backbone.
-            batch_data_samples (List[:obj:`Det3DDataSample`]): The seg
+            batch_input_metas (List[dict]): Meta information of a batch of
-                data samples. It usually includes information such
+                samples.
-                as `metainfo` and `gt_pts_seg`.
+            test_cfg (dict or :obj:`ConfigDict`): The testing config.
-            test_cfg (dict): The testing config.
        Returns:
            Tensor: Output segmentation map.
@@ -148,15 +147,18 @@ class Base3DDecodeHead(BaseModule, metaclass=ABCMeta):
        return torch.stack(gt_semantic_segs, dim=0)
    def loss_by_feat(self, seg_logit: Tensor,
-                     batch_data_samples: SampleList) -> dict:
+                     batch_data_samples: SampleList) -> Dict[str, Tensor]:
        """Compute semantic segmentation loss.
        Args:
-            seg_logit (torch.Tensor): Predicted per-point segmentation logits
+            seg_logit (Tensor): Predicted per-point segmentation logits of
-                of shape [B, num_classes, N].
+                shape [B, num_classes, N].
-            batch_data_samples (List[:obj:`Det3DDataSample`]): The seg
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The seg data
-                data samples. It usually includes information such
+                samples. It usually includes information such as `metainfo` and
-                as `metainfo` and `gt_pts_seg`.
+                `gt_pts_seg`.
+        Returns:
+            Dict[str, Tensor]: A dictionary of loss components.
        """
        seg_label = self._stack_batch_gt(batch_data_samples)
        loss = dict()

--- a/mmdet3d/models/decode_heads/dgcnn_head.py
+++ b/mmdet3d/models/decode_heads/dgcnn_head.py
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Tuple
+from typing import Sequence
 from mmcv.cnn.bricks import ConvModule
 from torch import Tensor
@@ -18,11 +18,12 @@ class DGCNNHead(Base3DDecodeHead):
    `reimplementation code <https://github.com/AnTao97/dgcnn.pytorch>`_.
    Args:
-        fp_channels (tuple[int], optional): Tuple of mlp channels in feature
+        fp_channels (Sequence[int]): Tuple of mlp channels in feature
            propagation (FP) modules. Defaults to (1216, 512).
    """
-    def __init__(self, fp_channels: Tuple = (1216, 512), **kwargs) -> None:
+    def __init__(self, fp_channels: Sequence[int] = (1216, 512),
+                 **kwargs) -> None:
        super(DGCNNHead, self).__init__(**kwargs)
        self.FP_module = DGCNNFPModule(
@@ -45,7 +46,7 @@ class DGCNNHead(Base3DDecodeHead):
            feat_dict (dict): Feature dict from backbone.
        Returns:
-            torch.Tensor: points for decoder.
+            torch.Tensor: Points for decoder.
        """
        fa_points = feat_dict['fa_points']
@@ -58,7 +59,7 @@ class DGCNNHead(Base3DDecodeHead):
            feat_dict (dict): Feature dict from backbone.
        Returns:
-            torch.Tensor: Segmentation map of shape [B, num_classes, N].
+            Tensor: Segmentation map of shape [B, num_classes, N].
        """
        fa_points = self._extract_input(feat_dict)

--- a/mmdet3d/models/decode_heads/paconv_head.py
+++ b/mmdet3d/models/decode_heads/paconv_head.py
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Tuple
+from typing import Sequence
 from mmcv.cnn.bricks import ConvModule
 from torch import Tensor
@@ -17,16 +17,19 @@ class PAConvHead(PointNet2Head):
    Refer to the `official code <https://github.com/CVMI-Lab/PAConv>`_.
    Args:
-        fp_channels (tuple[tuple[int]]): Tuple of mlp channels in FP modules.
+        fp_channels (Sequence[Sequence[int]]): Tuple of mlp channels in FP
-        fp_norm_cfg (dict): Config of norm layers used in FP modules.
+            modules. Defaults to ((768, 256, 256), (384, 256, 256),
-            Default: dict(type='BN2d').
+            (320, 256, 128), (128 + 6, 128, 128, 128)).
+        fp_norm_cfg (dict or :obj:`ConfigDict`): Config of norm layers used in
+            FP modules. Defaults to dict(type='BN2d').
    """
    def __init__(self,
-                 fp_channels: Tuple[Tuple[int]] = ((768, 256, 256),
+                 fp_channels: Sequence[Sequence[int]] = ((768, 256, 256),
-                                                   (384, 256, 256), (320, 256,
+                                                         (384, 256, 256),
-                                                                     128),
+                                                         (320, 256,
-                                                   (128 + 6, 128, 128, 128)),
+                                                          128), (128 + 6, 128,
+                                                                 128, 128)),
                 fp_norm_cfg: ConfigType = dict(type='BN2d'),
                 **kwargs) -> None:
        super(PAConvHead, self).__init__(

--- a/mmdet3d/models/decode_heads/pointnet2_head.py
+++ b/mmdet3d/models/decode_heads/pointnet2_head.py
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Tuple
+from typing import List, Sequence, Tuple
 from mmcv.cnn.bricks import ConvModule
 from torch import Tensor
@@ -19,16 +19,18 @@ class PointNet2Head(Base3DDecodeHead):
    Refer to the `official code <https://github.com/charlesq34/pointnet2>`_.
    Args:
-        fp_channels (tuple[tuple[int]]): Tuple of mlp channels in FP modules.
+        fp_channels (Sequence[Sequence[int]]): Tuple of mlp channels in FP
-        fp_norm_cfg (dict): Config of norm layers used in FP modules.
+            modules. Defaults to ((768, 256, 256), (384, 256, 256),
-            Default: dict(type='BN2d').
+            (320, 256, 128), (128, 128, 128, 128)).
+        fp_norm_cfg (dict or :obj:`ConfigDict`): Config of norm layers used
+            in FP modules. Defaults to dict(type='BN2d').
    """
    def __init__(self,
-                 fp_channels: Tuple[Tuple[int]] = ((768, 256, 256),
+                 fp_channels: Sequence[Sequence[int]] = ((768, 256, 256),
-                                                   (384, 256, 256), (320, 256,
+                                                         (384, 256, 256),
-                                                                     128),
+                                                         (320, 256, 128),
-                                                   (128, 128, 128, 128)),
+                                                         (128, 128, 128, 128)),
                 fp_norm_cfg: ConfigType = dict(type='BN2d'),
                 **kwargs) -> None:
        super(PointNet2Head, self).__init__(**kwargs)
@@ -49,15 +51,16 @@ class PointNet2Head(Base3DDecodeHead):
            norm_cfg=self.norm_cfg,
            act_cfg=self.act_cfg)
-    def _extract_input(self, feat_dict: dict) -> Tensor:
+    def _extract_input(self,
+                       feat_dict: dict) -> Tuple[List[Tensor], List[Tensor]]:
        """Extract inputs from features dictionary.
        Args:
            feat_dict (dict): Feature dict from backbone.
        Returns:
-            list[torch.Tensor]: Coordinates of multiple levels of points.
+            Tuple[List[Tensor], List[Tensor]]: Coordinates and features of
-            list[torch.Tensor]: Features of multiple levels of points.
+            multiple levels of points.
        """
        sa_xyz = feat_dict['sa_xyz']
        sa_features = feat_dict['sa_features']
@@ -72,7 +75,7 @@ class PointNet2Head(Base3DDecodeHead):
            feat_dict (dict): Feature dict from backbone.
        Returns:
-            torch.Tensor: Segmentation map of shape [B, num_classes, N].
+            Tensor: Segmentation map of shape [B, num_classes, N].
        """
        sa_xyz, sa_features = self._extract_input(feat_dict)

--- a/mmdet3d/models/segmentors/base.py
+++ b/mmdet3d/models/segmentors/base.py
 # Copyright (c) OpenMMLab. All rights reserved.
 from abc import ABCMeta, abstractmethod
-from typing import List, Tuple, Union
+from typing import Dict, List, Union
 from mmengine.model import BaseModel
 from torch import Tensor
-from mmdet3d.structures import Det3DDataSample, PointData
+from mmdet3d.structures import PointData
 from mmdet3d.structures.det3d_data_sample import (ForwardResults,
                                                  OptSampleList, SampleList)
 from mmdet3d.utils import OptConfigType, OptMultiConfig
@@ -15,12 +15,12 @@ class Base3DSegmentor(BaseModel, metaclass=ABCMeta):
    """Base class for 3D segmentors.
    Args:
-        data_preprocessor (dict, optional): Model preprocessing config
+        data_preprocessor (dict or ConfigDict, optional): Model preprocessing
-            for processing the input data. it usually includes
+            config for processing the input data. It usually includes
-            ``to_rgb``, ``pad_size_divisor``, ``pad_val``,
+            ``to_rgb``, ``pad_size_divisor``, ``pad_val``, ``mean`` and
-            ``mean`` and ``std``. Default to None.
+            ``std``. Defaults to None.
-       init_cfg (dict, optional): the config to control the
+       init_cfg (dict or ConfigDict, optional): The config to control the
-           initialization. Default to None.
+           initialization. Defaults to None.
    """
    def __init__(self,
@@ -31,34 +31,34 @@ class Base3DSegmentor(BaseModel, metaclass=ABCMeta):
    @property
    def with_neck(self) -> bool:
-        """bool: whether the segmentor has neck"""
+        """bool: Whether the segmentor has neck."""
        return hasattr(self, 'neck') and self.neck is not None
    @property
    def with_auxiliary_head(self) -> bool:
-        """bool: whether the segmentor has auxiliary head"""
+        """bool: Whether the segmentor has auxiliary head."""
        return hasattr(self,
                       'auxiliary_head') and self.auxiliary_head is not None
    @property
    def with_decode_head(self) -> bool:
-        """bool: whether the segmentor has decode head"""
+        """bool: Whether the segmentor has decode head."""
        return hasattr(self, 'decode_head') and self.decode_head is not None
    @property
    def with_regularization_loss(self) -> bool:
-        """bool: whether the segmentor has regularization loss for weight"""
+        """bool: Whether the segmentor has regularization loss for weight."""
        return hasattr(self, 'loss_regularization') and \
            self.loss_regularization is not None
    @abstractmethod
-    def extract_feat(self, batch_inputs: Tensor) -> bool:
+    def extract_feat(self, batch_inputs: Tensor) -> dict:
        """Placeholder for extract features from images."""
        pass
    @abstractmethod
    def encode_decode(self, batch_inputs: Tensor,
-                      batch_data_samples: SampleList):
+                      batch_data_samples: SampleList) -> Tensor:
        """Placeholder for encode images with backbone and decode into a
        semantic segmentation map of the same size as input."""
        pass
@@ -72,22 +72,22 @@ class Base3DSegmentor(BaseModel, metaclass=ABCMeta):
        The method should accept three modes: "tensor", "predict" and "loss":
        - "tensor": Forward the whole network and return tensor or tuple of
-        tensor without any post-processing, same as a common nn.Module.
+          tensor without any post-processing, same as a common nn.Module.
        - "predict": Forward and return the predictions, which are fully
-        processed to a list of :obj:`SegDataSample`.
+          processed to a list of :obj:`SegDataSample`.
        - "loss": Forward and return a dict of losses according to the given
-        inputs and data samples.
+          inputs and data samples.
        Note that this method handles neither back propagation nor
        optimizer updating, which are done in the :meth:`train_step`.
        Args:
-            inputs (dict | List[dict]): Input sample dict which
+            inputs (dict or List[dict]): Input sample dict which includes
-                includes 'points' and 'imgs' keys.
+                'points' and 'imgs' keys.
-                - points (list[torch.Tensor]): Point cloud of each sample.
+                - points (List[Tensor]): Point cloud of each sample.
-                - imgs (torch.Tensor): Image tensor has shape (B, C, H, W).
+                - imgs (Tensor): Image tensor has shape (B, C, H, W).
-            data_samples (list[:obj:`Det3DDataSample`], optional):
+            data_samples (List[:obj:`Det3DDataSample`], optional):
                The annotation data of every sample. Defaults to None.
            mode (str): Return what kind of value. Defaults to 'tensor'.
@@ -109,23 +109,22 @@ class Base3DSegmentor(BaseModel, metaclass=ABCMeta):
                               'Only supports loss, predict and tensor mode')
    @abstractmethod
-    def loss(self, batch_inputs: Tensor,
+    def loss(self, batch_inputs: dict,
-             batch_data_samples: SampleList) -> dict:
+             batch_data_samples: SampleList) -> Dict[str, Tensor]:
        """Calculate losses from a batch of inputs and data samples."""
        pass
    @abstractmethod
-    def predict(self, batch_inputs: Tensor,
+    def predict(self, batch_inputs: dict,
                batch_data_samples: SampleList) -> SampleList:
        """Predict results from a batch of inputs and data samples with post-
        processing."""
        pass
    @abstractmethod
-    def _forward(
+    def _forward(self,
-            self,
+                 batch_inputs: dict,
-            batch_inputs: Tensor,
+                 batch_data_samples: OptSampleList = None) -> Tensor:
-            batch_data_samples: OptSampleList = None) -> Tuple[List[Tensor]]:
        """Network forward process.
        Usually includes backbone, neck and head forward without any post-
@@ -134,33 +133,31 @@ class Base3DSegmentor(BaseModel, metaclass=ABCMeta):
        pass
    @abstractmethod
-    def aug_test(self, batch_inputs, batch_img_metas):
+    def aug_test(self, batch_inputs, batch_data_samples):
        """Placeholder for augmentation test."""
        pass
    def postprocess_result(self, seg_pred_list: List[dict],
-                           batch_img_metas: List[dict]) -> list:
+                           batch_data_samples: SampleList) -> SampleList:
        """Convert results list to `Det3DDataSample`.
        Args:
            seg_pred_list (List[dict]): List of segmentation results,
                seg_logits from model of each input point clouds sample.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The det3d data
+                samples. It usually includes information such as `metainfo` and
+                `gt_pts_seg`.
        Returns:
-            list[:obj:`Det3DDataSample`]: Segmentation results of the
+            List[:obj:`Det3DDataSample`]: Segmentation results of the input
-            input images. Each Det3DDataSample usually contain:
+            points. Each Det3DDataSample usually contains:
-            - ``pred_pts_seg``(PixelData): Prediction of 3D
+            - ``pred_pts_seg`` (PointData): Prediction of 3D semantic
-                semantic segmentation.
+              segmentation.
        """
-        predictions = []
        for i in range(len(seg_pred_list)):
-            img_meta = batch_img_metas[i]
            seg_pred = seg_pred_list[i]
-            prediction = Det3DDataSample(**{'metainfo': img_meta.metainfo})
+            batch_data_samples[i].set_data(
-            prediction.set_data({'eval_ann_info': img_meta.eval_ann_info})
-            prediction.set_data(
                {'pred_pts_seg': PointData(**{'pts_semantic_mask': seg_pred})})
-            predictions.append(prediction)
+        return batch_data_samples
-        return predictions
--- a/mmdet3d/models/segmentors/encoder_decoder.py
+++ b/mmdet3d/models/segmentors/encoder_decoder.py
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import List, Tuple
+from typing import Dict, List, Tuple
 import numpy as np
 import torch
@@ -29,14 +29,14 @@ class EncoderDecoder3D(Base3DSegmentor):
    .. code:: text
-     loss(): extract_feat() -> _decode_head_forward_train() -> _auxiliary_head_forward_train (optional)
+    loss(): extract_feat() -> _decode_head_forward_train() -> _auxiliary_head_forward_train (optional)
-     _decode_head_forward_train(): decode_head.loss()
+    _decode_head_forward_train(): decode_head.loss()
-     _auxiliary_head_forward_train(): auxiliary_head.loss (optional)
+    _auxiliary_head_forward_train(): auxiliary_head.loss (optional)
    2. The ``predict`` method is used to predict segmentation results,
    which includes two steps: (1) Run inference function to obtain the list of
    seg_logits (2) Call post-processing function to obtain list of
-    ``SegDataSampel`` including ``pred_sem_seg`` and ``seg_logits``.
+    ``Det3DDataSample`` including ``pred_pts_seg``.
    .. code:: text
@@ -47,36 +47,43 @@ class EncoderDecoder3D(Base3DSegmentor):
    4. The ``_forward`` method is used to output the tensor by running the model,
    which includes two steps: (1) Extracts features to obtain the feature maps
-    (2)Call the decode head forward function to forward decode head model.
+    (2) Call the decode head forward function to forward decode head model.
    .. code:: text
    _forward(): extract_feat() -> _decode_head.forward()
    Args:
+        backbone (dict or :obj:`ConfigDict`): The config for the backbone of
-        backbone (ConfigType): The config for the backnone of segmentor.
+            segmentor.
-        decode_head (ConfigType): The config for the decode head of segmentor.
+        decode_head (dict or :obj:`ConfigDict`): The config for the decode
-        neck (OptConfigType): The config for the neck of segmentor.
+            head of segmentor.
-            Defaults to None.
+        neck (dict or :obj:`ConfigDict`, optional): The config for the neck of
-        auxiliary_head (OptConfigType): The config for the auxiliary head of
+            segmentor. Defaults to None.
+        auxiliary_head (dict or :obj:`ConfigDict` or List[dict or
+            :obj:`ConfigDict`], optional): The config for the auxiliary head of
            segmentor. Defaults to None.
-        loss_regularization (OptiConfigType): The config for the regularization
+        loss_regularization (dict or :obj:`ConfigDict` or List[dict or
+            :obj:`ConfigDict`], optional): The config for the regularization
            loss. Defaults to None.
-        train_cfg (OptConfigType): The config for training. Defaults to None.
+        train_cfg (dict or :obj:`ConfigDict`, optional): The config for
-        test_cfg (OptConfigType): The config for testing. Defaults to None.
+            training. Defaults to None.
-        data_preprocessor (OptConfigType): The pre-process config of
+        test_cfg (dict or :obj:`ConfigDict`, optional): The config for testing.
-            :class:`BaseDataPreprocessor`. Defaults to None.
+            Defaults to None.
-        init_cfg (OptMultiConfig): The weight initialized config for
+        data_preprocessor (dict or :obj:`ConfigDict`, optional): The
-            :class:`BaseModule`. Defaults to None.
+            pre-process config of :class:`BaseDataPreprocessor`.
+            Defaults to None.
+        init_cfg (dict or :obj:`ConfigDict` or List[dict or :obj:`ConfigDict`],
+            optional): The weight initialized config for :class:`BaseModule`.
+            Defaults to None.
    """  # noqa: E501
    def __init__(self,
                 backbone: ConfigType,
                 decode_head: ConfigType,
                 neck: OptConfigType = None,
-                 auxiliary_head: OptConfigType = None,
+                 auxiliary_head: OptMultiConfig = None,
-                 loss_regularization: OptConfigType = None,
+                 loss_regularization: OptMultiConfig = None,
                 train_cfg: OptConfigType = None,
                 test_cfg: OptConfigType = None,
                 data_preprocessor: OptConfigType = None,
@@ -97,12 +104,13 @@ class EncoderDecoder3D(Base3DSegmentor):
            '3D EncoderDecoder Segmentor should have a decode_head'
    def _init_decode_head(self, decode_head: ConfigType) -> None:
-        """Initialize ``decode_head``"""
+        """Initialize ``decode_head``."""
        self.decode_head = MODELS.build(decode_head)
        self.num_classes = self.decode_head.num_classes
-    def _init_auxiliary_head(self, auxiliary_head: ConfigType) -> None:
+    def _init_auxiliary_head(self,
-        """Initialize ``auxiliary_head``"""
+                             auxiliary_head: OptMultiConfig = None) -> None:
+        """Initialize ``auxiliary_head``."""
        if auxiliary_head is not None:
            if isinstance(auxiliary_head, list):
                self.auxiliary_head = nn.ModuleList()
@@ -112,8 +120,9 @@ class EncoderDecoder3D(Base3DSegmentor):
                self.auxiliary_head = MODELS.build(auxiliary_head)
    def _init_loss_regularization(self,
-                                  loss_regularization: ConfigType) -> None:
+                                  loss_regularization: OptMultiConfig = None
-        """Initialize ``loss_regularization``"""
+                                  ) -> None:
+        """Initialize ``loss_regularization``."""
        if loss_regularization is not None:
            if isinstance(loss_regularization, list):
                self.loss_regularization = nn.ModuleList()
@@ -122,7 +131,7 @@ class EncoderDecoder3D(Base3DSegmentor):
            else:
                self.loss_regularization = MODELS.build(loss_regularization)
-    def extract_feat(self, batch_inputs: Tensor) -> Tensor:
+    def extract_feat(self, batch_inputs: Tensor) -> dict:
        """Extract features from points."""
        x = self.backbone(batch_inputs)
        if self.with_neck:
@@ -135,21 +144,32 @@ class EncoderDecoder3D(Base3DSegmentor):
        map of the same size as input.
        Args:
-            batch_input (torch.Tensor): Input point cloud sample
+            batch_inputs (Tensor): Input point cloud sample.
-            batch_input_metas (list[dict]): Meta information of each sample.
+            batch_input_metas (List[dict]): Meta information of a batch of
+                samples.
        Returns:
-            torch.Tensor: Segmentation logits of shape [B, num_classes, N].
+            Tensor: Segmentation logits of shape [B, num_classes, N].
        """
        x = self.extract_feat(batch_inputs)
        seg_logits = self.decode_head.predict(x, batch_input_metas,
                                              self.test_cfg)
        return seg_logits
-    def _decode_head_forward_train(self, batch_inputs_dict: dict,
+    def _decode_head_forward_train(
-                                   batch_data_samples: SampleList) -> dict:
+            self, batch_inputs_dict: dict,
-        """Run forward function and calculate loss for decode head in
+            batch_data_samples: SampleList) -> Dict[str, Tensor]:
-        training."""
+        """Run forward function and calculate loss for decode head in training.
+        Args:
+            batch_inputs_dict (dict): Input point cloud sample.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The det3d data
+                samples. It usually includes information such as `metainfo` and
+                `gt_pts_seg`.
+        Returns:
+            Dict[str, Tensor]: A dictionary of loss components for decode head.
+        """
        losses = dict()
        loss_decode = self.decode_head.loss(batch_inputs_dict,
                                            batch_data_samples, self.train_cfg)
@@ -161,9 +181,20 @@ class EncoderDecoder3D(Base3DSegmentor):
        self,
        batch_inputs_dict: dict,
        batch_data_samples: SampleList,
-    ) -> dict:
+    ) -> Dict[str, Tensor]:
        """Run forward function and calculate loss for auxiliary head in
-        training."""
+        training.
+        Args:
+            batch_inputs_dict (dict): Input point cloud sample.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The det3d data
+                samples. It usually includes information such as `metainfo` and
+                `gt_pts_seg`.
+        Returns:
+            Dict[str, Tensor]: A dictionary of loss components for auxiliary
+            head.
+        """
        losses = dict()
        if isinstance(self.auxiliary_head, nn.ModuleList):
            for idx, aux_head in enumerate(self.auxiliary_head):
@@ -178,7 +209,7 @@ class EncoderDecoder3D(Base3DSegmentor):
        return losses
-    def _loss_regularization_forward_train(self) -> dict:
+    def _loss_regularization_forward_train(self) -> Dict[str, Tensor]:
        """Calculate regularization loss for model weight in training."""
        losses = dict()
        if isinstance(self.loss_regularization, nn.ModuleList):
@@ -194,22 +225,21 @@ class EncoderDecoder3D(Base3DSegmentor):
        return losses
    def loss(self, batch_inputs_dict: dict,
-             batch_data_samples: SampleList) -> dict:
+             batch_data_samples: SampleList) -> Dict[str, Tensor]:
        """Calculate losses from a batch of inputs and data samples.
        Args:
            batch_inputs_dict (dict): Input sample dict which
                includes 'points' and 'imgs' keys.
-                - points (list[torch.Tensor]): Point cloud of each sample.
+                - points (List[Tensor]): Point cloud of each sample.
-                - imgs (torch.Tensor, optional): Image tensor has shape
+                - imgs (Tensor, optional): Image tensor has shape (B, C, H, W).
-                  (B, C, H, W).
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The det3d data
-            batch_data_samples (list[:obj:`Det3DDataSample`]): The det3d
+                samples. It usually includes information such as `metainfo` and
-                data samples. It usually includes information such
+                `gt_pts_seg`.
-                as `metainfo` and `gt_pts_sem_seg`.
        Returns:
-            dict[str, Tensor]: a dictionary of loss components.
+            Dict[str, Tensor]: A dictionary of loss components.
        """
        # extract features using backbone
@@ -241,18 +271,18 @@ class EncoderDecoder3D(Base3DSegmentor):
        """Generating model input.
        Generate input by subtracting patch center and adding additional
-            features. Currently support colors and normalized xyz as features.
+        features. Currently supports colors and normalized xyz as features.
        Args:
-            coords (torch.Tensor): Sampled 3D point coordinate of shape [S, 3].
+            coords (Tensor): Sampled 3D point coordinate of shape [S, 3].
-            patch_center (torch.Tensor): Center coordinate of the patch.
+            patch_center (Tensor): Center coordinate of the patch.
-            coord_max (torch.Tensor): Max coordinate of all 3D points.
+            coord_max (Tensor): Max coordinate of all 3D points.
-            feats (torch.Tensor): Features of sampled points of shape [S, C].
+            feats (Tensor): Features of sampled points of shape [S, C].
-            use_normalized_coord (bool, optional): Whether to use normalized
+            use_normalized_coord (bool): Whether to use normalized xyz as
-                xyz as additional features. Defaults to False.
+                additional features. Defaults to False.
        Returns:
-            torch.Tensor: The generated input data of shape [S, 3+C'].
+            Tensor: The generated input data of shape [S, 3+C'].
        """
        # subtract patch center, the z dimension is not centered
        centered_coords = coords.clone()
@@ -281,23 +311,22 @@ class EncoderDecoder3D(Base3DSegmentor):
        Then sample points in each patch to batch points of a certain number.
        Args:
-            points (torch.Tensor): Input points of shape [N, 3+C].
+            points (Tensor): Input points of shape [N, 3+C].
            num_points (int): Number of points to be sampled in each patch.
-            block_size (float, optional): Size of a patch to sample.
+            block_size (float): Size of a patch to sample.
-            sample_rate (float, optional): Stride used in sliding patch.
+            sample_rate (float): Stride used in sliding patch. Defaults to 0.5.
-                Defaults to 0.5.
+            use_normalized_coord (bool): Whether to use normalized xyz as
-            use_normalized_coord (bool, optional): Whether to use normalized
+                additional features. Defaults to False.
-                xyz as additional features. Defaults to False.
+            eps (float): A value added to patch boundary to guarantee points
-            eps (float, optional): A value added to patch boundary to guarantee
+                coverage. Defaults to 1e-3.
-                points coverage. Defaults to 1e-3.
        Returns:
-            tuple:
+            Tuple[Tensor, Tensor]:
-                - patch_points (torch.Tensor): Points of different patches of
+            - patch_points (Tensor): Points of different patches of shape
-                  shape [K, N, 3+C].
+              [K, N, 3+C].
-                - patch_idxs (torch.Tensor): Index of each point in
+            - patch_idxs (Tensor): Index of each point in `patch_points` of
-                  `patch_points`, of shape [K, N].
+              shape [K, N].
        """
        device = points.device
        # we assume the first three dims are points' 3D coordinates
@@ -372,13 +401,13 @@ class EncoderDecoder3D(Base3DSegmentor):
        return patch_points, patch_idxs
-    def slide_inference(self, point: Tensor, img_meta: List[dict],
+    def slide_inference(self, point: Tensor, input_meta: dict,
                        rescale: bool) -> Tensor:
        """Inference by sliding-window with overlap.
        Args:
-            point (torch.Tensor): Input points of shape [N, 3+C].
+            point (Tensor): Input points of shape [N, 3+C].
-            img_meta (dict): Meta information of input sample.
+            input_meta (dict): Meta information of input sample.
            rescale (bool): Whether transform to original number of points.
                Will be used for voxelization based segmentors.
@@ -401,7 +430,8 @@ class EncoderDecoder3D(Base3DSegmentor):
            batch_points = patch_points[batch_idx:batch_idx + batch_size]
            batch_points = batch_points.view(-1, num_points, feats_dim)
            # batch_seg_logit is of shape [B, num_classes, N]
-            batch_seg_logit = self.encode_decode(batch_points, img_meta)
+            batch_seg_logit = self.encode_decode(batch_points,
+                                                 [input_meta] * batch_size)
            batch_seg_logit = batch_seg_logit.transpose(1, 2).contiguous()
            seg_logits.append(batch_seg_logit.view(-1, self.num_classes))
@@ -417,20 +447,21 @@ class EncoderDecoder3D(Base3DSegmentor):
        return preds.transpose(0, 1)  # to [num_classes, K*N]
-    def whole_inference(self, points: Tensor, input_metas: List[dict],
+    def whole_inference(self, points: Tensor, batch_input_metas: List[dict],
                        rescale: bool) -> Tensor:
        """Inference with full scene (one forward pass without sliding)."""
-        seg_logit = self.encode_decode(points, input_metas)
+        seg_logit = self.encode_decode(points, batch_input_metas)
        # TODO: if rescale and voxelization segmentor
        return seg_logit
-    def inference(self, points: Tensor, input_metas: List[dict],
+    def inference(self, points: Tensor, batch_input_metas: List[dict],
                  rescale: bool) -> Tensor:
        """Inference with slide/whole style.
        Args:
-            points (torch.Tensor): Input points of shape [B, N, 3+C].
+            points (Tensor): Input points of shape [B, N, 3+C].
-            input_metas (list[dict]): Meta information of each sample.
+            batch_input_metas (List[dict]): Meta information of a batch of
+                samples.
            rescale (bool): Whether transform to original number of points.
                Will be used for voxelization based segmentors.
@@ -440,11 +471,12 @@ class EncoderDecoder3D(Base3DSegmentor):
        assert self.test_cfg.mode in ['slide', 'whole']
        if self.test_cfg.mode == 'slide':
            seg_logit = torch.stack([
-                self.slide_inference(point, img_meta, rescale)
+                self.slide_inference(point, input_meta, rescale)
-                for point, img_meta in zip(points, input_metas)
+                for point, input_meta in zip(points, batch_input_metas)
            ], 0)
        else:
-            seg_logit = self.whole_inference(points, input_metas, rescale)
+            seg_logit = self.whole_inference(points, batch_input_metas,
+                                             rescale)
        output = F.softmax(seg_logit, dim=1)
        return output
@@ -455,23 +487,24 @@ class EncoderDecoder3D(Base3DSegmentor):
        """Simple test with single scene.
        Args:
-            batch_inputs_dict (dict): Input sample dict which
+            batch_inputs_dict (dict): Input sample dict which includes 'points'
-                includes 'points' and 'imgs' keys.
+                and 'imgs' keys.
-                - points (list[torch.Tensor]): Point cloud of each sample.
+                - points (List[Tensor]): Point cloud of each sample.
-                - imgs (torch.Tensor, optional): Image tensor has shape
+                - imgs (Tensor, optional): Image tensor has shape (B, C, H, W).
-                    (B, C, H, W).
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The det3d data
-            batch_data_samples (list[:obj:`Det3DDataSample`]): The det3d
+                samples. It usually includes information such as `metainfo` and
-                data samples. It usually includes information such
+                `gt_pts_seg`.
-                as `metainfo` and `gt_pts_sem_seg`.
            rescale (bool): Whether transform to original number of points.
                Will be used for voxelization based segmentors.
                Defaults to True.
        Returns:
-            list[dict]: The output prediction result with following keys:
+            List[:obj:`Det3DDataSample`]: Segmentation results of the input
+            points. Each Det3DDataSample usually contains:
-                - semantic_mask (Tensor): Segmentation mask of shape [N].
+            - ``pred_pts_seg`` (PointData): Prediction of 3D semantic
+              segmentation.
        """
        # 3D segmentation requires per-point prediction, so it's impossible
        # to use down-sampling to get a batch of scenes with same num_points
@@ -498,15 +531,14 @@ class EncoderDecoder3D(Base3DSegmentor):
        """Network forward process.
        Args:
-            batch_inputs_dict (dict): Input sample dict which
+            batch_inputs_dict (dict): Input sample dict which includes 'points'
-                includes 'points' and 'imgs' keys.
+                and 'imgs' keys.
-                - points (list[torch.Tensor]): Point cloud of each sample.
+                - points (List[Tensor]): Point cloud of each sample.
-                - imgs (torch.Tensor, optional): Image tensor has shape
+                - imgs (Tensor, optional): Image tensor has shape (B, C, H, W).
-                  (B, C, H, W).
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The det3d data
-            batch_data_samples (List[:obj:`Det3DDataSample`]): The seg
+                samples. It usually includes information such as `metainfo` and
-                data samples. It usually includes information such
+                `gt_pts_seg`.
-                as `metainfo` and `gt_pts_sem_seg`.
        Returns:
            Tensor: Forward output of model without any post-processes.