OpenDAS / ColossalAI / Commits

Commit ec5086c4, authored Mar 25, 2022 by Liang Bowen; committed by アマデウス on Mar 29, 2022
Parent: 53b1b6e3

    Refactored docstring to google style

Changes: 94. Showing 20 changed files with 1206 additions and 1180 deletions (+1206 −1180).
colossalai/nn/layer/colossalai_layer/linear.py          +44  −31
colossalai/nn/layer/colossalai_layer/normalization.py   +9   −12
colossalai/nn/layer/moe/experts.py                      +4   −5
colossalai/nn/layer/moe/layers.py                       +18  −33
colossalai/nn/layer/moe/utils.py                        +4   −4
colossalai/nn/layer/parallel_1d/_operation.py           +10  −10
colossalai/nn/layer/parallel_1d/_utils.py               +17  −14
colossalai/nn/layer/parallel_1d/layers.py               +138 −124
colossalai/nn/layer/parallel_2d/_operation.py           +173 −205
colossalai/nn/layer/parallel_2d/layers.py               +128 −113
colossalai/nn/layer/parallel_2p5d/_operation.py         +173 −212
colossalai/nn/layer/parallel_2p5d/layers.py             +126 −110
colossalai/nn/layer/parallel_3d/_operation.py           +120 −116
colossalai/nn/layer/parallel_3d/layers.py               +126 −112
colossalai/nn/layer/parallel_sequence/layers.py         +7   −8
colossalai/nn/layer/utils/common.py                     +8   −3
colossalai/nn/layer/vanilla/layers.py                   +62  −38
colossalai/nn/layer/wrapper/lambda_wrapper.py           +4   −5
colossalai/nn/loss/loss_1d.py                           +7   −8
colossalai/nn/loss/loss_2d.py                           +28  −17
colossalai/nn/layer/colossalai_layer/linear.py

...
@@ -31,22 +31,35 @@ _vocab_parallel_classifier = {
 class Linear(nn.Module):
-    """
-    Linear layer of colossalai
-    :param in_features: size of each input sample
-    :type in_features: int
-    :param out_features: size of each output sample
-    :type out_features: int
-    :param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
-    :type bias: bool, optional
-    :param dtype: The dtype of parameters, defaults to None
-    :type dtype: torch.dtype, optional
-    :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
-    :type weight_initializer: typing.Callable, optional
-    :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
-    :type bias_initializer: typing.Callable, optional
-    :param kwargs: Kwargs used for particular parallelisms
-    """
+    """Linear layer of colossalai.
+
+    Args:
+        in_features (int): size of each input sample.
+        out_features (int): size of each output sample.
+        bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+        weight_initializer (:class:`typing.Callable`, optional):
+            The initializer of weight, defaults to kaiming uniform initializer.
+        bias_initializer (:class:`typing.Callable`, optional):
+            The initializer of bias, defaults to xavier uniform initializer.
+
+    Note: ``kwargs`` would contain different parameters when you use different parallelisms.
+    The ``kwargs`` should contain parameters below:
+    ::
+
+        Linear1D:
+            gather_output: bool (optional, default to be false)
+            skip_bias_add: bool (optional, default to be false)
+        Linear2D:
+            skip_bias_add: bool (optional, default to be false)
+        Linear2p5D:
+            skip_bias_add: bool (optional, default to be false)
+        Linear3D:
+            None
+
+    More details about ``initializer`` please refer to
+    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
+    """
     def __init__(self,
...
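A minimal usage sketch of the kwargs dispatch described above (hypothetical sizes; assumes ``colossalai`` has been launched with a 1D tensor-parallel configuration, so ``Linear`` dispatches to ``Linear1D`` and accepts the ``gather_output`` kwarg from the list):

```python
# Minimal usage sketch (hypothetical sizes; assumes a 1D tensor-parallel
# context has been set up, so Linear dispatches to Linear1D).
import torch
from colossalai.nn import Linear

layer = Linear(in_features=512, out_features=256, bias=True, gather_output=True)
x = torch.randn(8, 512)
y = layer(x)  # (8, 256); output all-gathered across tensor-parallel ranks
```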
@@ -88,21 +101,21 @@ class Linear(nn.Module):
 class Classifier(nn.Module):
-    """
-    Classifier layer of colossalai
-    :param in_features: size of each input sample
-    :type in_features: int
-    :param num_classes: number of total classes for the dataset
-    :type num_classes: int
-    :param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
-    :type bias: bool, optional
-    :param dtype: The dtype of parameters, defaults to None
-    :type dtype: torch.dtype, optional
-    :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
-    :type weight_initializer: typing.Callable, optional
-    :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
-    :type bias_initializer: typing.Callable, optional
-    """
+    """Classifier layer of colossalai.
+
+    Args:
+        in_features (int): size of each input sample.
+        num_classes (int): number of classes.
+        weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
+        bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+        weight_initializer (:class:`typing.Callable`, optional):
+            The initializer of weight, defaults to kaiming uniform initializer.
+        bias_initializer (:class:`typing.Callable`, optional):
+            The initializer of bias, defaults to xavier uniform initializer.
+
+    More details about ``initializer`` please refer to
+    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
+    """
     def __init__(self,
...
colossalai/nn/layer/colossalai_layer/normalization.py

...
@@ -19,18 +19,15 @@ _parallel_layernorm = {
 class LayerNorm(nn.Module):
-    r"""
-    Layer Normalization for colossalai
-    :param normalized_shape: input shape from an expected input of size.
-        :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]`
-        If a single integer is used, it is treated as a singleton list, and this module will
-        normalize over the last dimension which is expected to be of that specific size.
-    :type normalized_shape: int
-    :param eps: a value added to the denominator for numerical stability, defaults to 1e-05
-    :type eps: float, optional
-    :param dtype: The dtype of parameters, defaults to None
-    :type dtype: torch.dtype, optional
+    r"""Layer Normalization for colossalai.
+
+    Args:
+        normalized_shape (int): input shape from an expected input of size.
+            :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]`
+            If a single integer is used, it is treated as a singleton list, and this module will
+            normalize over the last dimension which is expected to be of that specific size.
+        eps (float, optional): a value added to the denominator for numerical stability, defaults to 1e-05.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+    """
     def __init__(self, normalized_shape: int, eps=1e-05, dtype=None) -> None:
...
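The ``normalized_shape`` rule above mirrors ``torch.nn.LayerNorm``; a plain-PyTorch illustration of the singleton-list behaviour:

```python
import torch

hidden = 64
x = torch.randn(4, 16, hidden)
# normalized_shape=hidden is treated as [hidden]: only the last dimension
# (which must have size `hidden`) is normalized.
norm = torch.nn.LayerNorm(hidden, eps=1e-05)
y = norm(x)
assert torch.allclose(y.mean(-1), torch.zeros(4, 16), atol=1e-5)
```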
colossalai/nn/layer/moe/experts.py

...
@@ -28,11 +28,10 @@ class Experts(MoeExperts):
     moe model parallel group, where E is the number of experts. Every expert
     is an instance of the class 'expert' in the initialization parameters.

-    :param expert: The class of all experts
-    :param num_experts: The number of experts
-    :param expert_args: Args used to initialize experts
-    :type num_experts: int
+    Args:
+        expert_cls (:class:`torch.nn.Module`): The class of all experts.
+        num_experts (int): The number of experts.
+        expert_args: Args used to initialize experts; the args can be found in the corresponding expert class.
     """
     def __init__(self, expert_cls: Type[nn.Module], num_experts: int, **expert_args):
...
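A sketch of how ``**expert_args`` flows through, per the signature above. ``FFN`` and its ``d_model`` argument are hypothetical stand-ins, and the import path simply mirrors this diff's file layout:

```python
import torch.nn as nn
from colossalai.nn.layer.moe import Experts  # path assumed from this diff's layout

class FFN(nn.Module):  # hypothetical expert class, not part of the library
    def __init__(self, d_model: int):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(d_model, 4 * d_model), nn.GELU(),
                                 nn.Linear(4 * d_model, d_model))

    def forward(self, x):
        return self.net(x)

# **expert_args (here d_model=512) is forwarded to every FFN instance
experts = Experts(expert_cls=FFN, num_experts=4, d_model=512)
```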
colossalai/nn/layer/moe/layers.py

...
@@ -18,19 +18,13 @@ class Top1Router(nn.Module):
     for routing usage. More details can be found in the paper about the Switch Transformer
     from Google.

-    :param capacity_factor_train: Capacity factor in routing during training
-    :param capacity_factor_eval: Capacity factor in routing during evaluation
-    :param min_capacity: The minimum number of the capacity of each expert
-    :param select_policy: The policy about tokens selection
-    :param noisy_func: Noisy function used in logits
-    :param drop_tks: Whether drops tokens in evaluation
-    :type capacity_factor_train: float, optional
-    :type capacity_factor_eval: float, optional
-    :type min_capacity: int, optional
-    :type select_policy: str, optional
-    :type noisy_func: Callable, optional
-    :type drop_tks: bool, optional
+    Args:
+        capacity_factor_train (float, optional): Capacity factor in routing during training.
+        capacity_factor_eval (float, optional): Capacity factor in routing during evaluation.
+        min_capacity (int, optional): The minimum number of the capacity of each expert.
+        select_policy (str, optional): The policy of token selection.
+        noisy_func (:class:`typing.Callable`, optional): Noisy function used in logits.
+        drop_tks (bool, optional): Whether to drop tokens in evaluation.
     """
     def __init__(self,
...
@@ -119,17 +113,12 @@ class Top2Router(nn.Module):
"""Top2 router that returns the dispatch mask [s, e, c] and combine weight [s, e, c]
for routing usage. More deailted function can be found in the paper about ViT-MoE.
:param capacity_factor_train: Capacity factor in routing during training
:param capacity_factor_eval: Capacity factor in routing during evaluation
:param min_capacity: The minimum number of the capacity of each expert
:param noisy_func: Noisy function used in logits
:param drop_tks: Whether drops tokens in evaluation
:type capacity_factor_train: float, optional
:type capacity_factor_eval: float, optional
:type min_capacity: int, optional
:type noisy_func: Callable, optional
:type drop_tks: bool, optional
Args:
capacity_factor_train (float, optional): Capacity factor in routing of training.
capacity_factor_eval (float, optional): Capacity factor in routing of evaluation.
min_capacity (int, optional): The minimum number of the capacity of each expert
noisy_func (:class:`typing.Callable`, optional): Noisy function used in logits.
drop_tks (bool, optional): Whether drops tokens in evaluation.
"""
     def __init__(self,
...
@@ -239,15 +228,11 @@ class MoeLayer(nn.Module):
     the moe tensor group by all-to-all communication. Then it will get the output of all
     experts and exchange the output. At last returns the output of the moe system.

-    :param dim_model: Dimension of model
-    :param num_experts: The number of experts
-    :param router: Instance of router used in routing
-    :param experts: Instance of experts generated by Expert
-    :type dim_model: int
-    :type num_experts: int
-    :type router: nn.Module
-    :type experts: nn.Module
+    Args:
+        dim_model (int): Dimension of model.
+        num_experts (int): The number of experts.
+        router (:class:`torch.nn.Module`): Instance of router used in routing.
+        experts (:class:`torch.nn.Module`): Instance of experts generated by Expert.
     """
     def __init__(self, dim_model: int, num_experts: int, router: nn.Module, experts: MoeExperts):
...
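Assembling the pieces per the ``MoeLayer.__init__`` signature above, a sketch reusing the hypothetical ``FFN`` expert from the ``Experts`` example (router constructor arguments and the distributed launch are assumed/omitted):

```python
from colossalai.nn.layer.moe import Experts, MoeLayer, Top2Router  # assumed exports

router = Top2Router(capacity_factor_train=1.25)   # args per the docstring above
experts = Experts(expert_cls=FFN, num_experts=4, d_model=512)
moe = MoeLayer(dim_model=512, num_experts=4, router=router, experts=experts)
# forward(): tokens are dispatched to experts via all-to-all, then combined
```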
colossalai/nn/layer/moe/utils.py

...
@@ -16,8 +16,8 @@ class NormalNoiseGenerator:
     All noise is generated from a normal distribution (0, 1 / E^2), where
     E = the number of experts.

-    :param num_experts: The number of experts
-    :type num_experts: int
+    Args:
+        num_experts (int): The number of experts.
     """
     def __init__(self, num_experts: int):
...
@@ -37,8 +37,8 @@ class UniformNoiseGenerator:
     Makes models more resilient to rounding errors introduced by bfloat16.
     This seems particularly important for logits.

-    :param eps: Epsilon in generator
-    :type eps: float
+    Args:
+        eps (float, optional): Epsilon in generator, defaults to 1e-2.
     """
     def __init__(self, eps: float = 1e-2):
...
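The two distributions are easy to reproduce directly; a plain-PyTorch sketch of :math:`N(0, 1/E^2)` noise on router logits (the additive/multiplicative application sites are assumptions for illustration):

```python
import torch

num_experts = 4                                 # E
logits = torch.randn(8, num_experts)
# N(0, 1/E^2): a standard normal scaled by 1/E has variance 1/E**2
normal_noise = torch.randn_like(logits) / num_experts
noisy_logits = logits + normal_noise            # additive use assumed
# UniformNoiseGenerator-style jitter in [1 - eps, 1 + eps]
eps = 1e-2
jitter = torch.empty_like(logits).uniform_(1.0 - eps, 1.0 + eps)
jittered_logits = logits * jitter               # multiplicative use assumed
```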
colossalai/nn/layer/parallel_1d/_operation.py

...
@@ -7,17 +7,17 @@ except:
 class FusedLayerNormAffineFunction1D(torch.autograd.Function):
-    r"""
-    Layernorm
-    :param input: input maxtrix
-    :param weight: weight matrix
-    :param bias: bias matrix
-    :param normalized_shape: input shape from an expected input of size.
-        :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]`
-        If a single integer is used, it is treated as a singleton list, and this module will
-        normalize over the last dimension which is expected to be of that specific size.
-    :param eps: a value added to the denominator for numerical stability
+    r"""Layernorm.
+
+    Args:
+        input: input matrix.
+        weight: weight matrix.
+        bias: bias matrix.
+        normalized_shape: input shape from an expected input of size.
+            :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]`
+            If a single integer is used, it is treated as a singleton list, and this module will
+            normalize over the last dimension which is expected to be of that specific size.
+        eps: a value added to the denominator for numerical stability.
     """
     @staticmethod
...
colossalai/nn/layer/parallel_1d/_utils.py

...
@@ -78,8 +78,9 @@ class _ReduceGrad(torch.autograd.Function):
"""
Pass the input to the model parallel region.
:param input_: input matrix
:param parallel_mode: parallel mode
Args:
input_: input matrix.
parallel_mode: parallel mode.
"""
     @staticmethod
...
@@ -99,9 +100,10 @@ class _ReduceGrad(torch.autograd.Function):
 class _ReduceInput(torch.autograd.Function):
     """
     All-reduce the input from the model parallel region.

-    :param input_: input matrix
-    :param parallel_mode: parallel mode
+    Args:
+        input_: input matrix.
+        parallel_mode: parallel mode.
     """
     @staticmethod
...
@@ -121,9 +123,10 @@ class _SplitForwardGatherBackward(torch.autograd.Function):
"""
Split the input and keep only the corresponding chuck to the rank.
:param input_: input matrix
:param parallel_mode: parallel mode
:param dim: dimension
Args:
input_: input matrix.
parallel_mode: parallel mode.
dim: dimension
"""
     @staticmethod
...
@@ -142,12 +145,12 @@ class _SplitForwardGatherBackward(torch.autograd.Function):
 class _GatherForwardSplitBackward(torch.autograd.Function):
-    """
-    Gather the input from model parallel region and concatinate.
-    :param input_: input matrix
-    :param parallel_mode: parallel mode
-    :param dim: dimension
+    """
+    Gather the input from model parallel region and concatenate.
+
+    Args:
+        input_: input matrix.
+        parallel_mode: parallel mode.
+        dim: dimension.
     """
     @staticmethod
...
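The forward/backward pairing of these autograd functions is easiest to see with plain tensor ops; a one-process sketch with the communication replaced by ``chunk``/``cat``:

```python
import torch

world_size, dim = 2, -1
x = torch.randn(4, 8)

# _SplitForwardGatherBackward: forward keeps only this rank's chunk ...
chunks = x.chunk(world_size, dim=dim)      # rank i would keep chunks[i]
# _GatherForwardSplitBackward: forward concatenates all ranks' chunks back
gathered = torch.cat(chunks, dim=dim)
assert torch.equal(gathered, x)            # gather exactly undoes the split
```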
colossalai/nn/layer/parallel_1d/layers.py

...
@@ -24,24 +24,23 @@ from ._utils import (gather_forward_split_backward, get_parallel_input, reduce_g
 @LAYERS.register_module
 class Linear1D(torch.nn.Module):
-    """
-    Linear layer for 1D parallelism
-    :param in_features: size of each input sample
-    :type in_features: int
-    :param out_features: size of each output sample
-    :type out_features: int
-    :param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
-    :type bias: bool, optional
-    :param dtype: The dtype of parameters, defaults to None
-    :type dtype: torch.dtype, optional
-    :param skip_bias_add: If set to ``True``, it will skip bias add for linear layer,
-        which is preserved for kernel fusion, defaults to False
-    :type skip_bias_add: bool, optional
-    :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
-    :type weight_initializer: typing.Callable, optional
-    :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
-    :type bias_initializer: typing.Callable, optional
+    r"""Linear layer for 1D parallelism.
+
+    Args:
+        in_features (int): size of each input sample.
+        out_features (int): size of each output sample.
+        bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+        gather_output (bool, optional): Whether to call all-gather on output, defaults to False.
+        skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
+            which is preserved for kernel fusion, defaults to False.
+        weight_initializer (:class:`typing.Callable`, optional):
+            The initializer of weight, defaults to kaiming uniform initializer.
+        bias_initializer (:class:`typing.Callable`, optional):
+            The initializer of bias, defaults to xavier uniform initializer.
+
+    More details about ``initializer`` please refer to
+    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
+    """
     def __init__(self,
...
@@ -88,23 +87,21 @@ class Linear1D(torch.nn.Module):
 @LAYERS.register_module
 class Classifier1D(ParallelLayer):
-    """RowLinear with given weight
-    Classifier of 1D parallelism
-    :param in_features: size of input features
-    :type in_features: int
-    :param num_classes: number of classes in the dataset
-    :type num_classes: int
-    :param weight: weight of the classifier, defaults to True
-    :type weight: torch.nn.Parameter, optional
-    :param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
-    :type bias: bool, optional
-    :param dtype: The dtype of parameters, defaults to None
-    :type dtype: torch.dtype, optional
-    :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
-    :type weight_initializer: typing.Callable, optional
-    :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
-    :type bias_initializer: typing.Callable, optional
+    r"""RowLinear with given weight. Classifier of 1D parallelism.
+
+    Args:
+        in_features (int): size of each input sample.
+        num_classes (int): number of classes.
+        weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
+        bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+        weight_initializer (:class:`typing.Callable`, optional):
+            The initializer of weight, defaults to kaiming uniform initializer.
+        bias_initializer (:class:`typing.Callable`, optional):
+            The initializer of bias, defaults to xavier uniform initializer.
+
+    More details about ``initializer`` please refer to
+    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
+    """
     def __init__(self,
...
@@ -171,23 +168,21 @@ class Classifier1D(ParallelLayer):
 @LAYERS.register_module
 class VocabParallelClassifier1D(ParallelLayer):
-    """ColLinear with given weight
-    Classifier of 1D parallelism
-    :param in_features: size of input features
-    :type in_features: int
-    :param num_classes: number of classes in the dataset
-    :type num_classes: int
-    :param weight: weight of the classifier, defaults to True
-    :type weight: torch.nn.Parameter, optional
-    :param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
-    :type bias: bool, optional
-    :param dtype: The dtype of parameters, defaults to None
-    :type dtype: torch.dtype, optional
-    :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
-    :type weight_initializer: typing.Callable, optional
-    :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
-    :type bias_initializer: typing.Callable, optional
+    r"""ColLinear with given weight. Classifier of 1D parallelism.
+
+    Args:
+        in_features (int): size of each input sample.
+        num_classes (int): number of classes.
+        weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
+        bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+        weight_initializer (:class:`typing.Callable`, optional):
+            The initializer of weight, defaults to kaiming uniform initializer.
+        bias_initializer (:class:`typing.Callable`, optional):
+            The initializer of bias, defaults to xavier uniform initializer.
+
+    More details about ``initializer`` please refer to
+    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
+    """
     def __init__(self,
...
@@ -249,30 +244,28 @@ class VocabParallelClassifier1D(ParallelLayer):
 @LAYERS.register_module
 class Linear1D_Col(ParallelLayer):
-    """Linear layer with column parallelism.
+    r"""Linear layer with column parallelism.
+
     The linear layer is defined as :math:`Y = XA + b`. A is parallelized along
     its second dimension as :math:`A = [A_1, ..., A_p]`.

-    :param in_features: first dimension of matrix A.
-    :type in_features: int
-    :param output_size: second dimension of matrix A.
-    :type output_size: int
-    :param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
-    :type bias: bool, optional
-    :param dtype: The dtype of parameters, defaults to None
-    :type dtype: torch.dtype, optional
-    :param gather_output: If true, call all-gether on output and make Y avaiable
-        to all GPUs, otherwise, every GPU will have its output
-        which is :math:`Y_i = XA_i`, defaults to False
-    :type gather_output: bool, optional
-    :param skip_bias_add: If set to ``True``, it will skip bias add for linear layer,
-        which is preserved for kernel fusion, defaults to False
-    :type skip_bias_add: bool, optional
-    :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
-    :type weight_initializer: typing.Callable, optional
-    :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
-    :type bias_initializer: typing.Callable, optional
+    Args:
+        in_features (int): size of each input sample.
+        out_features (int): size of each output sample.
+        bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+        gather_output (bool, optional): If true, call all-gather on output and make Y available
+            to all GPUs, otherwise, every GPU will have its output
+            which is :math:`Y_i = XA_i`, defaults to False.
+        skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
+            which is preserved for kernel fusion, defaults to False.
+        weight_initializer (:class:`typing.Callable`, optional):
+            The initializer of weight, defaults to kaiming uniform initializer.
+        bias_initializer (:class:`typing.Callable`, optional):
+            The initializer of bias, defaults to xavier uniform initializer.
+
+    More details about ``initializer`` please refer to
+    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
+    """
     def __init__(self,
...
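A one-process sanity check of the column partitioning :math:`A = [A_1, ..., A_p]` and of what ``gather_output`` reconstructs:

```python
import torch

X = torch.randn(4, 6)                    # input
A = torch.randn(6, 8)                    # full weight
A1, A2 = A.chunk(2, dim=1)               # column blocks on 2 "ranks"
Y_gathered = torch.cat([X @ A1, X @ A2], dim=1)   # what gather_output does
assert torch.allclose(Y_gathered, X @ A, atol=1e-5)
```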
@@ -343,25 +336,23 @@ class Linear1D_Col(ParallelLayer):
 @LAYERS.register_module
 class Linear1D_Row(ParallelLayer):
-    """ Linear layer with row parallelism
-    :param in_features: size of each input sample
-    :type in_features: int
-    :param out_features: size of each output sample
-    :type out_features: int
-    :param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
-    :type bias: bool, optional
-    :param dtype: The dtype of parameters, defaults to None
-    :type dtype: torch.dtype, optional
-    :param parallel_input: If set to ``True``, it's assumed that the input is splitted, defaults to False
-    :type parallel_input: bool, optional
-    :param skip_bias_add: If set to ``True``, it will skip bias add for linear layer,
-        which is preserved for kernel fusion, defaults to False
-    :type skip_bias_add: bool, optional
-    :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
-    :type weight_initializer: typing.Callable, optional
-    :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
-    :type bias_initializer: typing.Callable, optional
+    r"""Linear layer with row parallelism.
+
+    Args:
+        in_features (int): size of each input sample.
+        out_features (int): size of each output sample.
+        bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+        parallel_input (bool, optional): If set to ``True``, it's assumed that the input is split, defaults to False.
+        skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
+            which is preserved for kernel fusion, defaults to False.
+        weight_initializer (:class:`typing.Callable`, optional):
+            The initializer of weight, defaults to kaiming uniform initializer.
+        bias_initializer (:class:`typing.Callable`, optional):
+            The initializer of bias, defaults to xavier uniform initializer.
+
+    More details about ``initializer`` please refer to
+    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
+    """
     def __init__(self,
...
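Row parallelism is the dual construction: the weight is split along its first dimension, the input along its last, and the partial products sum to the full result (an all-reduce in the real layer). A one-process sketch:

```python
import torch

X = torch.randn(4, 6)
A = torch.randn(6, 8)
X1, X2 = X.chunk(2, dim=1)     # parallel_input: X split column-wise
A1, A2 = A.chunk(2, dim=0)     # A split row-wise across 2 "ranks"
# the sum of partial products equals the all-reduced full result
assert torch.allclose(X1 @ A1 + X2 @ A2, X @ A, atol=1e-5)
```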
@@ -432,21 +423,33 @@ class Linear1D_Row(ParallelLayer):
 @LAYERS.register_module
 class Embedding1D(ParallelLayer):
-    """
-    Embedding for 1D parallelism
-    :param num_embeddings: number of embeddings
-    :type num_embeddings: int
-    :param embedding_dim: dimension of embedding
-    :type embedding_dim: int
-    :param padding_idx: index of padding, defaults to None
-    :type padding_idx: int, optional
-    :param dtype: The dtype of parameters, defaults to None
-    :type dtype: torch.dtype, optional
-    :param weight_initializer: The intializer of weight, defaults to normal initializer
-    :type weight_initializer: typing.Callable, optional
-    :param args: Args used in F.embedding
-    :param kwargs: Kwargs used in F.embedding
+    r"""Embedding for 1D parallelism.
+
+    Args:
+        num_embeddings (int): number of embeddings.
+        embedding_dim (int): dimension of embedding.
+        padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
+            therefore, the embedding vector at padding_idx is not updated during training,
+            i.e. it remains as a fixed “pad”, defaults to None.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+        weight_initializer (:class:`typing.Callable`, optional):
+            The initializer of weight, defaults to normal initializer.
+
+    The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
+    ::
+
+        max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
+            renormalized to have norm max_norm. Note: this will modify weight in-place.
+        norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
+        scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
+            of frequency of the words in the mini-batch. Default False.
+        sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
+
+    More details about ``args`` and ``kwargs`` could be found in
+    `Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
+
+    More details about ``initializer`` please refer to
+    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
+    """
     def __init__(self,
...
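Since ``args``/``kwargs`` are forwarded straight to ``torch.nn.functional.embedding``, the options listed above behave exactly as in plain PyTorch:

```python
import torch
import torch.nn.functional as F

weight = torch.randn(10, 4)          # 10 embeddings of dimension 4
tokens = torch.tensor([[3, 1, 0]])
out = F.embedding(tokens, weight, padding_idx=0, max_norm=1.0)
# row 0 receives no gradient (fixed "pad"); vectors with norm > 1 are
# renormalized in-place to norm 1 by max_norm
```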
@@ -499,20 +502,33 @@ class Embedding1D(ParallelLayer):
 @LAYERS.register_module
 class VocabParallelEmbedding1D(torch.nn.Module):
-    """Embedding parallelized in the vocabulary dimension.
-    :param num_embeddings: number of embeddings
-    :type num_embeddings: int
-    :param embedding_dim: dimension of embedding
-    :type embedding_dim: int
-    :param padding_idx: index of padding, defaults to None
-    :type padding_idx: int, optional
-    :param dtype: The dtype of parameters, defaults to None
-    :type dtype: torch.dtype, optional
-    :param weight_initializer: The intializer of weight, defaults to normal initializer
-    :type weight_initializer: typing.Callable, optional
-    :param args: Args used in F.embedding
-    :param kwargs: Kwargs used in F.embedding
+    r"""Embedding parallelized in the vocabulary dimension.
+
+    Args:
+        num_embeddings (int): number of embeddings.
+        embedding_dim (int): dimension of embedding.
+        padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
+            therefore, the embedding vector at padding_idx is not updated during training,
+            i.e. it remains as a fixed “pad”, defaults to None.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+        weight_initializer (:class:`typing.Callable`, optional):
+            The initializer of weight, defaults to normal initializer.
+
+    The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
+    ::
+
+        max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
+            renormalized to have norm max_norm. Note: this will modify weight in-place.
+        norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
+        scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
+            of frequency of the words in the mini-batch. Default False.
+        sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
+
+    More details about ``args`` and ``kwargs`` could be found in
+    `Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
+
+    More details about ``initializer`` please refer to
+    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
+    """
     def __init__(self,
...
@@ -578,13 +594,11 @@ class VocabParallelEmbedding1D(torch.nn.Module):
 @LAYERS.register_module
 class Dropout1D(ParallelLayer):
-    """
-    Dropout layer of 1D parallelism
-    :param p: dropout rate, defaults to 0.5
-    :type p: float, optional
-    :param inplace: If set to ``True``, will do this operation in-place, defaults tp ``False``
-    :type inplace: bool, optional
+    """Dropout layer of 1D parallelism.
+
+    Args:
+        p (float, optional): probability of an element to be zeroed, defaults to 0.5.
+        inplace (bool, optional): whether to do dropout in-place, defaults to False.
     """
     def __init__(self, p: float = 0.5, inplace: bool = False):
...
colossalai/nn/layer/parallel_2d/_operation.py

...
@@ -21,27 +21,26 @@ def matmul_2d(
     row_parallel_mode=ParallelMode.PARALLEL_2D_ROW,
     col_parallel_mode=ParallelMode.PARALLEL_2D_COL,
 ):
"""
Matrix multiplication for 2D parallelism
:param a: matrix :math:`A`
:type a: torch.tensor
:param b: matrix :math:`B`
:type b: torch.tensor
:param summa_dim: dimension of SUMMA fo 2D parallelism
:type summa_dim: int
:param out_shape: shape of output tensor
:type out_shape: tuple
:param row_rank: the rank of row, defaults to None
:type row_rank: int, optional
:param col_rank: the rank of column, defaults to None
:type col_rank: int, optional
:param row_parallel_mode: row parallel mode, defaults to ParallelMode.PARALLEL_2D_ROW
:type row_parallel_mode: str, optional
:param col_parallel_mode: column parallel mode, defaults to ParallelMode.PARALLEL_2D_COL
:type col_parallel_mode: str, optional
:return: :math:`C = AB`
:rtype: torch.tensor
r
"""Matrix multiplication for 2D parallelism.
Args:
a (:class:`torch.tensor`): matrix :math:`A`.
b (:class:`torch.tensor`): matrix :math:`B`.
summa_dim (int): dimension of SUMMA fo 2D parallelism.
out_shape (:class:`torch.size`): shape of output tensor.
row_rank (int, optional): the rank of row, defaults to None.
col_rank (int, optional): the rank of column, defaults to None.
row_parallel_mode (:class:`colossalai.context.ParallelMode`, optional):
row parallel mode, defaults to ParallelMode.PARALLEL_2D_ROW.
col_parallel_mode (:class:`colossalai.context.ParallelMode`, optional):
column parallel mode, defaults to ParallelMode.PARALLEL_2D_COL.
Returns:
:class:`torch.tensor`: :math:`C = AB`.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
"""
     if row_rank is None:
        row_rank = gpc.get_local_rank(col_parallel_mode)
...
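For intuition, SUMMA builds :math:`C = AB` by accumulating products of broadcast blocks; a single-process toy with ``summa_dim = 2`` (block bookkeeping only, no communication):

```python
import torch

d = 2                                          # summa_dim (2x2 process grid)
A, B = torch.randn(4, 4), torch.randn(4, 4)
Ab = [r.chunk(d, dim=1) for r in A.chunk(d, dim=0)]   # A's (i, k) blocks
Bb = [r.chunk(d, dim=1) for r in B.chunk(d, dim=0)]   # B's (k, j) blocks
# step k: rank (i, j) receives A[i][k] (row broadcast) and B[k][j] (col broadcast)
C = [[sum(Ab[i][k] @ Bb[k][j] for k in range(d)) for j in range(d)]
     for i in range(d)]
full = torch.cat([torch.cat(row, dim=1) for row in C], dim=0)
assert torch.allclose(full, A @ B, atol=1e-5)
```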
@@ -135,35 +134,26 @@ def classifier_2d(A: Tensor, B: Tensor, bias: Optional[Tensor], summa_dim: int,
                   row_rank: int, col_rank: int, row_parallel_mode: ParallelMode, col_parallel_mode: ParallelMode,
                   data_parallel_rank: int, pipeline_parallel_rank: int, pipeline_parallel_size: int,
                   tensor_parallel_size: int) -> Tensor:
"""
2D parallel classifier
:param a: matrix :math:`A`
:type a: torch.tensor
:param b: matrix :math:`B`
:type b: torch.tensor
:param bias: matrix of bias
:type bias: torch.tensor, optional
:param summa_dim: dimension of SUMMA fo 2D parallelism
:type summa_dim: int
:param out_shape: shape of output tensor
:type out_shape: tuple
:param row_rank: the rank of row
:type row_rank: int
:param col_rank: the rank of column
:type col_rank: int
:param row_parallel_mode: row parallel mode
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param col_parallel_mode: column parallel mode
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param data_parallel_rank: data parallel rank
:type data_parallel_rank: int
:param pipeline_parallel_rank: pipeline parallel rank
:type pipeline_parallel_rank: int
:param pipeline_parallel_size: pipeline parallel size
:type pipeline_parallel_size: int
:param tensor_parallel_size: tensor parallel size
:type tensor_parallel_size: int
r
"""2D parallel classifier.
Args:
A (:class:`torch.tensor`): matrix :math:`A`.
B (:class:`torch.tensor`): matrix :math:`B`.
bias (:class:`torch.tensor`, optional): matrix of bias.
summa_dim (int): dimension of SUMMA fo 2D parallelism.
out_shape (:class:`torch.size`): shape of output tensor.
row_rank (int, optional): the rank of row, defaults to None.
col_rank (int, optional): the rank of column, defaults to None.
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
data_parallel_rank (int): data parallel rank.
pipeline_parallel_rank (int): pipeline parallel rank
pipeline_parallel_size (int): pipeline parallel size.
tensor_parallel_size (int): tensor parallel size.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
"""
     return _Classifier2D.apply(A, B, bias, summa_dim, out_shape, row_rank, col_rank, row_parallel_mode,
                                col_parallel_mode, data_parallel_rank, pipeline_parallel_rank, pipeline_parallel_size,
...
@@ -171,33 +161,25 @@ def classifier_2d(A: Tensor, B: Tensor, bias: Optional[Tensor], summa_dim: int,
 class Matmul_AB_2D(torch.autograd.Function):
-    """
-    Matrix multiplication for :math:`C = AB`
-    :param a: matrix :math:`A`
-    :type a: torch.tensor
-    :param b: matrix :math:`B`
-    :type b: torch.tensor
-    :param summa_dim: dimension of SUMMA fo 2D parallelism
-    :type summa_dim: int
-    :param out_shape: shape of output tensor
-    :type out_shape: tuple
-    :param row_rank: the rank of row
-    :type row_rank: int
-    :param col_rank: the rank of column
-    :type col_rank: int
-    :param row_parallel_mode: row parallel mode
-    :type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
-    :param col_parallel_mode: column parallel mode
-    :type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
-    :param data_parallel_rank: data parallel rank
-    :type data_parallel_rank: int
-    :param pipeline_parallel_rank: pipeline parallel rank
-    :type pipeline_parallel_rank: int
-    :param pipeline_parallel_size: pipeline parallel size
-    :type pipeline_parallel_size: int
-    :param tensor_parallel_size: tensor parallel size
-    :type tensor_parallel_size: int
+    r"""Matrix multiplication for :math:`C = AB`.
+
+    Args:
+        A (:class:`torch.tensor`): matrix :math:`A`.
+        B (:class:`torch.tensor`): matrix :math:`B`.
+        summa_dim (int): dimension of SUMMA for 2D parallelism.
+        out_shape (:class:`torch.Size`): shape of output tensor.
+        row_rank (int, optional): the rank of row, defaults to None.
+        col_rank (int, optional): the rank of column, defaults to None.
+        row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
+        col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
+        data_parallel_rank (int): data parallel rank.
+        pipeline_parallel_rank (int): pipeline parallel rank.
+        pipeline_parallel_size (int): pipeline parallel size.
+        tensor_parallel_size (int): tensor parallel size.
+
+    Note:
+        The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
+        in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
+    """
     @staticmethod
     @custom_fwd(cast_inputs=torch.float16)
...
@@ -305,33 +287,26 @@ class Matmul_AB_2D(torch.autograd.Function):
 class Matmul_ABT_2D(torch.autograd.Function):
-    """
-    Matrix multiplication for :math:`C = AB^T`
-    :param a: matrix :math:`A`
-    :type a: torch.tensor
-    :param b: matrix :math:`B`
-    :type b: torch.tensor
-    :param summa_dim: dimension of SUMMA fo 2D parallelism
-    :type summa_dim: int
-    :param out_shape: shape of output tensor
-    :type out_shape: tuple
-    :param row_rank: the rank of row
-    :type row_rank: int
-    :param col_rank: the rank of column
-    :type col_rank: int
-    :param row_parallel_mode: row parallel mode
-    :type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
-    :param col_parallel_mode: column parallel mode
-    :type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
-    :param data_parallel_rank: data parallel rank
-    :type data_parallel_rank: int
-    :param pipeline_parallel_rank: pipeline parallel rank
-    :type pipeline_parallel_rank: int
-    :param pipeline_parallel_size: pipeline parallel size
-    :type pipeline_parallel_size: int
-    :param tensor_parallel_size: tensor parallel size
-    :type tensor_parallel_size: int
+    r"""Matrix multiplication for :math:`C = AB^T`.
+
+    Args:
+        A (:class:`torch.tensor`): matrix :math:`A`.
+        B (:class:`torch.tensor`): matrix :math:`B`.
+        summa_dim (int): dimension of SUMMA for 2D parallelism.
+        out_shape (:class:`torch.Size`): shape of output tensor.
+        row_rank (int, optional): the rank of row, defaults to None.
+        col_rank (int, optional): the rank of column, defaults to None.
+        row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
+        col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
+        data_parallel_rank (int): data parallel rank.
+        pipeline_parallel_rank (int): pipeline parallel rank.
+        pipeline_parallel_size (int): pipeline parallel size.
+        tensor_parallel_size (int): tensor parallel size.
+
+    Note:
+        The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
+        in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
+    """
     @staticmethod
     @custom_fwd(cast_inputs=torch.float16)
...
@@ -445,33 +420,25 @@ class Matmul_ABT_2D(torch.autograd.Function):
 class Matmul_ATB_2D(torch.autograd.Function):
-    """
-    Matrix multiplication for :math:`C = A^TB`
-    :param a: matrix :math:`A`
-    :type a: torch.tensor
-    :param b: matrix :math:`B`
-    :type b: torch.tensor
-    :param summa_dim: dimension of SUMMA fo 2D parallelism
-    :type summa_dim: int
-    :param out_shape: shape of output tensor
-    :type out_shape: tuple
-    :param row_rank: the rank of row
-    :type row_rank: int
-    :param col_rank: the rank of column
-    :type col_rank: int
-    :param row_parallel_mode: row parallel mode
-    :type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
-    :param col_parallel_mode: column parallel mode
-    :type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
-    :param data_parallel_rank: data parallel rank
-    :type data_parallel_rank: int
-    :param pipeline_parallel_rank: pipeline parallel rank
-    :type pipeline_parallel_rank: int
-    :param pipeline_parallel_size: pipeline parallel size
-    :type pipeline_parallel_size: int
-    :param tensor_parallel_size: tensor parallel size
-    :type tensor_parallel_size: int
+    r"""Matrix multiplication for :math:`C = A^TB`.
+
+    Args:
+        A (:class:`torch.tensor`): matrix :math:`A`.
+        B (:class:`torch.tensor`): matrix :math:`B`.
+        summa_dim (int): dimension of SUMMA for 2D parallelism.
+        out_shape (:class:`torch.Size`): shape of output tensor.
+        row_rank (int, optional): the rank of row, defaults to None.
+        col_rank (int, optional): the rank of column, defaults to None.
+        row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
+        col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
+        data_parallel_rank (int): data parallel rank.
+        pipeline_parallel_rank (int): pipeline parallel rank.
+        pipeline_parallel_size (int): pipeline parallel size.
+        tensor_parallel_size (int): tensor parallel size.
+
+    Note:
+        The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
+        in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
+    """
     @staticmethod
     @custom_fwd(cast_inputs=torch.float16)
...
@@ -639,33 +606,26 @@ def add_bias_2d(input_: Tensor, bias: Tensor, output_size_per_partition: int, ro
                 row_parallel_mode: ParallelMode, col_parallel_mode: ParallelMode, skip_bias_add: bool,
                 data_parallel_rank: int, pipeline_parallel_rank: int, pipeline_parallel_size: int,
                 tensor_parallel_size: int) -> Tensor:
"""
Matrix add bias: :math:`C = A + b`
:param input_: matrix :math:`A`
:type input_: torch.tensor
:param bias: matrix :math:`b`
:type bias: torch.tensor
:param output_size_per_partition: size of ouput per partition
:type output_size_per_partition: int
:param row_rank: the rank of row
:type row_rank: int
:param col_rank: the rank of column
:type col_rank: int
:param row_parallel_mode: row parallel mode
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param col_parallel_mode: column parallel mode
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param skip_bias_add: If set to ``True``, it will skip bias add for linear layer, which is preserved for kernel fusion
:type skip_bias_add: bool
:param data_parallel_rank: data parallel rank
:type data_parallel_rank: int
:param pipeline_parallel_rank: pipeline parallel rank
:type pipeline_parallel_rank: int
:param pipeline_parallel_size: pipeline parallel size
:type pipeline_parallel_size: int
:param tensor_parallel_size: tensor parallel size
:type tensor_parallel_size: int
r
"""Matrix add bias: :math:`C = A + b`.
Args:
input_ (:class:`torch.tensor`): matrix :math:`A`.
bias (:class:`torch.tensor`): matrix :math:`B`.
output_size_per_partition (int): size of output per partition.
row_rank (int, optional): the rank of row, defaults to None.
col_rank (int, optional): the rank of column, defaults to None.
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
skip_bias_add (bool):
If set to ``True``, it will skip bias add for linear layer, which is preserved for kernel fusion.
data_parallel_rank (int): data parallel rank.
pipeline_parallel_rank (int): pipeline parallel rank
pipeline_parallel_size (int): pipeline parallel size.
tensor_parallel_size (int): tensor parallel size.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
"""
     return _Add_Bias_2D.apply(input_, bias, output_size_per_partition, row_rank, col_rank, row_parallel_mode,
                               col_parallel_mode, skip_bias_add, data_parallel_rank, pipeline_parallel_rank,
...
@@ -711,21 +671,19 @@ class _Layernorm_2D(torch.autograd.Function):
 def layernorm_2d(input_: Tensor, E_x: Tensor, Var_x: Tensor, hidden_size: int, row_parallel_mode: ParallelMode,
                  col_parallel_mode: ParallelMode) -> Tensor:
"""
Layernorm
:param input_: input maxtrix
:type input_: torch.tensor
:param E_x: mean
:type E_x: torch.tensor
:param Var_x: variance
:type Var_x: torch.tensor
:param hidden_size: hidden size
:type hidden_size: int
:param row_parallel_mode: row parallel mode
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param col_parallel_mode: column parallel mode
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
r
"""Layernorm.
Args:
input_ (:class:`torch.tensor`): input matrix.
E_x (:class:`torch.tensor`): mean.
Var_x (:class:`torch.tensor`): variance.
hidden_size (int): hidden size.
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
"""
     return _Layernorm_2D.apply(input_, E_x, Var_x, hidden_size, row_parallel_mode, col_parallel_mode)
...
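Whatever the parallel layout, the arithmetic applied from the precomputed statistics is the standard normalization; a plain sketch (the ``eps`` value is assumed here for illustration):

```python
import torch

x = torch.randn(4, 8)                                  # hidden_size = 8
E_x = x.mean(dim=-1, keepdim=True)                     # mean
Var_x = x.var(dim=-1, unbiased=False, keepdim=True)    # variance
y = (x - E_x) / torch.sqrt(Var_x + 1e-5)               # normalized output
```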
@@ -748,27 +706,29 @@ class _AllGatherTensor2D(torch.autograd.Function):
 def all_gather_tensor_2d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor:
"""
All gather the tensor of 2D parallelism
:param inputs: input maxtrix
:type inputs: torch.tensor
:param dim: dimension to gather
:type dim: int
:param parallel_mode: parallel mode
:type parallel_mode: colossalai.context.parallel_mode.ParallelMode
r
"""All gather the tensor of 2D parallelism.
Args:
tensor (:class:`torch.tensor`): Input tensor.
dim (int): Dimension to gather.
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode tensor used.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
"""
     return _AllGatherTensor2D.apply(tensor, dim, parallel_mode)


 def split_tensor_2d(input_: Tensor, dim: int = 0) -> Tensor:
"""Splits 2D tensor in specified dimension across cols
:param input_: Input tensor
:param dim: Specified dimension in which to split
:type input_: torch.Tensor
:type dim: int, optional
:return output: Splitted tensor
:rtype output: torch.Tensor
"""Splits 2D tensor in specified dimension across cols.
Args:
input_ (:class:`torch.tensor`): Input tensor.
dim (int): Specified dimension in which to split.
Returns:
:class:`torch.tensor`: The tensor has been split.
"""
     if input_.size(dim) <= 1:
         return input_
...
@@ -787,11 +747,15 @@ class _ReduceTensor2D(torch.autograd.Function):
 def reduce_tensor_2d(input_: Tensor, parallel_mode: ParallelMode) -> Tensor:
"""
All-reduce the input.
:param input_: input tensor
:param parallel_mode: parallel mode
r
"""All-reduce the input.
Args:
input_ (:class:`torch.tensor`): Input tensor.
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode tensor used.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
"""
     return _ReduceTensor2D.apply(input_, parallel_mode)
...
@@ -809,12 +773,16 @@ class _ReduceScatterTensor2D(torch.autograd.Function):
 def reduce_scatter_tensor_2d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor:
"""
Reduce-scatter the input.
:param tensor: Input tensor
:param dim: Dimension to scatter
:param parallel_mode: Parallel mode
r
"""Reduce-scatter the input.
Args:
tensor (:class:`torch.tensor`): Input tensor.
dim (int): Dimension to reduce.
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode tensor used.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
"""
     return _ReduceScatterTensor2D.apply(tensor, dim, parallel_mode)
...
@@ -849,11 +817,11 @@ class _ReduceByBatch2D(torch.autograd.Function):
 def reduce_by_batch_2d(input_, reduce_mean: bool = False) -> Tensor:
"""All-reduce the input from the model parallel region.
r
"""All-reduce the input from the model parallel region.
:param input_: input maxtrix
:type
input_
:
torch.tensor
:param
reduce_mean
: If set to ``True``, it will divide the output by column parallel size, default to False
:type reduce_mean: bool, optional
Args:
input_
(:class:`
torch.tensor
`): input matrix.
reduce_mean
(bool, optional):
If set to ``True``, it will divide the output by column parallel size, default to False.
"""
     return _ReduceByBatch2D.apply(input_, reduce_mean)
colossalai/nn/layer/parallel_2d/layers.py

...
@@ -22,23 +22,22 @@ from ._utils import assert_summa_initialization, get_summa_dim_from_env
 @LAYERS.register_module
 class Linear2D(ParallelLayer):
-    """
-    Linear layer for 2D parallelism
-    :param in_features: size of each input sample
-    :type in_features: int
-    :param out_features: size of each output sample
-    :type out_features: int
-    :param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
-    :type bias: bool, optional
-    :param dtype: The dtype of parameters, defaults to None
-    :type dtype: torch.dtype, optional
-    :param skip_bias_add: If set to ``True``, it will skip bias add for linear layer, which is preserved for kernel fusion, defaults to False
-    :type skip_bias_add: bool, optional
-    :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
-    :type weight_initializer: typing.Callable, optional
-    :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
-    :type bias_initializer: typing.Callable, optional
+    r"""Linear layer for 2D parallelism.
+
+    Args:
+        in_features (int): size of each input sample.
+        out_features (int): size of each output sample.
+        bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+        skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
+            which is preserved for kernel fusion, defaults to False.
+        weight_initializer (:class:`typing.Callable`, optional):
+            The initializer of weight, defaults to kaiming uniform initializer.
+        bias_initializer (:class:`typing.Callable`, optional):
+            The initializer of bias, defaults to xavier uniform initializer.
+
+    More details about ``initializer`` please refer to
+    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
+    """
     def __init__(self,
                  in_features: int,
...
@@ -119,18 +118,16 @@ class Linear2D(ParallelLayer):
 @LAYERS.register_module
 class LayerNorm2D(ParallelLayer):
-    r"""
-    Layer Normalization for 2D parallelism
-    :param normalized_shape: input shape from an expected input of size.
-        :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]`
-        If a single integer is used, it is treated as a singleton list, and this module will
-        normalize over the last dimension which is expected to be of that specific size.
-    :type normalized_shape: int
-    :param eps: a value added to the denominator for numerical stability, defaults to 1e-05
-    :type eps: float, optional
-    :param dtype: The dtype of parameters, defaults to None
-    :type dtype: torch.dtype, optional
+    r"""Layer Normalization for 2D parallelism.
+
+    Args:
+        normalized_shape (int): input shape from an expected input of size.
+            :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
+            \times \ldots \times \text{normalized_shape}[-1]]`
+            If a single integer is used, it is treated as a singleton list, and this module will
+            normalize over the last dimension which is expected to be of that specific size.
+        eps (float, optional): a value added to the denominator for numerical stability, defaults to 1e-05.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+    """
     def __init__(self, normalized_shape: int, eps: float = 1e-05, dtype=None):
         super().__init__()
...
@@ -189,27 +186,24 @@ class LayerNorm2D(ParallelLayer):
 @LAYERS.register_module
 class PatchEmbedding2D(ParallelLayer):
-    """
-    2D Image to Patch Embedding
-    :param img_size: image size
-    :type img_size: int
-    :param patch_size: patch size
-    :type patch_size: int
-    :param in_chans: number of channels of input image
-    :type in_chans: int
-    :param embed_size: size of embedding
-    :type embed_size: int
-    :param dtype: The dtype of parameters, defaults to None
-    :type dtype: torch.dtype, optional
-    :param flatten: whether to flatten output tensor, defaults to True
-    :type flatten: bool, optional
-    :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
-    :type weight_initializer: typing.Callable, optional
-    :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
-    :type bias_initializer: typing.Callable, optional
-    :param position_embed_initializer: The intializer of position embedding, defaults to zero
-    :type position_embed_initializer: typing.Callable, optional
+    r"""2D Image to Patch Embedding.
+
+    Args:
+        img_size (int): image size.
+        patch_size (int): patch size.
+        in_chans (int): number of channels of input image.
+        embed_size (int): size of embedding.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+        flatten (bool, optional): whether to flatten output tensor, defaults to True.
+        weight_initializer (:class:`typing.Callable`, optional):
+            The initializer of weight, defaults to kaiming uniform initializer.
+        bias_initializer (:class:`typing.Callable`, optional):
+            The initializer of bias, defaults to xavier uniform initializer.
+        position_embed_initializer (:class:`typing.Callable`, optional):
+            The initializer of position embedding, defaults to zeros initializer.
+
+    More details about ``initializer`` please refer to
+    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
+    """
     def __init__(self,
                  img_size: int,
...
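The shape bookkeeping behind these parameters, with illustrative ViT-style numbers:

```python
# Illustrative numbers only (ViT-base style)
img_size, patch_size, in_chans, embed_size = 224, 16, 3, 768
num_patches = (img_size // patch_size) ** 2    # 14 * 14 = 196 patches
patch_dim = in_chans * patch_size ** 2         # 3 * 256 = 768 inputs per patch
# per image: (num_patches, embed_size) when flatten=True
```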
@@ -291,21 +285,33 @@ class PatchEmbedding2D(ParallelLayer):
 @LAYERS.register_module
 class Embedding2D(ParallelLayer):
-    """
-    Embedding for 2D parallelism
-    :param num_embeddings: number of embeddings
-    :type num_embeddings: int
-    :param embedding_dim: dimension of embedding
-    :type embedding_dim: int
-    :param padding_idx: index of padding, defaults to None
-    :type padding_idx: int, optional
-    :param dtype: The dtype of parameters, defaults to None
-    :type dtype: torch.dtype, optional
-    :param weight_initializer: The intializer of weight, defaults to normal initializer
-    :type weight_initializer: typing.Callable, optional
-    :param args: Args used in F.embedding
-    :param kwargs: Kwargs used in F.embedding
+    r"""Embedding for 2D parallelism.
+
+    Args:
+        num_embeddings (int): number of embeddings.
+        embedding_dim (int): dimension of embedding.
+        padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
+            therefore, the embedding vector at padding_idx is not updated during training,
+            i.e. it remains as a fixed “pad”, defaults to None.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+        weight_initializer (:class:`typing.Callable`, optional):
+            The initializer of weight, defaults to normal initializer.
+
+    The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
+    ::
+
+        max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
+            renormalized to have norm max_norm. Note: this will modify weight in-place.
+        norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
+        scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
+            of frequency of the words in the mini-batch. Default False.
+        sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
+
+    More details about ``args`` and ``kwargs`` could be found in
+    `Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
+
+    More details about ``initializer`` please refer to
+    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
+    """
     def __init__(self,
                  num_embeddings: int,
...
@@ -358,20 +364,33 @@ class Embedding2D(ParallelLayer):
@
LAYERS
.
register_module
class
VocabParallelEmbedding2D
(
torch
.
nn
.
Module
):
"""Embedding parallelized in the vocabulary dimension.
:param num_embeddings: number of embeddings
:type num_embeddings: int
:param embedding_dim: dimension of embedding
:type embedding_dim: int
:param padding_idx: index of padding, defaults to None
:type padding_idx: int, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to normal initializer
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding
:param kwargs: Kwargs used in F.embedding
r
"""Embedding parallelized in the vocabulary dimension.
Args:
num_embeddings (int): number of embeddings.
embedding_dim (int): dimension of embedding.
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
therefore, the embedding vector at padding_idx is not updated during training,
i.e. it remains as a fixed “pad”, defaults to None.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to normal initializer.
The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
::
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
More details about ``args`` and ``kwargs`` could be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self, num_embeddings: int,
...
...
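The vocabulary-dimension partitioning works roughly as follows; a hedged sketch, assuming ``input_ids``, a local weight shard ``local_weight``, the slice bounds, and ``F`` are in scope (this is the general Megatron-style recipe, not the class internals):

vocab_start, vocab_end = 5000, 10000                 # this rank's assumed vocab slice
out_of_range = (input_ids < vocab_start) | (input_ids >= vocab_end)
masked = input_ids.clamp(vocab_start, vocab_end - 1) - vocab_start
local_out = F.embedding(masked, local_weight)
local_out[out_of_range] = 0.0                        # an all-reduce then sums the shards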
@@ -435,23 +454,21 @@ class VocabParallelEmbedding2D(torch.nn.Module):
@LAYERS.register_module
class Classifier2D(ParallelLayer):
"""
Classifier for 2D parallelism
:param in_features: size of each input sample
:type in_features: int
:param num_classes: number of classes
:type num_classes: int
:param weight: weight of the classifier, defaults to True
:type weight: torch.nn.Parameter, optional
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
r
"""Classifier for 2D parallelism.
Args:
in_features (int): size of each input sample.
num_classes (int): number of classes.
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self, in_features: int,
...
...
@@ -515,23 +532,21 @@ class Classifier2D(ParallelLayer):
@LAYERS.register_module
class VocabParallelClassifier2D(ParallelLayer):
"""
Vocab parallel classifier layer for 2D parallelism
:param in_features: size of each input sample
:type in_features: int
:param num_classes: number of classes
:type num_classes: int
:param weight: weight of the classifier, defaults to True
:type weight: torch.nn.Parameter, optional
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
r
"""Vocab parallel classifier layer for 2D parallelism.
Args:
in_features (int): size of each input sample.
num_classes (int): number of classes.
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self, in_features: int,
...
...
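The ``weight`` argument of these classifiers lets the output projection reuse an existing parameter instead of allocating its own; a hedged usage sketch, assuming the embedding layer exposes its sharded ``weight``:

embed = Embedding2D(num_embeddings=50304, embedding_dim=1024)
head = Classifier2D(in_features=1024, num_classes=50304, weight=embed.weight)
# both modules now update the same tensor (standard weight tying)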
colossalai/nn/layer/parallel_2p5d/_operation.py
...
...
@@ -100,35 +100,26 @@ def classifier_2p5d(A: Tensor, B: Tensor, bias, tesseract_dim: int, out_shape: T
...], row_rank: int, col_rank: int, row_parallel_mode: ParallelMode, col_parallel_mode: ParallelMode, data_parallel_rank: int, pipeline_parallel_rank: int, pipeline_parallel_size: int, tensor_parallel_size: int) -> Tensor:
"""
Classifier
:param a: matrix :math:`A`
:type a: torch.tensor
:param b: matrix :math:`B`
:type b: torch.tensor
:param bias: matrix of bias
:type bias: torch.tensor, optional
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism
:type tesseract_dim: int
:param out_shape: shape of output tensor
:type out_shape: tuple
:param row_rank: the rank of row
:type row_rank: int
:param col_rank: the rank of column
:type col_rank: int
:param row_parallel_mode: row parallel mode
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param col_parallel_mode: column parallel mode
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param data_parallel_rank: data parallel rank
:type data_parallel_rank: int
:param pipeline_parallel_rank: pipeline parallel rank
:type pipeline_parallel_rank: int
:param pipeline_parallel_size: pipeline parallel size
:type pipeline_parallel_size: int
:param tensor_parallel_size: tensor parallel size
:type tensor_parallel_size: int
r
"""Classifier.
Args:
A (:class:`torch.tensor`): matrix :math:`A`.
B (:class:`torch.tensor`): matrix :math:`B`.
bias (:class:`torch.tensor`): matrix of bias.
tesseract_dim (int): dimension of TESSERACT for 2.5D parallelism.
out_shape (:class:`torch.size`): shape of output tensor.
row_rank (int): the rank of row.
col_rank (int): the rank of column.
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
data_parallel_rank (int): data parallel rank.
pipeline_parallel_rank (int): pipeline parallel rank.
pipeline_parallel_size (int): pipeline parallel size.
tensor_parallel_size (int): tensor parallel size.
Note:
The parallel_mode should be one of the modes defined in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
return _Classifier2p5D.apply(A, B, bias, tesseract_dim, out_shape, row_rank, col_rank, row_parallel_mode, col_parallel_mode, data_parallel_rank, pipeline_parallel_rank, pipeline_parallel_size,
...
...
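How the various ranks relate on the tesseract is easiest to see with concrete numbers; an illustrative sketch under an assumed row-major layout (the library's actual mapping may differ):

tesseract_dim, tesseract_dep = 2, 2                  # q = 2, d = 2 -> 8 devices
rank = 5                                             # example global rank
dep_rank = rank // (tesseract_dim * tesseract_dim)                  # -> 1
row_rank = rank % (tesseract_dim * tesseract_dim) // tesseract_dim  # -> 0
col_rank = rank % tesseract_dim                                     # -> 1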
@@ -136,35 +127,26 @@ def classifier_2p5d(A: Tensor, B: Tensor, bias, tesseract_dim: int, out_shape: T
class Matmul_AB_2p5D(torch.autograd.Function):
"""
Matrix multiplication for :math:`C = AB`
:param a: matrix :math:`A`
:type a: torch.tensor
:param b: matrix :math:`B`
:type b: torch.tensor
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism
:type tesseract_dim: int
:param out_shape: shape of output tensor
:type out_shape: tuple
:param row_rank: the rank of row
:type row_rank: int
:param col_rank: the rank of column
:type col_rank: int
:param dep_rank: the rank of depth
:type dep_rank: int
:param row_parallel_mode: row parallel mode
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param col_parallel_mode: column parallel mode
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param data_parallel_rank: data parallel rank
:type data_parallel_rank: int
:param pipeline_parallel_rank: pipeline parallel rank
:type pipeline_parallel_rank: int
:param pipeline_parallel_size: pipeline parallel size
:type pipeline_parallel_size: int
:param tensor_parallel_size: tensor parallel size
:type tensor_parallel_size: int
r
"""Matrix multiplication for :math:`C = AB`.
Args:
A (:class:`torch.tensor`): matrix :math:`A`.
B (:class:`torch.tensor`): matrix :math:`B`.
tesseract_dim (int): dimension of TESSERACT for 2.5D parallelism.
out_shape (:class:`torch.size`): shape of output tensor.
row_rank (int): the rank of row.
col_rank (int): the rank of column.
dep_rank (int): the rank of depth.
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
data_parallel_rank (int): data parallel rank.
pipeline_parallel_rank (int): pipeline parallel rank.
pipeline_parallel_size (int): pipeline parallel size.
tensor_parallel_size (int): tensor parallel size.
Note:
The parallel_mode should be one of the modes defined in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
@staticmethod
...
...
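The kernel above distributes the classic block decomposition C[i][j] = sum_k A[i][k] @ B[k][j]; a single-process check of that identity in plain PyTorch:

import torch
q = 2
A, B = torch.randn(4, 6), torch.randn(6, 8)
Ab = [list(t.chunk(q, dim=1)) for t in A.chunk(q, dim=0)]   # q x q blocks of A
Bb = [list(t.chunk(q, dim=1)) for t in B.chunk(q, dim=0)]   # q x q blocks of B
C01 = sum(Ab[0][k] @ Bb[k][1] for k in range(q))            # block C[0][1]
assert torch.allclose(C01, (A @ B).chunk(q, 0)[0].chunk(q, 1)[1], atol=1e-5)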
@@ -270,35 +252,26 @@ class Matmul_AB_2p5D(torch.autograd.Function):
class Matmul_ABT_2p5D(torch.autograd.Function):
"""
Matrix multiplication for :math:`C = AB^T`
:param a: matrix :math:`A`
:type a: torch.tensor
:param b: matrix :math:`B`
:type b: torch.tensor
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism
:type tesseract_dim: int
:param out_shape: shape of output tensor
:type out_shape: tuple
:param row_rank: the rank of row
:type row_rank: int
:param col_rank: the rank of column
:type col_rank: int
:param dep_rank: the rank of depth
:type dep_rank: int
:param row_parallel_mode: row parallel mode
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param col_parallel_mode: column parallel mode
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param data_parallel_rank: data parallel rank
:type data_parallel_rank: int
:param pipeline_parallel_rank: pipeline parallel rank
:type pipeline_parallel_rank: int
:param pipeline_parallel_size: pipeline parallel size
:type pipeline_parallel_size: int
:param tensor_parallel_size: tensor parallel size
:type tensor_parallel_size: int
r
"""Matrix multiplication for :math:`C = AB^T`.
Args:
A (:class:`torch.tensor`): matrix :math:`A`.
B (:class:`torch.tensor`): matrix :math:`B`.
tesseract_dim (int): dimension of TESSERACT for 2.5D parallelism.
out_shape (:class:`torch.size`): shape of output tensor.
row_rank (int): the rank of row.
col_rank (int): the rank of column.
dep_rank (int): the rank of depth.
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
data_parallel_rank (int): data parallel rank.
pipeline_parallel_rank (int): pipeline parallel rank.
pipeline_parallel_size (int): pipeline parallel size.
tensor_parallel_size (int): tensor parallel size.
Note:
The parallel_mode should be one of the modes defined in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
@staticmethod
...
...
@@ -409,35 +382,26 @@ class Matmul_ABT_2p5D(torch.autograd.Function):
class Matmul_ATB_2p5D(torch.autograd.Function):
"""
Matrix multiplication for :math:`C = A^TB`
:param a: matrix :math:`A`
:type a: torch.tensor
:param b: matrix :math:`B`
:type b: torch.tensor
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism
:type tesseract_dim: int
:param out_shape: shape of output tensor
:type out_shape: tuple
:param row_rank: the rank of row
:type row_rank: int
:param col_rank: the rank of column
:type col_rank: int
:param dep_rank: the rank of depth
:type dep_rank: int
:param row_parallel_mode: row parallel mode
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param col_parallel_mode: column parallel mode
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param data_parallel_rank: data parallel rank
:type data_parallel_rank: int
:param pipeline_parallel_rank: pipeline parallel rank
:type pipeline_parallel_rank: int
:param pipeline_parallel_size: pipeline parallel size
:type pipeline_parallel_size: int
:param tensor_parallel_size: tensor parallel size
:type tensor_parallel_size: int
r
"""Matrix multiplication for :math:`C = A^TB`
Args:
A (:class:`torch.tensor`): matrix :math:`A`.
B (:class:`torch.tensor`): matrix :math:`B`.
tesseract_dim (int): dimension of TESSERACT for 2.5D parallelism.
out_shape (:class:`torch.size`): shape of output tensor.
row_rank (int): the rank of row.
col_rank (int): the rank of column.
dep_rank (int): the rank of depth.
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
data_parallel_rank (int): data parallel rank.
pipeline_parallel_rank (int): pipeline parallel rank.
pipeline_parallel_size (int): pipeline parallel size.
tensor_parallel_size (int): tensor parallel size.
Note:
The parallel_mode should be one of the modes defined in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
@staticmethod
...
...
@@ -629,36 +593,27 @@ def add_bias_2p5d(input: Tensor, bias: Tensor, output_size_per_partition: int, t
col_rank: int, dep_rank: int, col_parallel_mode: ParallelMode, skip_bias_add: bool, data_parallel_rank: int, pipeline_parallel_rank: int, pipeline_parallel_size: int, tensor_parallel_size: int) -> Tensor:
"""
Matrix add bias: :math:`C = A + b`
:param input: matrix :math:`A`
:type input: torch.tensor
:param bias: matrix :math:`b`
:type bias: torch.tensor
:param output_size_per_partition: output size in each partition
:type output_size_per_partition: int
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism
:type tesseract_dim: int
:param row_rank: the rank of row
:type row_rank: int
:param col_rank: the rank of column
:type col_rank: int
:param row_parallel_mode: row parallel mode
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param col_parallel_mode: column parallel mode
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param skip_bias_add: If set to ``True``, it will skip bias add for linear layer,
which is preserved for kernel fusion
:type skip_bias_add: bool
:param data_parallel_rank: data parallel rank
:type data_parallel_rank: int
:param pipeline_parallel_rank: pipeline parallel rank
:type pipeline_parallel_rank: int
:param pipeline_parallel_size: pipeline parallel size
:type pipeline_parallel_size: int
:param tensor_parallel_size: tensor parallel size
:type tensor_parallel_size: int
r
"""Matrix add bias: :math:`C = A + b`.
Args:
input (:class:`torch.tensor`): matrix :math:`A`.
bias (:class:`torch.tensor`): matrix of bias :math:`b`.
tesseract_dim (int): dimension of TESSERACT for 2.5D parallelism.
output_size_per_partition (int): output size in each partition.
row_rank (int): the rank of row.
col_rank (int): the rank of column.
dep_rank (int): the rank of depth.
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
skip_bias_add (bool): If set to ``True``, it will skip bias add for linear layer,
which is preserved for kernel fusion.
data_parallel_rank (int): data parallel rank.
pipeline_parallel_rank (int): pipeline parallel rank.
pipeline_parallel_size (int): pipeline parallel size.
tensor_parallel_size (int): tensor parallel size.
Note:
The parallel_mode should be one of the modes defined in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
return _Add_Bias_2p5D.apply(input, bias, output_size_per_partition, tesseract_dim, row_rank, col_rank, dep_rank, col_parallel_mode, skip_bias_add, data_parallel_rank, pipeline_parallel_rank,
...
...
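What ``skip_bias_add`` buys is easiest to see outside the parallel machinery; a hedged single-process sketch of the pattern it enables:

def linear_maybe_fused(x, weight, bias, skip_bias_add=False):
    out = x @ weight.t()
    if skip_bias_add:
        return out, bias        # caller fuses the bias add into a later kernel
    return out + bias, None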
@@ -666,19 +621,18 @@ def add_bias_2p5d(input: Tensor, bias: Tensor, output_size_per_partition: int, t
class _Layernorm2p5D(torch.autograd.Function):
"""
Layernorm
:param input: input maxtrix
:type input: torch.tensor
:param E_x: mean
:type E_x: torch.tensor
:param Var_x: variance
:type Var_x: torch.tensor
:param hidden_size: hidden size
:type hidden_size: int
:param row_parallel_mode: row parallel mode
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
r
"""Layernorm.
Args:
input (:class:`torch.tensor`): input matrix.
E_x (:class:`torch.tensor`): mean.
Var_x (:class:`torch.tensor`): variance.
hidden_size (int): hidden size.
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
Note:
The parallel_mode should be one of the modes defined in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
@staticmethod
...
...
@@ -718,19 +672,18 @@ class _Layernorm2p5D(torch.autograd.Function):
def layernorm_2p5d(input: Tensor, E_x: Tensor, Var_x: Tensor, hidden_size: int, row_parallel_mode: ParallelMode) -> Tensor:
"""
Layernorm
:param input: input maxtrix
:type input: torch.tensor
:param E_x: mean
:type E_x: torch.tensor
:param Var_x: variance
:type Var_x: torch.tensor
:param hidden_size: hidden size
:type hidden_size: int
:param row_parallel_mode: row parallel mode
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
r
"""Layernorm.
Args:
input (:class:`torch.tensor`): input matrix.
E_x (:class:`torch.tensor`): mean.
Var_x (:class:`torch.tensor`): variance.
hidden_size (int): hidden size.
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
Note:
The parallel_mode should be one of the modes defined in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
return _Layernorm2p5D.apply(input, E_x, Var_x, hidden_size, row_parallel_mode)
...
...
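Given the precomputed statistics, the normalization itself is the textbook formula y = (x - E_x) / sqrt(Var_x + eps); a plain-PyTorch sketch (affine weight and bias omitted):

import torch
x = torch.randn(2, 8)
E_x = x.mean(dim=-1, keepdim=True)
Var_x = x.var(dim=-1, unbiased=False, keepdim=True)
y = (x - E_x) / torch.sqrt(Var_x + 1e-5)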
@@ -753,29 +706,31 @@ class _AllGatherTensor2p5D(torch.autograd.Function):
def all_gather_tensor_2p5d(inputs: Tensor, dim: int, col_parallel_mode: ParallelMode) -> Tensor:
"""
all gather the weight of 2.5D parallelism
:param inputs: input maxtrix
:type inputs: torch.tensor
:param dim: dimension of all gather
:type dim: int
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism
:type tesseract_dim: int
:param col_parallel_mode: column parallel mode
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
r
"""all gather the weight of 2.5D parallelism.
Args:
inputs (:class:`torch.tensor`): input tensor.
dim (int): dimension of all-gather.
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
Note:
The parallel_mode should be one of the modes defined in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
return _AllGatherTensor2p5D.apply(inputs, dim, col_parallel_mode)
class SplitFirst(torch.autograd.Function):
"""
:param inputs: input maxtrix
:type inputs: torch.tensor
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism
:type tesseract_dim: int
:param col_parallel_mode: column parallel mode
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
r
"""
Args:
inputs (:class:`torch.tensor`): input tensor.
tesseract_dim (int): dimension of TESSERACT for 2.5D parallelism.
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
Note:
The parallel_mode should be one of the modes defined in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
@staticmethod
...
...
@@ -801,16 +756,14 @@ class SplitFirst(torch.autograd.Function):
def split_tensor_2p5d(input_: Tensor, dim: int = 0) -> Tensor:
"""Splits 2P5D tensor in specified dimension across cols
"""Splits 2P5D tensor in specified dimension across cols
.
:param input_: Input tensor
:param dim: Specified dimension in which to split
Args:
input_ (:class:`torch.tensor`): Input tensor.
dim (int): Specified dimension in which to split.
:type input_: torch.Tensor
:type dim: int, optional
:return output: Splitted tensor
:rtype output: torch.Tensor
Returns:
:class:`torch.tensor`: The tensor has been split.
"""
if input_.size(dim) <= 1:
return input_
...
...
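Seen from a single process, the split is just chunking along ``dim``; a hedged sketch, assuming ``tesseract_dim`` and this rank's ``col_rank`` are in scope:

chunks = input_.chunk(tesseract_dim, dim=dim)
local = chunks[col_rank].contiguous()    # each column rank keeps one chunk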
@@ -829,11 +782,15 @@ class _ReduceTensor2p5D(torch.autograd.Function):
def reduce_tensor_2p5d(input_: Tensor, parallel_mode: ParallelMode) -> Tensor:
"""
All-reduce the input.
r
"""All-reduce the input.
:param input_: input tensor
:param parallel_mode: parallel mode
Args:
input_ (:class:`torch.tensor`): Input tensor.
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode tensor used.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
"""
return _ReduceTensor2p5D.apply(input_, parallel_mode)
...
...
@@ -851,11 +808,16 @@ class _ReduceScatterTensor2p5D(torch.autograd.Function):
def reduce_scatter_tensor_2p5d(input_: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor:
"""
Reduce-scatter the input.
r
"""Reduce-scatter the input.
:param input_: input tensor
:param parallel_mode: parallel mode
Args:
input_ (:class:`torch.tensor`): Input tensor.
dim (int): Dimension to reduce.
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode tensor used.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
"""
return _ReduceScatterTensor2p5D.apply(input_, dim, parallel_mode)
...
...
@@ -890,12 +852,11 @@ class _RreduceByBatch2p5D(torch.autograd.Function):
def reduce_by_batch_2p5d(input_, reduce_mean: bool = False) -> Tensor:
"""
All-reduce the input from the model parallel region.
r
"""All-reduce the input from the model parallel region.
:param input_: input maxtrix
:type
input_
:
torch.tensor
:param
reduce_mean
: If set to ``True``, it will divide the output by column parallel size, default to False
:type reduce_mean: bool, optional
Args:
input_
(:class:`
torch.tensor
`): input matrix.
reduce_mean
(bool, optional):
If set to ``True``, it will divide the output by column parallel size, default to False.
"""
return _RreduceByBatch2p5D.apply(input_, reduce_mean)
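In raw ``torch.distributed`` terms the ``reduce_mean`` option amounts to the following; a hedged sketch, assuming ``output`` and the column-parallel ``group`` exist:

import torch.distributed as dist
dist.all_reduce(output, op=dist.ReduceOp.SUM, group=group)
output /= dist.get_world_size(group)     # applied only when reduce_mean=True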
colossalai/nn/layer/parallel_2p5d/layers.py
...
...
@@ -23,21 +23,22 @@ from ._utils import assert_tesseract_initialization, get_tesseract_dim_dep_from_
@LAYERS.register_module
class Linear2p5D(ParallelLayer):
"""
Linear layer for 2.5D parallelism
:param in_features: size of each input sample
:type in_features: int
:param out_features: size of each output sample
:type out_features: int
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
r
"""Linear layer for 2.5D parallelism.
Args:
in_features (int): size of each input sample.
out_features (int): size of each output sample.
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
which is preserved for kernel fusion, defaults to False.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self, in_features: int,
...
...
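A hedged usage sketch of the layer (assumes ``colossalai.launch`` has set up a 2.5D tensor-parallel context, e.g. 8 GPUs with depth 2, so the tesseract process groups exist):

import torch
layer = Linear2p5D(in_features=1024, out_features=4096, bias=True)
out = layer(torch.randn(8, 1024).cuda())  # weight and bias are sharded over the tesseract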
@@ -131,19 +132,16 @@ class Linear2p5D(ParallelLayer):
@LAYERS.register_module
class LayerNorm2p5D(ParallelLayer):
r"""Layer Normalization for 2.5D parallelism.
Args:
normalized_shape (int): input shape from an expected input of size.
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
\times \ldots \times \text{normalized_shape}[-1]]`
If a single integer is used, it is treated as a singleton list, and this module will
normalize over the last dimension which is expected to be of that specific size.
eps (float, optional): a value added to the denominator for numerical stability, defaults to 1e-05.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
"""
def __init__(self, normalized_shape: int, eps: float = 1e-05, dtype=None):
super().__init__()
...
...
@@ -204,27 +202,24 @@ class LayerNorm2p5D(ParallelLayer):
@LAYERS.register_module
class PatchEmbedding2p5D(ParallelLayer):
"""
2D Image to Patch Embedding
:param img_size: image size
:type img_size: int
:param patch_size: patch size
:type patch_size: int
:param in_chans: number of channels of input image
:type in_chans: int
:param embed_size: size of embedding
:type embed_size: int
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param flatten: whether to flatten output tensor, defaults to True
:type flatten: bool, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
:param position_embed_initializer: The intializer of position embedding, defaults to zero
:type position_embed_initializer: typing.Callable, optional
r
"""2D Image to Patch Embedding.
Args:
img_size (int): image size.
patch_size (int): patch size.
in_chans (int): number of channels of input image.
embed_size (int): size of embedding.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
flatten (bool, optional): whether to flatten output tensor, defaults to True.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
position_embed_initializer (:class:`typing.Callable`, optional):
The initializer of position embedding, defaults to zeros initializer.
For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self, img_size: int,
...
...
@@ -306,21 +301,33 @@ class PatchEmbedding2p5D(ParallelLayer):
@LAYERS.register_module
class Embedding2p5D(ParallelLayer):
"""
Embedding for 2.5D parallelism
:param num_embeddings: number of embeddings
:type num_embeddings: int
:param embedding_dim: dimension of embedding
:type embedding_dim: int
:param padding_idx: index of padding, defaults to None
:type padding_idx: int, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to normal initializer
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding
:param kwargs: Kwargs used in F.embedding
r
"""Embedding for 2.5D parallelism.
Args:
num_embeddings (int): number of embeddings.
embedding_dim (int): dimension of embedding.
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
therefore, the embedding vector at padding_idx is not updated during training,
i.e. it remains as a fixed “pad”, defaults to None.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to normal initializer.
The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
::
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
More details about ``args`` and ``kwargs`` could be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self, num_embeddings: int,
...
...
@@ -376,18 +383,31 @@ class Embedding2p5D(ParallelLayer):
class VocabParallelEmbedding2p5D(torch.nn.Module):
"""Embedding parallelized in the vocabulary dimension.
:param num_embeddings: number of embeddings
:type num_embeddings: int
:param embedding_dim: dimension of embedding
:type embedding_dim: int
:param padding_idx: index of padding, defaults to None
:type padding_idx: int, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to normal initializer
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding
:param kwargs: Kwargs used in F.embedding
Args:
num_embeddings (int): number of embeddings.
embedding_dim (int): dimension of embedding.
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
therefore, the embedding vector at padding_idx is not updated during training,
i.e. it remains as a fixed “pad”, defaults to None.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to normal initializer.
The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
::
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
More details about ``args`` and ``kwargs`` could be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self, num_embeddings: int,
...
...
@@ -455,23 +475,21 @@ class VocabParallelEmbedding2p5D(torch.nn.Module):
@LAYERS.register_module
class Classifier2p5D(ParallelLayer):
"""
Classifier for 2.5D parallelism
:param in_features: size of each input sample
:type in_features: int
:param num_classes: number of classes
:type num_classes: int
:param weight: weight of the classifier, defaults to True
:type weight: torch.nn.Parameter, optional
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
r
"""Classifier for 2.5D parallelism.
Args:
in_features (int): size of each input sample.
num_classes (int): number of classes.
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self, in_features: int,
...
...
@@ -537,23 +555,21 @@ class Classifier2p5D(ParallelLayer):
@LAYERS.register_module
class VocabParallelClassifier2p5D(ParallelLayer):
"""
Vocab parallel classifier layer for 2.5D parallelism
:param in_features: size of each input sample
:type in_features: int
:param num_classes: number of classes
:type num_classes: int
:param weight: weight of the classifier, defaults to True
:type weight: torch.nn.Parameter, optional
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
r
"""Vocab parallel classifier layer for 2.5D parallelism.
Args:
in_features (int): size of each input sample.
num_classes (int): number of classes.
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self, in_features: int,
...
...
colossalai/nn/layer/parallel_3d/_operation.py
...
...
@@ -88,27 +88,22 @@ def linear_3d(input_: Tensor,
input_dim: int = 0, weight_dim: int = -1, output_dim: int = 0) -> Tensor:
"""
Linear layer for 3D parallelism
:param input_: matrix of input
:type input_: torch.tensor
:param weight: matrix of weight
:type weight: torch.tensor
:param bias: matrix of bias
:type bias: torch.tensor, optional
:param input_parallel_mode: input parallel mode
:type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param weight_parallel_mode: weight parallel mode
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param output_parallel_mode: output parallel mode
:type output_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param input_dim: dimension of input, defaults to 0
:type input_dim: int, optional
:param weight_dim: dimension of weight, defaults to -1
:type weight_dim: int, optional
:param output_dim: dimension of output, defaults to 0
:type output_dim: int, optional
r
"""Linear layer for 3D parallelism.
Args:
input_ (:class:`torch.tensor`): input matrix.
weight (:class:`torch.tensor`): matrix of weight.
bias (:class:`torch.tensor`): matrix of bias.
input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode.
output_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): output parallel mode.
input_dim (int, optional): dimension of input, defaults to 0.
weight_dim (int, optional): dimension of weight, defaults to -1.
output_dim (int, optional): dimension of output, defaults to 0.
Note:
The parallel_mode should be one of the modes defined in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
return _Linear3D.apply(input_, weight, bias, input_parallel_mode, weight_parallel_mode, output_parallel_mode, input_dim, weight_dim, output_dim)
...
...
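The shape bookkeeping for the 3D linear, written out for a cube of q^3 devices (illustrative values; the kernel gathers along the matching modes so that the local product is well-formed):

import torch
q, b, h, o = 2, 8, 16, 32
local_x = torch.randn(b // q, h // q)   # each device's input shard
local_w = torch.randn(h // q, o // q)   # each device's weight shard
# after the gathers, the local output shard has shape (b // q, o // q)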
@@ -174,21 +169,19 @@ class _Classifier3D(torch.autograd.Function):
def classifier_3d(input_: Tensor, weight: Tensor, bias: Optional[Tensor], input_parallel_mode: ParallelMode, weight_parallel_mode: ParallelMode, output_parallel_mode: ParallelMode) -> Tensor:
"""
3D parallel classifier
:param input_: matrix of input
:type input_: torch.tensor
:param weight: matrix of weight
:type weight: torch.tensor
:param bias: matrix of bias
:type bias: torch.tensor, optional
:param input_parallel_mode: input parallel mode
:type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param weight_parallel_mode: weight parallel mode
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param output_parallel_mode: output parallel mode
:type output_parallel_mode: colossalai.context.parallel_mode.ParallelMode
r
"""3D parallel classifier.
Args:
input_ (:class:`torch.tensor`): input matrix.
weight (:class:`torch.tensor`): matrix of weight.
bias (:class:`torch.tensor`): matrix of bias.
input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode.
output_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): output parallel mode.
Note:
The parallel_mode should be one of the modes defined in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
return _Classifier3D.apply(input_, weight, bias, input_parallel_mode, weight_parallel_mode, output_parallel_mode)
...
...
@@ -244,48 +237,44 @@ class _Layernorm3D(torch.autograd.Function):
def layernorm_3d(input_: Tensor, weight: Tensor, bias: Tensor, normalized_shape: int, eps: float, input_parallel_mode: ParallelMode, weight_parallel_mode: ParallelMode, output_parallel_mode: ParallelMode) -> Tensor:
r"""3D parallel Layernorm.
Args:
input_ (:class:`torch.tensor`): input matrix.
weight (:class:`torch.tensor`): matrix of weight.
bias (:class:`torch.tensor`): matrix of bias.
normalized_shape (int): input shape from an expected input of size.
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
\times \ldots \times \text{normalized_shape}[-1]]`
If a single integer is used, it is treated as a singleton list, and this module will
normalize over the last dimension which is expected to be of that specific size.
eps (float): a value added to the denominator for numerical stability.
input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode.
output_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): output parallel mode.
Note:
The parallel_mode should be one of the modes defined in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
return _Layernorm3D.apply(input_, weight, bias, normalized_shape, eps, input_parallel_mode, weight_parallel_mode, output_parallel_mode)
def split_tensor_3d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor:
"""Splits 3D parallel tensor in specified dimension
r
"""Splits 3D parallel tensor in specified dimension
.
:param tensor: Input tensor
:param dim: Specified dimension in which to split
:param parallel_mode: Parallel mode
:param weight_parallel_mode: Weight p
arallel mode
Args:
tensor (:class:`torch.tensor`): Input tensor.
dim (int): Specified dimension in which to split.
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`, optional): P
arallel mode
.
:type tensor: torch.Tensor
:type dim: int
:type parallel_mode: colossalai.context.parallel_mode.ParallelMode
Returns:
:class:`torch.tensor`: The tensor has been split.
:return output: Splitted tensor
:rtype output: torch.Tensor
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
if tensor.size(dim) <= 1:
return tensor
...
...
@@ -298,17 +287,20 @@ def split_batch_3d(input_: Tensor,
dim: int = 0, input_parallel_mode: ParallelMode = ParallelMode.PARALLEL_3D_INPUT, weight_parallel_mode: ParallelMode = ParallelMode.PARALLEL_3D_WEIGHT) -> Tensor:
"""Splits 3D tensor in batch
:param input_: Input tensor
:param dim: Specified dimension in which to split
:param input_parallel_mode: Input parallel mode
:param weight_parallel_mode: Weight parallel mode
:type input_: torch.Tensor
:type dim: int, optional
:type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode, optional
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode, optional
:return output: Splitted tensor
:rtype output: torch.Tensor
r
"""Splits 3D tensor in batch.
Args:
input_ (:class:`torch.tensor`): Input tensor.
dim (int): Specified dimension in which to split.
input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`, optional): input parallel mode.
weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`, optional): weight parallel mode.
Returns:
:class:`torch.tensor`: The split tensor.
Note:
The parallel_mode should be one of the modes defined in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
if input_.size(dim) <= 1:
return input_
...
...
@@ -333,11 +325,15 @@ class _ReduceTensor3D(torch.autograd.Function):
def reduce_tensor_3d(tensor: Tensor, parallel_mode: ParallelMode) -> Tensor:
"""
All-reduce the input
r
"""All-reduce the input
Args:
tensor (:class:`torch.tensor`): Input tensor.
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): Parallel mode.
:param tensor: Input tensor
:param parallel_mode: Parallel mode
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
return _ReduceTensor3D.apply(tensor, parallel_mode)
...
...
@@ -358,11 +354,16 @@ class _AllGatherTensor3D(torch.autograd.Function):
def all_gather_tensor_3d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor:
"""
All-reduce the gradient in backward pass.
r
"""All-reduce the gradient in backward pass.
Args:
tensor (:class:`torch.tensor`): Input tensor.
dim (int): Dimension to gather.
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): Parallel mode.
:param tensor: Input tensor
:param parallel_mode: Parallel mode
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
return _AllGatherTensor3D.apply(tensor, dim, parallel_mode)
...
...
@@ -382,12 +383,16 @@ class _ReduceScatterTensor3D(torch.autograd.Function):
def reduce_scatter_tensor_3d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor:
"""
Reduce-scatter the input.
r
"""Reduce-scatter the input.
Args:
tensor (:class:`torch.tensor`): Input tensor.
dim (int): Dimension to scatter.
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): Parallel mode.
:param tensor: Input tensor
:param dim: Dimension to scatter
:param
parallel_mode
: P
arallel
mode
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `
parallel_mode
<https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/p
arallel
_
mode
.py>`_
"""
return _ReduceScatterTensor3D.apply(tensor, dim, parallel_mode)
...
...
@@ -423,34 +428,33 @@ def reduce_by_batch_3d(tensor: Tensor,
input_parallel_mode: ParallelMode, weight_parallel_mode: ParallelMode, reduce_mean: bool = False) -> Tensor:
"""
All-reduce the input from the model parallel region.
:param input_: input maxtrix
:type input_: torch.tensor
:param input_parallel_mode: input parallel mode
:type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param weight_parallel_mode: weight parallel mode
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param reduce_mean: If set to ``True``, it will divide the output by (input parallel size * weight parallel size),
default to False
:type reduce_mean: int, optional
r
"""All-reduce the input from the model parallel region.
Args:
input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode.
reduce_mean (bool, optional): If set to ``True``, it will divide the output by
(input parallel size * weight parallel size), default to False.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
"""
return _ReduceByBatch3D.apply(tensor, input_parallel_mode, weight_parallel_mode, reduce_mean)
class _BroadcastWeight3D_FromDiagonal(torch.autograd.Function):
"""
broadcast weight from diagonal
:param
input_: input ma
x
trix
:type
input_
: torch.tensor
:param input_parallel_mode: inpu
t parallel mode
:type in
put_parallel_mode
:
colossalai.context.parallel_mode.ParallelMode
:param weight_parallel_mode: weight parallel mode
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param weight_p
arallel
_m
ode
: output parallel mode
:type weight_parallel_mode: colossalai.
context
.
parallel_mode.
ParallelMode
r
"""
broadcast weight from diagonal.
Args:
input_
(:class:`torch.tensor`)
: input matrix
.
input_
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weigh
t parallel mode
.
out
put_parallel_mode
(:class:`
colossalai.context.parallel_mode.ParallelMode
`): output parallel mode.
Note:
The parallel_mode should be concluded in ``P
arallel
M
ode
``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/
context
/
parallel_mode.
py>`_
"""
@staticmethod
...
...
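A hedged sketch of the "broadcast from diagonal" idea with raw ``torch.distributed`` (``diagonal_src`` is a hypothetical helper; the real class derives the source rank from the 3D process groups):

import torch.distributed as dist
src = diagonal_src(input_rank)                       # hypothetical: rank whose input coord == weight coord
dist.broadcast(weight, src=src, group=output_group)  # output_group assumed to exist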
colossalai/nn/layer/parallel_3d/layers.py
...
...
@@ -24,19 +24,16 @@ from ._utils import get_depth_from_env, get_last_group, get_parallel_mode_from_e
@LAYERS.register_module
class LayerNorm3D(ParallelLayer):
r"""Layer Normalization for 3D parallelism.
Args:
normalized_shape (int): input shape from an expected input of size.
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
\times \ldots \times \text{normalized_shape}[-1]]`
If a single integer is used, it is treated as a singleton list, and this module will
normalize over the last dimension which is expected to be of that specific size.
eps (float, optional): a value added to the denominator for numerical stability, defaults to 1e-12.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
"""
def __init__(self, normalized_shape: int, eps: float = 1e-12, dtype=None):
...
...
@@ -71,21 +68,20 @@ class LayerNorm3D(ParallelLayer):
@LAYERS.register_module
class Linear3D(ParallelLayer):
"""
Linear layer for 3D parallelism
:param in_features: size of each input sample
:type in_features: int
:param out_features: size of each output sample
:type out_features: int
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
r
"""Linear layer for 3D parallelism.
Args:
in_features (int): size of each input sample.
out_features (int): size of each output sample.
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self,
...
...
@@ -146,23 +142,21 @@ class Linear3D(ParallelLayer):
@LAYERS.register_module
class Classifier3D(ParallelLayer):
"""
Classifier for 3D parallelism
:param in_features: size of each input sample
:type in_features: int
:param num_classes: number of classes
:type num_classes: int
:param weight: weight of the classifier, defaults to True
:type weight: torch.nn.Parameter, optional
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
r
"""Classifier for 3D parallelism.
Args:
in_features (int): size of each input sample.
num_classes (int): number of classes.
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self,
...
...
@@ -225,23 +219,21 @@ class Classifier3D(ParallelLayer):
@LAYERS.register_module
class VocabParallelClassifier3D(ParallelLayer):
"""
Vocab parallel classifier layer for 2D parallelism
:param in_features: size of each input sample
:type in_features: int
:param num_classes: number of classes
:type num_classes: int
:param weight: weight of the classifier, defaults to True
:type weight: torch.nn.Parameter, optional
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
r"""Vocab parallel classifier layer for 3D parallelism.
Args:
in_features (int): size of each input sample.
num_classes (int): number of classes.
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self,
...
...
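Construction is analogous to Classifier3D; per the docstring, the classifier weight is additionally split along the vocabulary dimension. A sketch, under the same assumptions:

from colossalai.nn.layer.parallel_3d.layers import VocabParallelClassifier3D

head = VocabParallelClassifier3D(in_features=768, num_classes=50304, bias=False)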
@@ -311,27 +303,24 @@ class VocabParallelClassifier3D(ParallelLayer):
@LAYERS.register_module
class PatchEmbedding3D(ParallelLayer):
"""
2D Image to Patch Embedding
:param img_size: image size
:type img_size: int
:param patch_size: patch size
:type patch_size: int
:param in_chans: number of channels of input image
:type in_chans: int
:param embed_size: size of embedding
:type embed_size: int
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param flatten: whether to flatten output tensor, defaults to True
:type flatten: bool, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
:param position_embed_initializer: The intializer of position embedding, defaults to zero
:type position_embed_initializer: typing.Callable, optional
r"""2D Image to Patch Embedding.
Args:
img_size (int): image size.
patch_size (int): patch size.
in_chans (int): number of channels of input image.
embed_size (int): size of embedding.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
flatten (bool, optional): whether to flatten output tensor, defaults to True.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
position_embed_initializer (:class:`typing.Callable`, optional):
The initializer of position embedding, defaults to zeros initializer.
For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self,
...
...
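A ViT-style construction sketch (illustrative sizes; 3D parallel context assumed):

from colossalai.nn.layer.parallel_3d.layers import PatchEmbedding3D

# 224x224 RGB images cut into 16x16 patches, embedded into 768 dimensions
embed = PatchEmbedding3D(img_size=224, patch_size=16, in_chans=3, embed_size=768)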
@@ -419,21 +408,33 @@ class PatchEmbedding3D(ParallelLayer):
@LAYERS.register_module
class Embedding3D(ParallelLayer):
"""
Embedding for 3D parallelism
:param num_embeddings: number of embeddings
:type num_embeddings: int
:param embedding_dim: dimension of embedding
:type embedding_dim: int
:param padding_idx: index of padding, defaults to None
:type padding_idx: int, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to normal initializer
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding
:param kwargs: Kwargs used in F.embedding
r"""Embedding for 3D parallelism.
Args:
num_embeddings (int): number of embeddings.
embedding_dim (int): dimension of embedding.
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
therefore, the embedding vector at padding_idx is not updated during training,
i.e. it remains as a fixed “pad”, defaults to None.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to normal initializer.
The ``args`` and ``kwargs`` used in :func:`torch.nn.functional.embedding` should contain:
::
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
More details about ``args`` and ``kwargs`` can be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self,
...
...
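A lookup sketch using only parameters documented above (sizes illustrative; 3D parallel context assumed):

import torch
from colossalai.nn.layer.parallel_3d.layers import Embedding3D

emb = Embedding3D(num_embeddings=50304, embedding_dim=768, padding_idx=0)
tokens = torch.randint(0, 50304, (8, 512)).cuda()   # (batch, seq_len) token indices
hidden = emb(tokens)                                # embedding lookup, sharded across the 3D mesh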
@@ -491,20 +492,33 @@ class Embedding3D(ParallelLayer):
@LAYERS.register_module
class VocabParallelEmbedding3D(torch.nn.Module):
"""Embedding parallelized in the vocabulary dimension.
:param num_embeddings: number of embeddings
:type num_embeddings: int
:param embedding_dim: dimension of embedding
:type embedding_dim: int
:param padding_idx: index of padding, defaults to None
:type padding_idx: int, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to normal initializer
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding
:param kwargs: Kwargs used in F.embedding
r"""Embedding parallelized in the vocabulary dimension.
Args:
num_embeddings (int): number of embeddings.
embedding_dim (int): dimension of embedding.
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
therefore, the embedding vector at padding_idx is not updated during training,
i.e. it remains as a fixed “pad”, defaults to None.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to normal initializer.
The ``args`` and ``kwargs`` used in :func:`torch.nn.functional.embedding` should contain:
::
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
More details about ``args`` and ``kwargs`` can be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self,
...
...
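The API mirrors Embedding3D; the difference is that the vocabulary itself is partitioned across ranks. A one-line sketch under the same assumptions:

from colossalai.nn.layer.parallel_3d.layers import VocabParallelEmbedding3D

emb = VocabParallelEmbedding3D(num_embeddings=50304, embedding_dim=768)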
colossalai/nn/layer/parallel_sequence/layers.py
...
...
@@ -24,14 +24,13 @@ class TransformerSelfAttentionRing(nn.Module):
Self-attention layer takes input with size [b, s, h]
and returns output of the same size.
:param hidden_size: hidden size
:type hidden_size: int
:param kv_channels: channels of key/value tensor
:type kv_channels: int
:param num_attention_heads: number of attention heads
:type num_attention_heads: int
:param attention_dropout: dropout probability for attention layer
:type attention_dropout: float
Args:
hidden_size (int): hidden size.
num_attention_heads (int): number of attention heads.
attention_dropout (float): dropout probability for attention layer.
attention_mask_func (:class:`typing.Callable`): Mask function to be applied.
layer_number (int): number of layers.
"""
def __init__(self,
...
...
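A construction sketch using the keyword names from the Args list above; the exact signature (argument order, any parameters elided by this diff) should be treated as an assumption, and an initialized sequence-parallel context is required:

from colossalai.nn.layer.parallel_sequence.layers import TransformerSelfAttentionRing

def mask_func(attention_scores, attention_mask):
    # hypothetical mask function: suppress masked positions before softmax
    return attention_scores.masked_fill(attention_mask, -10000.0)

attn = TransformerSelfAttentionRing(hidden_size=1024, num_attention_heads=16,
                                    attention_dropout=0.1,
                                    attention_mask_func=mask_func, layer_number=1)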
colossalai/nn/layer/utils/common.py
...
...
@@ -38,11 +38,16 @@ class CheckpointModule(nn.Module):
def divide(numerator, denominator):
"""Only allow exact division
"""Only allow exact division
.
:param numerator: Numerator of the division
:param denominator: Denominator of the division
Args:
numerator (int): Numerator of the division.
denominator (int): Denominator of the division.
Returns:
int: the result of exact division.
"""
assert denominator != 0, 'denominator can not be zero'
assert numerator % denominator == 0, \
    '{} is not divisible by {}'.format(numerator, denominator)
return numerator // denominator
...
...
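The helper is self-contained, so its behavior is easy to pin down:

from colossalai.nn.layer.utils.common import divide

heads_per_rank = divide(16, 4)   # -> 4
divide(16, 5)                    # AssertionError: 16 is not divisible by 5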
colossalai/nn/layer/vanilla/layers.py
...
...
@@ -15,11 +15,16 @@ from ..utils import to_2tuple
def drop_path(x, drop_prob: float = 0., training: bool = False):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
'survival rate' as the argument.
Args:
drop_prob (float, optional): probability of dropping path, defaults to 0.0.
training (bool, optional): whether in training mode, defaults to False.
"""
if drop_prob == 0. or not training:
    return x
...
...
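A quick sketch of the functional form (the DropPath module below is a thin wrapper around it):

import torch
from colossalai.nn.layer.vanilla.layers import drop_path

x = torch.randn(4, 197, 768)                    # (batch, tokens, hidden)
y = drop_path(x, drop_prob=0.1, training=True)  # drops whole samples, rescales survivors by 1/(1-p)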
@@ -35,6 +40,9 @@ class DropPath(nn.Module):
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
Adapted from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py
Args:
drop_prob (float, optional): probability of dropping path, defaults to None.
"""
def __init__(self, drop_prob=None):
...
...
@@ -46,7 +54,19 @@ class DropPath(nn.Module):
class WrappedDropout(nn.Module):
"""Same as torch.nn.Dropout. But it is wrapped with the context of seed manager.
r"""Same as torch.nn.Dropout. But it is wrapped with the context of seed manager. During training, randomly zeroes
some elements of the input tensor with probability p using samples from a Bernoulli distribution. Each
channel will be zeroed out independently on every forward call. Furthermore, the outputs are scaled by a factor of
1/(1-p) during training. This means that during evaluation the module simply computes an identity function.
Args:
p (float, optional): probability of an element to be zeroed, defaults to 0.5.
inplace (bool, optional): whether to do dropout in-place, defaults to False.
mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
Note:
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
def __init__(self, p: float = 0.5, inplace: bool = False, mode=None):
...
...
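A construction sketch; ParallelMode.TENSOR is an illustrative choice, and the seed manager must already hold a seed for that mode (set up by colossalai.launch):

from colossalai.context import ParallelMode
from colossalai.nn.layer.vanilla.layers import WrappedDropout

# dropout whose RNG state comes from the seed manager for the given mode
drop = WrappedDropout(p=0.1, mode=ParallelMode.TENSOR)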
@@ -74,8 +94,16 @@ class WrappedDropout(nn.Module):
class WrappedDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
r"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
Here, it is wrapped with the context of seed manager.
Args:
p (float, optional): probability of dropping path, defaults to 0.0.
mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
Note:
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
def __init__(self, p: float = 0., mode=None):
...
...
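The same pattern applies here, with stochastic depth instead of element dropout (same assumptions as the WrappedDropout sketch):

from colossalai.context import ParallelMode
from colossalai.nn.layer.vanilla.layers import WrappedDropPath

stochastic_depth = WrappedDropPath(p=0.1, mode=ParallelMode.TENSOR)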
@@ -101,27 +129,25 @@ class WrappedDropPath(nn.Module):
@LAYERS.register_module
class VanillaPatchEmbedding(nn.Module):
"""
r"""
2D Image to Patch Embedding
:param img_size: image size
:type img_size: int
:param patch_size: patch size
:type patch_size: int
:param in_chans: number of channels of input image
:type in_chans: int
:param embed_size: size of embedding
:type embed_size: int
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param flatten: whether to flatten output tensor, defaults to True
:type flatten: bool, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
:param position_embed_initializer: The intializer of position embedding, defaults to zero
:type position_embed_initializer: typing.Callable, optional
Args:
img_size (int): image size.
patch_size (int): patch size.
in_chans (int): number of channels of input image.
embed_size (int): size of embedding.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
flatten (bool, optional): whether to flatten output tensor, defaults to True.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
position_embed_initializer (:class:`typing.Callable`, optional):
The initializer of position embedding, defaults to zeros initializer.
For more details about initializer, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self,
...
...
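Unlike the 3D variant, this layer needs no parallel context, so a sketch can run stand-alone (sizes illustrative):

import torch
from colossalai.nn.layer.vanilla.layers import VanillaPatchEmbedding

embed = VanillaPatchEmbedding(img_size=224, patch_size=16, in_chans=3, embed_size=768)
patches = embed(torch.randn(2, 3, 224, 224))   # output shape depends on flatten and any cls/position handling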
@@ -174,23 +200,21 @@ class VanillaPatchEmbedding(nn.Module):
@LAYERS.register_module
class VanillaClassifier(nn.Module):
"""
Dense linear classifier
:param in_features: size of each input sample
:type in_features: int
:param num_classes: number of classes
:type num_classes: int
:param weight: weight of the classifier, defaults to True
:type weight: torch.nn.Parameter, optional
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
r"""Dense linear classifier.
Args:
in_features (int): size of each input sample.
num_classes (int): number of classes.
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
For more details about initializer, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self,
...
...
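A stand-alone sketch (sizes illustrative):

import torch
from colossalai.nn.layer.vanilla.layers import VanillaClassifier

clf = VanillaClassifier(in_features=768, num_classes=1000)
logits = clf(torch.randn(8, 768))   # (8, 1000)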
colossalai/nn/layer/wrapper/lambda_wrapper.py
...
...
@@ -9,12 +9,11 @@ from colossalai.registry import LAYERS
@LAYERS.register_module
class LambdaWrapper(nn.Module):
"""Wrap a function to nn.Module, which takes a config of layers and can fully access them
"""Wrap a function to nn.Module, which takes a config of layers and can fully access them
.
:param func: User customed function
:type func: Callable
:param layers_cfg: Config of layers, defaults to None
:type layers_cfg: dict, optional
Args:
func (``Callable``): User-defined function.
layers_cfg (dict, optional): Config of layers, defaults to None.
"""
def __init__(self, func, layers_cfg: dict = None):
...
...
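A construction sketch only; how ``func`` is invoked at forward time is left to the implementation, so no call is shown:

from colossalai.nn.layer.wrapper.lambda_wrapper import LambdaWrapper

def my_func(*args, **kwargs):
    ...   # placeholder user function

wrapper = LambdaWrapper(my_func, layers_cfg=None)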
colossalai/nn/loss/loss_1d.py
...
...
@@ -86,12 +86,10 @@ class _VocabParallelCrossEntropy1D(torch.autograd.Function):
@LOSSES.register_module
class VocabParallelCrossEntropyLoss1D(_Loss):
"""
Vocab parallel cross entropy loss for 1D parallelism
:param reduction: whether to average the loss, defaults to True
"""Vocab parallel cross entropy loss for 1D parallelism.
:type reduction: bool, optional
Args:
reduction (bool, optional): whether to average the loss, defaults to True.
"""
def __init__(self, reduction=True):
...
...
@@ -99,10 +97,11 @@ class VocabParallelCrossEntropyLoss1D(_Loss):
self.reduction_mean = reduction
def forward(self, logits, targets):
"""Calculate loss between logits and targets
"""Calculate loss between logits and targets
.
:param logits: Output logits of model
:param targets: True targets from data
Args:
logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
"""
loss = _VocabParallelCrossEntropy1D.apply(logits, targets)
if self.reduction_mean:
...
...
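A usage sketch; the logits are assumed to come from a 1D vocab-parallel output layer inside an initialized parallel context:

from colossalai.nn.loss.loss_1d import VocabParallelCrossEntropyLoss1D

criterion = VocabParallelCrossEntropyLoss1D(reduction=True)
# `logits` must be the vocab-sharded output of the 1D-parallel head;
# `targets` are ordinary class indices over the full vocabulary
loss = criterion(logits, targets)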
colossalai/nn/loss/loss_2d.py
...
...
@@ -13,14 +13,22 @@ from torch.nn.modules.loss import _Loss
@LOSSES.register_module
class CrossEntropyLoss2D(_Loss):
"""
Cross entropy loss for 2D parallelism
r
"""Cross entropy loss for 2D parallelism
Args:
reduction (bool, optional): whether to average the loss, defaults to True.
The ``args`` and ``kwargs`` should include parameters below:
::
:param reduction: whether to average the loss, defaults to True
:param args: Args for loss function
:param kwargs: Kwargs for loss function
weight (Tensor, optional)
size_average (bool, optional)
ignore_index (int, optional)
reduce (bool, optional)
label_smoothing (float, optional)
:type reduction: bool, optional
More details about args, kwargs and torch.nn.functional.cross_entropy could be found in
`Cross_entropy <https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html#torch.nn.functional.cross_entropy>`_.
"""
def __init__(self, reduction=True, *args, **kwargs):
...
...
@@ -31,10 +39,14 @@ class CrossEntropyLoss2D(_Loss):
self.loss_kwargs = kwargs
def forward(self, logits, targets):
"""Calculate loss between logits and targets
"""Calculate loss between logits and targets
.
:param logits: Output logits of model
:param targets: True targets from data
Args:
logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
Returns:
float: the loss between logits and targets.
"""
targets = split_tensor_2d(targets)
loss = cross_entropy(logits, targets, reduction='none', *self.loss_args, **self.loss_kwargs)
...
...
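A sketch showing how extra kwargs flow through to torch.nn.functional.cross_entropy (a 2D parallel context is assumed at forward time):

from colossalai.nn.loss.loss_2d import CrossEntropyLoss2D

# label_smoothing is forwarded to torch.nn.functional.cross_entropy
criterion = CrossEntropyLoss2D(reduction=True, label_smoothing=0.1)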
@@ -116,12 +128,10 @@ class _VocabParallelCrossEntropy2D(torch.autograd.Function):
@LOSSES.register_module
class VocabParallelCrossEntropyLoss2D(_Loss):
"""
Vocab parallel cross entropy loss for 2D parallelism
:param reduction: whether to average the loss, defaults to True
"""Vocab parallel cross entropy loss for 2D parallelism.
:type reduction: bool, optional
Args:
reduction (bool, optional): whether to average the loss, defaults to True.
"""
def __init__(self, reduction=True):
...
...
@@ -129,10 +139,11 @@ class VocabParallelCrossEntropyLoss2D(_Loss):
self.reduction_mean = reduction
def forward(self, logits, targets):
"""Calculate loss between logits and targets
"""Calculate loss between logits and targets
.
:param logits: Output logits of model
:param targets: True targets from data
Args:
logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
"""
targets = split_tensor_2d(targets)
loss = _VocabParallelCrossEntropy2D.apply(
...
...
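As with the 1D variant, construction is a one-liner; the inputs must come from a 2D-parallel model:

from colossalai.nn.loss.loss_2d import VocabParallelCrossEntropyLoss2D

criterion = VocabParallelCrossEntropyLoss2D()   # averages the loss by default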