"extensions/csrc/vscode:/vscode.git/clone" did not exist on "8823cc483193699a7369df3a01f1b4e2fd02cb0d"
Commit ec5086c4 authored by Liang Bowen, committed by アマデウス

Refactored docstrings to Google style

parent 53b1b6e3
......@@ -31,22 +31,35 @@ _vocab_parallel_classifier = {
class Linear(nn.Module):
"""
Linear layer of colossalai
:param in_features: size of each input sample
:type in_features: int
:param out_features: size of each output sample
:type out_features: int
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
:param kwargs: Kwargs used for particular parallelisms
"""Linear layer of colossalai.
Args:
in_features (int): size of each input sample.
out_features (int): size of each output sample.
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
Note: ``kwargs`` contains different parameters depending on the parallelism in use.
The ``kwargs`` should contain the parameters below:
::
Linear1D:
gather_output: bool (optional, defaults to False)
skip_bias_add: bool (optional, defaults to False)
Linear2D:
skip_bias_add: bool (optional, defaults to False)
Linear2p5D:
skip_bias_add: bool (optional, defaults to False)
Linear3D:
None
More details about ``initializer`` please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self,
......@@ -88,21 +101,21 @@ class Linear(nn.Module):
class Classifier(nn.Module):
"""
Classifier layer of colossalai
:param in_features: size of each input sample
:type in_features: int
:param num_classes: number of total classes for the dataset
:type num_classes: int
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
"""Classifier layer of colossalai.
Args:
in_features (int): size of each input sample.
num_classes (int): number of classes.
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
More details about ``initializer`` please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self,
......
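For orientation, a minimal usage sketch of the two wrapper layers documented above (a sketch only: it assumes a Colossal-AI parallel context has already been launched and that ``Linear`` and ``Classifier`` are exported from ``colossalai.nn``):
import torch
import colossalai.nn as col_nn

# The wrappers dispatch to the 1D/2D/2.5D/3D implementations according to
# the tensor-parallel mode configured at launch time.
linear = col_nn.Linear(in_features=512, out_features=1024, bias=True)
classifier = col_nn.Classifier(in_features=1024, num_classes=10)

x = torch.randn(8, 512)           # (batch, in_features), illustrative shapes
logits = classifier(linear(x))    # (batch, num_classes)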
......@@ -19,18 +19,15 @@ _parallel_layernorm = {
class LayerNorm(nn.Module):
r"""
Layer Normalization for colossalai
:param normalized_shape: input shape from an expected input
of size. :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]`
If a single integer is used, it is treated as a singleton list, and this module will
normalize over the last dimension which is expected to be of that specific size.
:type normalized_shape: int
:param eps: a value added to the denominator for numerical stability, defaults to 1e-05
:type eps: float, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
r"""Layer Normalization for colossalai.
Args:
normalized_shape (int): input shape from an expected input of size.
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]`
If a single integer is used, it is treated as a singleton list, and this module will
normalize over the last dimension which is expected to be of that specific size.
eps (float, optional): a value added to the denominator for numerical stability, defaults to 1e-05
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
"""
def __init__(self, normalized_shape: int, eps=1e-05, dtype=None) -> None:
......
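A correspondingly small sketch of the ``LayerNorm`` constructor shown above (same assumptions as the previous sketch):
import torch
import colossalai.nn as col_nn

norm = col_nn.LayerNorm(normalized_shape=768, eps=1e-5)
x = torch.randn(4, 16, 768)   # normalized over the last dimension
y = norm(x)                   # same shape as x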
......@@ -28,11 +28,10 @@ class Experts(MoeExperts):
moe model parallel group, where E is the number of experts. Every expert
is an instance of the class 'expert' given in the initialization parameters.
:param expert: The class of all experts
:param num_experts: The number of experts
:param expert_args: Args used to initialize experts
:type num_experts: int
Args:
expert_cls (:class:`torch.nn.Module`): The class of all experts
num_experts (int): The number of experts
expert_args: Args used to initialize experts; these args can be found in the corresponding expert class.
"""
def __init__(self, expert_cls: Type[nn.Module], num_experts: int, **expert_args):
......
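To illustrate how ``expert_args`` is forwarded to every expert, a hedged sketch (the import path and the ``FFNExpert`` class are assumptions made for illustration):
import torch.nn as nn
from colossalai.nn.layer.moe import Experts   # assumed module path

class FFNExpert(nn.Module):
    """A toy feed-forward expert; any nn.Module subclass would do."""
    def __init__(self, d_model: int, d_ff: int):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(d_model, d_ff), nn.GELU(), nn.Linear(d_ff, d_model))

    def forward(self, x):
        return self.net(x)

# d_model and d_ff are collected into **expert_args and passed to each FFNExpert.
experts = Experts(expert_cls=FFNExpert, num_experts=8, d_model=512, d_ff=2048)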
......@@ -18,19 +18,13 @@ class Top1Router(nn.Module):
for routing usage. A more detailed description of its function can be found in Google's paper on the Switch Transformer.
:param capacity_factor_train: Capacity factor in routing during training
:param capacity_factor_eval: Capacity factor in routing during evaluation
:param min_capacity: The minimum number of the capacity of each expert
:param select_policy: The policy about tokens selection
:param noisy_func: Noisy function used in logits
:param drop_tks: Whether drops tokens in evaluation
:type capacity_factor_train: float, optional
:type capacity_factor_eval: float, optional
:type min_capacity: int, optional
:type select_policy: str, optional
:type noisy_func: Callable, optional
:type drop_tks: bool, optional
Args:
capacity_factor_train (float, optional): Capacity factor in routing during training.
capacity_factor_eval (float, optional): Capacity factor in routing during evaluation.
min_capacity (int, optional): The minimum capacity of each expert.
select_policy (str, optional): The policy for token selection.
noisy_func (:class:`typing.Callable`, optional): Noisy function used in logits.
drop_tks (bool, optional): Whether to drop tokens during evaluation.
"""
def __init__(self,
......@@ -119,17 +113,12 @@ class Top2Router(nn.Module):
"""Top2 router that returns the dispatch mask [s, e, c] and combine weight [s, e, c]
for routing usage. A more detailed description of its function can be found in the ViT-MoE paper.
:param capacity_factor_train: Capacity factor in routing during training
:param capacity_factor_eval: Capacity factor in routing during evaluation
:param min_capacity: The minimum number of the capacity of each expert
:param noisy_func: Noisy function used in logits
:param drop_tks: Whether drops tokens in evaluation
:type capacity_factor_train: float, optional
:type capacity_factor_eval: float, optional
:type min_capacity: int, optional
:type noisy_func: Callable, optional
:type drop_tks: bool, optional
Args:
capacity_factor_train (float, optional): Capacity factor in routing during training.
capacity_factor_eval (float, optional): Capacity factor in routing during evaluation.
min_capacity (int, optional): The minimum capacity of each expert.
noisy_func (:class:`typing.Callable`, optional): Noisy function used in logits.
drop_tks (bool, optional): Whether to drop tokens during evaluation.
"""
def __init__(self,
......@@ -239,15 +228,11 @@ class MoeLayer(nn.Module):
the moe tensor group by all-to-all communication. Then it will get the output of all
experts and exchange the output. Finally, it returns the output of the moe system.
:param dim_model: Dimension of model
:param num_experts: The number of experts
:param router: Instance of router used in routing
:param experts: Instance of experts generated by Expert
:type dim_model: int
:type num_experts: int
:type router: nn.Module
:type experts: nn.Module
Args:
dim_model (int): Dimension of model.
num_experts (int): The number of experts.
router (:class:`torch.nn.Module`): Instance of router used in routing.
experts (:class:`torch.nn.Module`): Instance of experts generated by Expert.
"""
def __init__(self, dim_model: int, num_experts: int, router: nn.Module, experts: MoeExperts):
......
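Putting the routing pieces together, a hedged sketch of wiring a router and experts into a ``MoeLayer`` (import paths and hyperparameters are illustrative; ``FFNExpert`` is the toy expert from the sketch above):
from colossalai.nn.layer.moe import Experts, Top2Router, MoeLayer   # assumed paths

router = Top2Router(capacity_factor_train=1.25, capacity_factor_eval=2.0, min_capacity=4)
experts = Experts(expert_cls=FFNExpert, num_experts=8, d_model=512, d_ff=2048)

# dim_model must match the hidden size of the incoming tokens; the layer
# dispatches tokens to experts by all-to-all and combines their outputs.
moe = MoeLayer(dim_model=512, num_experts=8, router=router, experts=experts)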
......@@ -16,8 +16,8 @@ class NormalNoiseGenerator:
All noise is generated from a normal distribution (0, 1 / E^2), where
E = the number of experts.
:param num_experts: The number of experts
:type num_experts: int
Args:
num_experts (int): The number of experts.
"""
def __init__(self, num_experts: int):
......@@ -37,8 +37,8 @@ class UniformNoiseGenerator:
Makes models more resilient to rounding errors introduced by bfloat16.
This seems particularly important for logits.
:param eps: Epsilon in generator
:type eps: float
Args:
eps (float, optional): Epsilon in generator, defaults to 1e-2.
"""
def __init__(self, eps: float = 1e-2):
......
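The generators above are meant to be passed to a router as its ``noisy_func``; a brief hedged sketch (assumed import path):
from colossalai.nn.layer.moe import NormalNoiseGenerator, UniformNoiseGenerator, Top1Router   # assumed path

noisy = NormalNoiseGenerator(num_experts=8)   # noise drawn from N(0, 1 / E^2)
# noisy = UniformNoiseGenerator(eps=1e-2)     # multiplicative uniform noise instead
router = Top1Router(capacity_factor_train=1.25, noisy_func=noisy)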
......@@ -7,17 +7,17 @@ except:
class FusedLayerNormAffineFunction1D(torch.autograd.Function):
r"""
Layernorm
r"""Layernorm
:param input: input maxtrix
:param weight: weight matrix
:param bias: bias matrix
:param normalized_shape: input shape from an expected input
of size. :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]`
If a single integer is used, it is treated as a singleton list, and this module will
normalize over the last dimension which is expected to be of that specific size.
:param eps: a value added to the denominator for numerical stability
Args:
input: input matrix.
weight: weight matrix.
bias: bias matrix.
normalized_shape: input shape from an expected input of size.
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]`
If a single integer is used, it is treated as a singleton list, and this module will
normalize over the last dimension which is expected to be of that specific size.
eps: a value added to the denominator for numerical stability
"""
@staticmethod
......
......@@ -78,8 +78,9 @@ class _ReduceGrad(torch.autograd.Function):
"""
Pass the input to the model parallel region.
:param input_: input matrix
:param parallel_mode: parallel mode
Args:
input_: input matrix.
parallel_mode: parallel mode.
"""
@staticmethod
......@@ -99,9 +100,10 @@ class _ReduceGrad(torch.autograd.Function):
class _ReduceInput(torch.autograd.Function):
"""
All-reduce the input from the model parallel region.
:param input_: input matrix
:param parallel_mode: parallel mode
Args:
input_: input matrix.
parallel_mode: parallel mode.
"""
@staticmethod
......@@ -121,9 +123,10 @@ class _SplitForwardGatherBackward(torch.autograd.Function):
"""
Split the input and keep only the corresponding chuck to the rank.
:param input_: input matrix
:param parallel_mode: parallel mode
:param dim: dimension
Args:
input_: input matrix.
parallel_mode: parallel mode.
dim: dimension
"""
@staticmethod
......@@ -142,12 +145,12 @@ class _SplitForwardGatherBackward(torch.autograd.Function):
class _GatherForwardSplitBackward(torch.autograd.Function):
"""
Gather the input from model parallel region and concatinate.
:param input_: input matrix
:param parallel_mode: parallel mode
:param dim: dimension
"""Gather the input from model parallel region and concatenate.
Args:
input_: input matrix.
parallel_mode: parallel mode.
dim: dimension
"""
@staticmethod
......
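The helpers above follow the usual pattern of a ``torch.autograd.Function`` whose forward and backward perform mirrored operations; a generic, self-contained sketch of that pattern (a single-process stand-in, not the library's actual implementation):
import torch

class GatherForwardSplitBackward(torch.autograd.Function):
    """Gather along `dim` in forward; hand each rank its slice of the gradient in backward."""

    @staticmethod
    def forward(ctx, input_, dim, world_size):
        ctx.dim, ctx.world_size = dim, world_size
        # Stand-in for an all-gather across the tensor-parallel group.
        return torch.cat([input_] * world_size, dim=dim)

    @staticmethod
    def backward(ctx, grad_output):
        chunks = grad_output.chunk(ctx.world_size, dim=ctx.dim)
        # In the real implementation each rank keeps the chunk matching its rank index.
        return chunks[0], None, None

y = GatherForwardSplitBackward.apply(torch.randn(2, 4), 1, 2)   # toy invocation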
......@@ -24,24 +24,23 @@ from ._utils import (gather_forward_split_backward, get_parallel_input, reduce_g
@LAYERS.register_module
class Linear1D(torch.nn.Module):
"""
Linear layer for 1D parallelism
:param in_features: size of each input sample
:type in_features: int
:param out_features: size of each output sample
:type out_features: int
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param skip_bias_add: If set to ``True``, it will skip bias add for linear layer,
which is preserved for kernel fusion, defaults to False
:type skip_bias_add: bool, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
r"""Linear layer for 1D parallelism.
Args:
in_features (int): size of each input sample.
out_features (int): size of each output sample.
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
gather_output (bool, optional): Whether to call all-gather on output, defaults to False.
skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
which is preserved for kernel fusion, defaults to False
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
More details about ``initializer`` please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self,
......@@ -88,23 +87,21 @@ class Linear1D(torch.nn.Module):
@LAYERS.register_module
class Classifier1D(ParallelLayer):
"""RowLinear with given weight
Classifier of 1D parallelism
:param in_features: size of input features
:type in_features: int
:param num_classes: number of classes in the dataset
:type num_classes: int
:param weight: weight of the classifier, defaults to True
:type weight: torch.nn.Parameter, optional
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
r"""RowLinear with given weight. Classifier of 1D parallelism.
Args:
in_features (int): size of each input sample.
num_classes (int): number of classes.
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
More details about ``initializer`` please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self,
......@@ -171,23 +168,21 @@ class Classifier1D(ParallelLayer):
@LAYERS.register_module
class VocabParallelClassifier1D(ParallelLayer):
"""ColLinear with given weight
Classifier of 1D parallelism
:param in_features: size of input features
:type in_features: int
:param num_classes: number of classes in the dataset
:type num_classes: int
:param weight: weight of the classifier, defaults to True
:type weight: torch.nn.Parameter, optional
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
r"""ColLinear with given weight. Classifier of 1D parallelism.
Args:
in_features (int): size of each input sample.
num_classes (int): number of classes.
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
More details about ``initializer`` please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self,
......@@ -249,30 +244,28 @@ class VocabParallelClassifier1D(ParallelLayer):
@LAYERS.register_module
class Linear1D_Col(ParallelLayer):
"""Linear layer with column parallelism.
r"""Linear layer with column parallelism.
The linear layer is defined as :math:`Y = XA + b`. A is parallelized along
its second dimension as :math:`A = [A_1, ..., A_p]`.
:param in_features: first dimension of matrix A.
:type in_features: int
:param output_size: second dimension of matrix A.
:type output_size: int
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param gather_output: If true, call all-gether on output and make Y avaiable
Args:
in_features (int): size of each input sample.
out_features (int): size of each output sample.
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
gather_output (bool, optional): If true, call all-gather on output and make Y available
to all GPUs, otherwise, every GPU will have its output
which is :math:`Y_i = XA_i`, defaults to False
:type gather_output: bool, optional
:param skip_bias_add: If set to ``True``, it will skip bias add for linear layer,
which is preserved for kernel fusion, defaults to False
:type skip_bias_add: bool, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
which is preserved for kernel fusion, defaults to False.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
More details about ``initializer`` please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self,
......@@ -343,25 +336,23 @@ class Linear1D_Col(ParallelLayer):
@LAYERS.register_module
class Linear1D_Row(ParallelLayer):
""" Linear layer with row parallelism
:param in_features: size of each input sample
:type in_features: int
:param out_features: size of each output sample
:type out_features: int
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param parallel_input: If set to ``True``, it's assumed that the input is splitted, defaults to False
:type parallel_input: bool, optional
:param skip_bias_add: If set to ``True``, it will skip bias add for linear layer,
which is preserved for kernel fusion, defaults to False
:type skip_bias_add: bool, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
r""" Linear layer with row parallelism
Args:
in_features (int): size of each input sample.
out_features (int): size of each output sample.
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
parallel_input (bool, optional): If set to ``True``, it's assumed that the input is split, defaults to False.
skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
which is preserved for kernel fusion, defaults to False.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
More details about ``initializer`` please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self,
......@@ -432,21 +423,33 @@ class Linear1D_Row(ParallelLayer):
@LAYERS.register_module
class Embedding1D(ParallelLayer):
"""
Embedding for 1D parallelism
:param num_embeddings: number of embeddings
:type num_embeddings: int
:param embedding_dim: dimension of embedding
:type embedding_dim: int
:param padding_idx: index of padding, defaults to None
:type padding_idx: int, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to normal initializer
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding
:param kwargs: Kwargs used in F.embedding
r"""Embedding for 1D parallelism.
Args:
num_embeddings (int): number of embeddings.
embedding_dim (int): dimension of embedding.
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
therefore, the embedding vector at padding_idx is not updated during training,
i.e. it remains as a fixed “pad”, defaults to None.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to normal initializer.
The ``args`` and ``kwargs`` used in :func:`torch.nn.functional.embedding` should contain:
::
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
More details about ``args`` and ``kwargs`` could be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
More details about ``initializer`` please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_
"""
def __init__(self,
......@@ -499,20 +502,33 @@ class Embedding1D(ParallelLayer):
@LAYERS.register_module
class VocabParallelEmbedding1D(torch.nn.Module):
"""Embedding parallelized in the vocabulary dimension.
:param num_embeddings: number of embeddings
:type num_embeddings: int
:param embedding_dim: dimension of embedding
:type embedding_dim: int
:param padding_idx: index of padding, defaults to None
:type padding_idx: int, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to normal initializer
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding
:param kwargs: Kwargs used in F.embedding
r"""Embedding parallelized in the vocabulary dimension.
Args:
num_embeddings (int): number of embeddings.
embedding_dim (int): dimension of embedding.
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
therefore, the embedding vector at padding_idx is not updated during training,
i.e. it remains as a fixed “pad”, defaults to None.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to normal initializer.
The ``args`` and ``kwargs`` used in :func:`torch.nn.functional.embedding` should contain:
::
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
More details about ``args`` and ``kwargs`` could be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
More details about initializer please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self,
......@@ -578,13 +594,11 @@ class VocabParallelEmbedding1D(torch.nn.Module):
@LAYERS.register_module
class Dropout1D(ParallelLayer):
"""
Dropout layer of 1D parallelism
"""Dropout layer of 1D parallelism.
:param p: dropout rate, defaults to 0.5
:type p: float, optional
:param inplace: If set to ``True``, will do this operation in-place, defaults tp ``False``
:type inplace: bool, optional
Args:
p (float, optional): probability of an element to be zeroed, defaults to 0.5.
inplace (bool, optional): whether to do dropout in-place, defaults to False.
"""
def __init__(self, p: float = 0.5, inplace: bool = False):
......
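As a rough sketch of how the column- and row-parallel 1D layers above are usually chained in an MLP block (assumed import path; a 1D tensor-parallel context must already be initialized):
import torch
import torch.nn.functional as F
from colossalai.nn.layer.parallel_1d import Linear1D_Col, Linear1D_Row   # assumed path

# Keep the intermediate activation partitioned (gather_output=False) so the
# row-parallel layer can consume it directly (parallel_input=True).
fc1 = Linear1D_Col(in_features=1024, out_features=4096, gather_output=False)
fc2 = Linear1D_Row(in_features=4096, out_features=1024, parallel_input=True)

x = torch.randn(8, 1024)
y = fc2(F.gelu(fc1(x)))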
......@@ -22,23 +22,22 @@ from ._utils import assert_summa_initialization, get_summa_dim_from_env
@LAYERS.register_module
class Linear2D(ParallelLayer):
"""
Linear layer for 2D parallelism
:param in_features: size of each input sample
:type in_features: int
:param out_features: size of each output sample
:type out_features: int
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param skip_bias_add: If set to ``True``, it will skip bias add for linear layer, which is preserved for kernel fusion, defaults to False
:type skip_bias_add: bool, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
r"""Linear layer for 2D parallelism
Args:
in_features (int): size of each input sample.
out_features (int): size of each output sample.
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
which is preserved for kernel fusion, defaults to False.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
More details about ``initializer`` please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self,
in_features: int,
......@@ -119,18 +118,16 @@ class Linear2D(ParallelLayer):
@LAYERS.register_module
class LayerNorm2D(ParallelLayer):
r"""
Layer Normalization for 2D parallelism
:param normalized_shape: input shape from an expected input
of size. :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]`
If a single integer is used, it is treated as a singleton list, and this module will
normalize over the last dimension which is expected to be of that specific size.
:type normalized_shape: int
:param eps: a value added to the denominator for numerical stability, defaults to 1e-05
:type eps: float, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
r"""Layer Normalization for 2D parallelism.
Args:
normalized_shape (int): input shape from an expected input of size.
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
\times \ldots \times \text{normalized_shape}[-1]]`
If a single integer is used, it is treated as a singleton list, and this module will
normalize over the last dimension which is expected to be of that specific size.
eps (float, optional): a value added to the denominator for numerical stability, defaults to 1e-05.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
"""
def __init__(self, normalized_shape: int, eps: float = 1e-05, dtype=None):
super().__init__()
......@@ -189,27 +186,24 @@ class LayerNorm2D(ParallelLayer):
@LAYERS.register_module
class PatchEmbedding2D(ParallelLayer):
"""
2D Image to Patch Embedding
:param img_size: image size
:type img_size: int
:param patch_size: patch size
:type patch_size: int
:param in_chans: number of channels of input image
:type in_chans: int
:param embed_size: size of embedding
:type embed_size: int
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param flatten: whether to flatten output tensor, defaults to True
:type flatten: bool, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
:param position_embed_initializer: The intializer of position embedding, defaults to zero
:type position_embed_initializer: typing.Callable, optional
r"""2D Image to Patch Embedding.
Args:
img_size (int): image size.
patch_size (int): patch size.
in_chans (int): number of channels of input image.
embed_size (int): size of embedding.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
flatten (bool, optional): whether to flatten output tensor, defaults to True.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
position_embed_initializer (:class:`typing.Callable`, optional):
The initializer of position embedding, defaults to zeros initializer.
More details about ``initializer`` please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self,
img_size: int,
......@@ -291,21 +285,33 @@ class PatchEmbedding2D(ParallelLayer):
@LAYERS.register_module
class Embedding2D(ParallelLayer):
"""
Embedding for 2D parallelism
:param num_embeddings: number of embeddings
:type num_embeddings: int
:param embedding_dim: dimension of embedding
:type embedding_dim: int
:param padding_idx: index of padding, defaults to None
:type padding_idx: int, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to normal initializer
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding
:param kwargs: Kwargs used in F.embedding
r"""Embedding for 2D parallelism.
Args:
num_embeddings (int): number of embeddings.
embedding_dim (int): dimension of embedding.
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
therefore, the embedding vector at padding_idx is not updated during training,
i.e. it remains as a fixed “pad”, defaults to None.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to normal initializer.
The ``args`` and ``kwargs`` used in :func:`torch.nn.functional.embedding` should contain:
::
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
More details about ``args`` and ``kwargs`` could be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
More details about initializer please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_
"""
def __init__(self,
num_embeddings: int,
......@@ -358,20 +364,33 @@ class Embedding2D(ParallelLayer):
@LAYERS.register_module
class VocabParallelEmbedding2D(torch.nn.Module):
"""Embedding parallelized in the vocabulary dimension.
:param num_embeddings: number of embeddings
:type num_embeddings: int
:param embedding_dim: dimension of embedding
:type embedding_dim: int
:param padding_idx: index of padding, defaults to None
:type padding_idx: int, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to normal initializer
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding
:param kwargs: Kwargs used in F.embedding
r"""Embedding parallelized in the vocabulary dimension.
Args:
num_embeddings (int): number of embeddings.
embedding_dim (int): dimension of embedding.
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
therefore, the embedding vector at padding_idx is not updated during training,
i.e. it remains as a fixed “pad”, defaults to None.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to normal initializer.
The ``args`` and ``kwargs`` used in :func:`torch.nn.functional.embedding` should contain:
::
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
More details about ``args`` and ``kwargs`` could be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
More details about initializer please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self,
num_embeddings: int,
......@@ -435,23 +454,21 @@ class VocabParallelEmbedding2D(torch.nn.Module):
@LAYERS.register_module
class Classifier2D(ParallelLayer):
"""
Classifier for 2D parallelism
:param in_features: size of each input sample
:type in_features: int
:param num_classes: number of classes
:type num_classes: int
:param weight: weight of the classifier, defaults to True
:type weight: torch.nn.Parameter, optional
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
r"""Classifier for 2D parallelism.
Args:
in_features (int): size of each input sample.
num_classes (int): number of classes.
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
More details about ``initializer`` please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self,
in_features: int,
......@@ -515,23 +532,21 @@ class Classifier2D(ParallelLayer):
@LAYERS.register_module
class VocabParallelClassifier2D(ParallelLayer):
"""
Vocab parallel classifier layer for 2D parallelism
:param in_features: size of each input sample
:type in_features: int
:param num_classes: number of classes
:type num_classes: int
:param weight: weight of the classifier, defaults to True
:type weight: torch.nn.Parameter, optional
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
r"""Vocab parallel classifier layer for 2D parallelism.
Args:
in_features (int): size of each input sample.
num_classes (int): number of classes.
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
More details about ``initializer`` please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self,
in_features: int,
......
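A hedged sketch of the 2D patch-embedding front end documented above (assumed import path; a 2D tensor-parallel grid must already be initialized, and shapes are illustrative):
import torch
from colossalai.nn.layer.parallel_2d import PatchEmbedding2D, Linear2D   # assumed paths

patch_embed = PatchEmbedding2D(img_size=224, patch_size=16, in_chans=3, embed_size=768)
proj = Linear2D(in_features=768, out_features=768)

imgs = torch.randn(4, 3, 224, 224)   # (batch, channels, height, width)
tokens = patch_embed(imgs)           # flattened patch tokens with position embedding added
out = proj(tokens)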
......@@ -23,21 +23,22 @@ from ._utils import assert_tesseract_initialization, get_tesseract_dim_dep_from_
@LAYERS.register_module
class Linear2p5D(ParallelLayer):
"""
Linear layer for 2.5D parallelism
:param in_features: size of each input sample
:type in_features: int
:param out_features: size of each output sample
:type out_features: int
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
r"""Linear layer for 2.5D parallelism.
Args:
in_features (int): size of each input sample.
out_features (int): size of each output sample.
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
which is preserved for kernel fusion, defaults to False.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
More details about ``initializer`` please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self,
in_features: int,
......@@ -131,19 +132,16 @@ class Linear2p5D(ParallelLayer):
@LAYERS.register_module
class LayerNorm2p5D(ParallelLayer):
r"""
Layer Normalization for 2.5D parallelism
:param normalized_shape: input shape from an expected input of size.
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
\times \ldots \times \text{normalized_shape}[-1]]`
If a single integer is used, it is treated as a singleton list, and this module will
normalize over the last dimension which is expected to be of that specific size.
:type normalized_shape: int
:param eps: a value added to the denominator for numerical stability, defaults to 1e-05
:type eps: float, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
r"""Layer Normalization for 2.5D parallelism.
Args:
normalized_shape (int): input shape from an expected input of size.
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
\times \ldots \times \text{normalized_shape}[-1]]`
If a single integer is used, it is treated as a singleton list, and this module will
normalize over the last dimension which is expected to be of that specific size.
eps (float, optional): a value added to the denominator for numerical stability, defaults to 1e-05.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
"""
def __init__(self, normalized_shape: int, eps: float = 1e-05, dtype=None):
super().__init__()
......@@ -204,27 +202,24 @@ class LayerNorm2p5D(ParallelLayer):
@LAYERS.register_module
class PatchEmbedding2p5D(ParallelLayer):
"""
2D Image to Patch Embedding
:param img_size: image size
:type img_size: int
:param patch_size: patch size
:type patch_size: int
:param in_chans: number of channels of input image
:type in_chans: int
:param embed_size: size of embedding
:type embed_size: int
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param flatten: whether to flatten output tensor, defaults to True
:type flatten: bool, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
:param position_embed_initializer: The intializer of position embedding, defaults to zero
:type position_embed_initializer: typing.Callable, optional
r"""2D Image to Patch Embedding.
Args:
img_size (int): image size.
patch_size (int): patch size.
in_chans (int): number of channels of input image.
embed_size (int): size of embedding.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
flatten (bool, optional): whether to flatten output tensor, defaults to True.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
position_embed_initializer (:class:`typing.Callable`, optional):
The initializer of position embedding, defaults to zeros initializer.
More details about ``initializer`` please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self,
img_size: int,
......@@ -306,21 +301,33 @@ class PatchEmbedding2p5D(ParallelLayer):
@LAYERS.register_module
class Embedding2p5D(ParallelLayer):
"""
Embedding for 2.5D parallelism
:param num_embeddings: number of embeddings
:type num_embeddings: int
:param embedding_dim: dimension of embedding
:type embedding_dim: int
:param padding_idx: index of padding, defaults to None
:type padding_idx: int, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to normal initializer
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding
:param kwargs: Kwargs used in F.embedding
r"""Embedding for 2.5D parallelism.
Args:
num_embeddings (int): number of embeddings.
embedding_dim (int): dimension of embedding.
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
therefore, the embedding vector at padding_idx is not updated during training,
i.e. it remains as a fixed “pad”, defaults to None.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to normal initializer.
The ``args`` and ``kwargs`` used in :func:`torch.nn.functional.embedding` should contain:
::
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
More details about ``args`` and ``kwargs`` could be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
More details about initializer please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_
"""
def __init__(self,
num_embeddings: int,
......@@ -376,18 +383,31 @@ class Embedding2p5D(ParallelLayer):
class VocabParallelEmbedding2p5D(torch.nn.Module):
"""Embedding parallelized in the vocabulary dimension.
:param num_embeddings: number of embeddings
:type num_embeddings: int
:param embedding_dim: dimension of embedding
:type embedding_dim: int
:param padding_idx: index of padding, defaults to None
:type padding_idx: int, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to normal initializer
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding
:param kwargs: Kwargs used in F.embedding
Args:
num_embeddings (int): number of embeddings.
embedding_dim (int): dimension of embedding.
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
therefore, the embedding vector at padding_idx is not updated during training,
i.e. it remains as a fixed “pad”, defaults to None.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to normal initializer.
The ``args`` and ``kwargs`` used in :func:`torch.nn.functional.embedding` should contain:
::
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
More details about ``args`` and ``kwargs`` could be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
More details about initializer please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self,
num_embeddings: int,
......@@ -455,23 +475,21 @@ class VocabParallelEmbedding2p5D(torch.nn.Module):
@LAYERS.register_module
class Classifier2p5D(ParallelLayer):
"""
Classifier for 2.5D parallelism
:param in_features: size of each input sample
:type in_features: int
:param num_classes: number of classes
:type num_classes: int
:param weight: weight of the classifier, defaults to True
:type weight: torch.nn.Parameter, optional
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
r"""Classifier for 2.5D parallelism.
Args:
in_features (int): size of each input sample.
num_classes (int): number of classes.
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
More details about ``initializer`` please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self,
in_features: int,
......@@ -537,23 +555,21 @@ class Classifier2p5D(ParallelLayer):
@LAYERS.register_module
class VocabParallelClassifier2p5D(ParallelLayer):
"""
Vocab parallel classifier layer for 2.5D parallelism
:param in_features: size of each input sample
:type in_features: int
:param num_classes: number of classes
:type num_classes: int
:param weight: weight of the classifier, defaults to True
:type weight: torch.nn.Parameter, optional
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
r"""Vocab parallel classifier layer for 2.5D parallelism.
Args:
in_features (int): size of each input sample.
num_classes (int): number of classes.
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
More details about ``initializer`` please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self,
in_features: int,
......
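The 2.5D layers expose the same argument names as their 2D counterparts; a brief hedged sketch (assumed import path; a 2.5D tesseract parallel context must already be initialized):
from colossalai.nn.layer.parallel_2p5d import Linear2p5D, Classifier2p5D   # assumed paths

hidden = Linear2p5D(in_features=768, out_features=3072)
head = Classifier2p5D(in_features=768, num_classes=1000)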
This diff is collapsed.
......@@ -24,14 +24,13 @@ class TransformerSelfAttentionRing(nn.Module):
Self-attention layer takes input with size [b, s, h]
and returns output of the same size.
:param hidden_size: hidden size
:type hidden_size: int
:param kv_channels: channels of key/value tensor
:type kv_channels: int
:param num_attention_heads: number of attention heads
:type num_attention_heads: int
:param attention_dropout: dropout probability for attention layer
:type attention_dropout: float
Args:
hidden_size (int): hidden size.
num_attention_heads (int): number of attention heads.
attention_dropout (float): dropout probability for attention layer.
attention_mask_func (:class:`typing.Callable`): Mask function to be applied.
layer_number (int): number of layers.
"""
def __init__(self,
......
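A hedged construction sketch for the ring self-attention layer above (argument names follow the docstring; the import path, the mask function, and the values are illustrative, the real constructor may accept further parameters, and sequence parallelism must already be initialized):
from colossalai.nn.layer.parallel_sequence import TransformerSelfAttentionRing   # assumed path

def attention_mask_func(attention_scores, attention_mask):
    # Illustrative: push masked positions toward -inf before the softmax.
    return attention_scores.masked_fill(attention_mask, -10000.0)

attn = TransformerSelfAttentionRing(hidden_size=1024,
                                    num_attention_heads=16,
                                    attention_dropout=0.1,
                                    attention_mask_func=attention_mask_func,
                                    layer_number=1)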
......@@ -38,11 +38,16 @@ class CheckpointModule(nn.Module):
def divide(numerator, denominator):
"""Only allow exact division
"""Only allow exact division.
:param numerator: Numerator of the division
:param denominator: Denominator of the division
Args:
numerator (int): Numerator of the division.
denominator (int): Denominator of the division.
Returns:
int: the result of exact division.
"""
assert denominator != 0, 'denominator can not be zero'
assert numerator % denominator == 0, \
'{} is not divisible by {}'.format(numerator, denominator)
return numerator // denominator
......
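For example, ``divide`` only succeeds when the division is exact:
divide(512, 8)   # -> 64
divide(10, 3)    # raises AssertionError: 10 is not divisible by 3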
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.