@@ -139,6 +140,7 @@ class BCEWithLogitsLoss(Layer):
class CrossEntropyLoss(Layer):
r"""
By default, this operator implements the cross entropy loss function with softmax. It
combines the calculation of the softmax operation and the cross entropy loss function
to provide a more numerically stable computation.
...
...
@@ -251,60 +253,35 @@ class CrossEntropyLoss(Layer):
Parameters:
weight (Tensor, optional): a manual rescaling weight given to each class.
If given, has to be a Tensor of size C and the data type is float32, float64.
Default is ``None`` .
ignore_index (int64, optional): Specifies a target value that is ignored
and does not contribute to the loss. A negative value means that no label
value needs to be ignored. Only valid when soft_label = False.
Default is ``-100`` .
reduction (str, optional): Indicate how to average the loss by batch_size,
the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned.
If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
Default is ``'mean'``.
soft_label (bool, optional): Indicate whether label is soft.
If soft_label=False, the label is hard. If soft_label=True, the label is soft.
Default is ``False``.
axis (int, optional): The index of dimension to perform softmax calculations.
It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the number
of dimensions of input :attr:`input`.
Default is ``-1`` .
use_softmax (bool, optional): Indicate whether to compute softmax before cross_entropy.
Default is ``True``.
name (str, optional): The name of the operator. Default is ``None`` .
For more information, please refer to :ref:`api_guide_Name` .
Shape:
- **input** (Tensor), the data type is float32, float64. Shape is
:math:`[N_1, N_2, ..., N_k, C]`, where C is the number of classes and ``k >= 1`` .
Note:
1. When use_softmax=True, it expects unscaled logits. This operator should not be used with the
...
...
@@ -312,7 +289,6 @@ class CrossEntropyLoss(Layer):
2. When use_softmax=False, it expects the output of the softmax operator.
- **label** (Tensor)
1. If soft_label=False, the shape is
...
...
@@ -322,15 +298,10 @@ class CrossEntropyLoss(Layer):
2. If soft_label=True, the shape and data type should be same with ``input`` ,
and the sum of the labels for each sample should be 1.
- **output** (Tensor), the softmax cross_entropy loss of ``input`` and ``label``.
The data type is the same as input.
If :attr:`reduction` is ``'mean'`` or ``'sum'`` , the dimension of return value is ``1``.
If :attr:`reduction` is ``'none'``:
1. If soft_label = False, the dimension of the return value is the same as ``label`` .
...
...
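For concreteness, a minimal usage sketch of the interface documented above (hedged: it assumes
hard labels with the default ``use_softmax=True`` and ``reduction='mean'``):

.. code-block:: python

    import paddle

    # Hard labels: input holds unscaled logits of shape [N, C],
    # label holds int64 class indices of shape [N].
    ce = paddle.nn.CrossEntropyLoss(reduction='mean')  # softmax is applied internally
    logits = paddle.randn([4, 5])                      # N=4 samples, C=5 classes
    labels = paddle.to_tensor([0, 2, 1, 4])            # class indices in [0, C)
    loss = ce(logits, labels)                          # reduced mean loss over the batch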
@@ -634,6 +605,7 @@ class MSELoss(Layer):
class L1Loss(Layer):
r"""
Construct a callable object of the ``L1Loss`` class.
The L1Loss layer calculates the L1 Loss of ``input`` and ``label`` as follows.
...
...
@@ -663,11 +635,11 @@ class L1Loss(Layer):
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Shape:
- input (Tensor): The input tensor. The shape is ``[N, *]``, where N is batch size and `*` means any number of additional dimensions. Its data type should be float32, float64, int32, int64.
- label (Tensor): The label tensor. The shape is ``[N, *]``, the same shape as ``input`` . Its data type should be float32, float64, int32, int64.
- output (Tensor): The L1 Loss of ``input`` and ``label``.
If `reduction` is ``'none'``, the shape of output loss is ``[N, *]``, the same as ``input`` .
If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is ``[1]``.
Examples:
.. code-block:: python
...
...
@@ -692,6 +664,7 @@ class L1Loss(Layer):
print(output)
# [[0.20000005 0.19999999]
# [0.2 0.79999995]]
"""
def __init__(self, reduction='mean', name=None):
...
...
@@ -712,6 +685,7 @@ class L1Loss(Layer):
class BCELoss(Layer):
"""
This interface is used to construct a callable object of the ``BCELoss`` class.
The BCELoss layer measures the binary_cross_entropy loss between input predictions ``input``
and target labels ``label`` . The binary_cross_entropy loss can be described as:
...
...
@@ -755,14 +729,14 @@ class BCELoss(Layer):
For more information, please refer to :ref:`api_guide_Name`.
Shape:
- input (Tensor): 2-D tensor with shape: ``[N, *]``, N is batch_size, `*` means
number of additional dimensions. The input ``input`` should always
be the output of sigmoid. Available dtype is float32, float64.
- label (Tensor): 2-D tensor with the same shape as ``input``. The values
of the target labels should be numbers between 0 and 1. Available
dtype is float32, float64.
- output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
same as ``input`` , else the shape of output is scalar.
Returns:
A callable object of BCELoss.
...
...
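As a hedged usage sketch of the shapes documented above (``input`` must already be sigmoid
probabilities, not raw logits):

.. code-block:: python

    import paddle

    bce = paddle.nn.BCELoss()                      # reduction defaults to 'mean'
    logits = paddle.randn([3, 1])
    probs = paddle.nn.functional.sigmoid(logits)   # BCELoss expects probabilities
    label = paddle.to_tensor([[1.0], [0.0], [1.0]])
    loss = bce(probs, label)                       # mean binary cross entropy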
@@ -855,7 +829,7 @@ class NLLLoss(Layer):
if `reduction` is ``'sum'``, the reduced sum loss is returned;
if `reduction` is ``'none'``, no reduction will be applied.
Default is ``'mean'``.
name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
Shape:
- input (Tensor): Input tensor, the shape is :math:`[N, C]`, `C` is the number of classes.
...
...
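A minimal sketch of the usual pairing with ``log_softmax`` (hedged: it assumes the
``paddle.nn.NLLLoss`` interface documented above):

.. code-block:: python

    import paddle

    # NLLLoss consumes log-probabilities, so pair it with log_softmax.
    nll = paddle.nn.NLLLoss(reduction='mean')
    logits = paddle.randn([4, 5])                                # shape [N, C]
    log_probs = paddle.nn.functional.log_softmax(logits, axis=1)
    labels = paddle.to_tensor([1, 0, 3, 2])                      # class indices, shape [N]
    loss = nll(log_probs, labels)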
@@ -914,6 +888,7 @@ class NLLLoss(Layer):
class KLDivLoss(Layer):
r"""
Generate a callable object of 'KLDivLoss' to calculate the
Kullback-Leibler divergence loss between Input(X) and
Input(Target). Note that Input(X) is the log-probability
...
...
@@ -933,14 +908,10 @@ class KLDivLoss(Layer):
Default is ``'mean'``.
Shape:
- input (Tensor): ``(N, *)``, where ``*`` means any number of additional dimensions.
- label (Tensor): ``(N, *)``, same shape as input.
- output (Tensor): tensor with shape: [1] by default.
Examples:
.. code-block:: python
...
...
@@ -970,6 +941,7 @@ class KLDivLoss(Layer):
kldiv_criterion = nn.KLDivLoss(reduction='none')
pred_loss = kldiv_criterion(x, target)
# shape=[5, 20]
"""
def __init__(self, reduction='mean'):
...
...
@@ -1720,6 +1692,7 @@ class TripletMarginLoss(Layer):
class SoftMarginLoss(Layer):
r"""
Creates a criterion that measures a two-class soft margin loss between input predictions ``input``
and target labels ``label`` . It can be described as:
...
...
@@ -1738,17 +1711,14 @@ class SoftMarginLoss(Layer):
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Shapes:
- Input (Tensor): The input tensor with shape: ``[N, *]``,
N is batch_size, `*` means any number of additional dimensions. The ``input`` ranges from -inf to inf.
Available dtype is float32, float64.
- Label (Tensor): The target labels tensor with the same shape as
``input``. The values of the target labels should be -1 or 1.
Available dtype is int32, int64, float32, float64.
- Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
same as ``input`` , else the shape of output is [1].
Returns:
A callable object of SoftMarginLoss.
...
...
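A short usage sketch under the shapes described above (hedged: assumes the default
``reduction='mean'``):

.. code-block:: python

    import paddle

    criterion = paddle.nn.SoftMarginLoss()
    scores = paddle.randn([4, 3])                 # unbounded predictions in (-inf, inf)
    labels = paddle.to_tensor([[ 1., -1.,  1.],
                               [-1.,  1.,  1.],
                               [ 1.,  1., -1.],
                               [-1., -1.,  1.]])  # labels must be -1 or 1
    loss = criterion(scores, labels)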
@@ -1780,6 +1750,7 @@ class SoftMarginLoss(Layer):
@@ -1661,6 +1661,7 @@ class MultiplicativeDecay(LRScheduler):
class OneCycleLR(LRScheduler):
r"""
Sets the learning rate according to the one cycle learning rate scheduler.
The scheduler adjusts the learning rate from an initial learning rate to the maximum learning rate and then
from that maximum learning rate to the minimum learning rate, which is much less than the initial learning rate.
...
...
@@ -1674,22 +1675,25 @@ class OneCycleLR(LRScheduler):
Also note that you should update the learning rate each step.
Args:
max_learning_rate (float): The maximum learning rate. It is a python float number. Functionally, it defines the initial learning rate by ``divide_factor`` .
total_steps (int): Number of total training steps.
divide_factor (float, optional): Initial learning rate will be determined by initial_learning_rate = max_learning_rate / divide_factor. Default: 25.
end_learning_rate (float, optional): The minimum learning rate during training; it should be much less than the initial learning rate.
phase_pct (float): The percentage of total steps used to increase the learning rate. Default: 0.3.
anneal_strategy (str, optional): Strategy of adjusting the learning rate. 'cos' for cosine annealing, 'linear' for linear annealing. Default: 'cos'.
three_phase (bool, optional): Whether to use a three-phase schedule.
If ``True``:
1. The learning rate will first increase from the initial learning rate to the maximum learning rate.
2. Then it will decrease to the initial learning rate. The number of steps in this phase is the same as that in the first phase.
3. Finally, it will decrease to the minimum learning rate, which is much less than the initial learning rate.
If ``False``:
1. The learning rate will increase to the maximum learning rate.
2. Then it will directly decrease to the minimum learning rate.
last_epoch (int, optional): The index of the last epoch. Can be set to restart training. Default: -1, meaning the initial learning rate.
verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
...
...
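For concreteness, a minimal dygraph-style sketch (hedged: it assumes
``paddle.optimizer.lr.OneCycleLR`` as documented above, with ``scheduler.step()``
called once per step as the note requires):

.. code-block:: python

    import paddle

    linear = paddle.nn.Linear(10, 1)
    scheduler = paddle.optimizer.lr.OneCycleLR(
        max_learning_rate=0.1,    # initial lr = max_learning_rate / divide_factor
        total_steps=100,
        divide_factor=25,
        phase_pct=0.3,
        anneal_strategy='cos')
    sgd = paddle.optimizer.SGD(learning_rate=scheduler,
                               parameters=linear.parameters())
    for step in range(100):
        x = paddle.randn([4, 10])
        loss = linear(x).mean()
        loss.backward()
        sgd.step()
        sgd.clear_grad()
        scheduler.step()          # update the learning rate every step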
@@ -1741,6 +1745,7 @@ class OneCycleLR(LRScheduler):
},
fetch_list=loss.name)
scheduler.step() # You should update learning rate each step
- The tensor dimensions are labeled using uncased English letters. E.g., `ijk`
relates to a three dimensional tensor whose dimensions are labeled i, j, and k.
- The equation is `,` separated into terms, each being a distinct input's
dimension label string.
- Ellipsis `...` enables broadcasting by automatically converting the unlabeled
dimensions into broadcasting dimensions.
- Singular labels are called free labels, duplicates are dummy labels. Dummy labeled
dimensions will be reduced and removed in the output.
- Output labels can be explicitly specified on the right hand side of `->` or omitted. In the latter case, the output labels will be inferred from the input labels.
- Inference of output labels
- Broadcasting label `...`, if present, is put on the leftmost position.
- Free labels are reordered alphabetically and put after `...`.
- On explicit output labels
- If broadcasting is enabled, then `...` must be present.
- The output labels can be empty, an indication to output the sum over the original output as a scalar.
- Non-input labels are invalid.
- Duplicate labels are invalid.
- For any dummy label which is present for the output, it's promoted to
a free label.
- For any free label which is not present for the output, it's lowered to
a dummy label.
- Examples
- '...ij, ...jk', where i and k are free labels, j is dummy. The output label
string is '...ik'
- 'ij -> i', where i is a free label and j is a dummy label.
- '...ij, ...jk -> ...ijk', where i, j and k are all free labels.
- '...ij, ...jk -> ij', an invalid equation since `...` is not present for
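Putting the labeling rules above into practice, a hedged sketch using ``paddle.einsum``
(the output shapes follow from the free/dummy-label rules):

.. code-block:: python

    import paddle

    x = paddle.randn([2, 3])
    y = paddle.randn([3, 4])

    # Implied output: free labels i and k, reordered alphabetically -> 'ik'
    print(paddle.einsum('ij,jk', x, y).shape)       # [2, 4]

    # Explicit output: the dummy label j is promoted to a free label
    print(paddle.einsum('ij,jk->ijk', x, y).shape)  # [2, 3, 4]

    # Empty output labels: the sum over the original output, as a scalar
    print(paddle.einsum('ij,jk->', x, y))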