Commit dbe08e9b authored by yuguo960516yuguo's avatar yuguo960516yuguo
Browse files

2.4.2

parent b5499578
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
from __future__ import print_function from __future__ import print_function
import math import math
import numpy as np import numpy as np
import unittest import unittest
...@@ -45,34 +46,64 @@ class TestFoldOp(OpTest): ...@@ -45,34 +46,64 @@ class TestFoldOp(OpTest):
def calc_fold(self): def calc_fold(self):
output_shape = [0] * 4 output_shape = [0] * 4
output_shape[0] = self.batch_size output_shape[0] = self.batch_size
output_shape[1] = int(self.input_channels / output_shape[1] = int(
(self.kernel_sizes[0] * self.kernel_sizes[1])) self.input_channels / (self.kernel_sizes[0] * self.kernel_sizes[1])
)
output_shape[2] = self.output_sizes[0] output_shape[2] = self.output_sizes[0]
output_shape[3] = self.output_sizes[1] output_shape[3] = self.output_sizes[1]
dkernel_h = self.dilations[0] * (self.kernel_sizes[0] - 1) + 1 dkernel_h = self.dilations[0] * (self.kernel_sizes[0] - 1) + 1
dkernel_w = self.dilations[1] * (self.kernel_sizes[1] - 1) + 1 dkernel_w = self.dilations[1] * (self.kernel_sizes[1] - 1) + 1
col_height = int((self.output_sizes[0] + self.paddings[0] + col_height = (
self.paddings[2] - dkernel_h) / self.strides[0]) + 1 int(
col_width = int((self.output_sizes[1] + self.paddings[1] + (
self.paddings[3] - dkernel_w) / self.strides[1]) + 1 self.output_sizes[0]
+ self.paddings[0]
+ self.paddings[2]
- dkernel_h
)
/ self.strides[0]
)
+ 1
)
col_width = (
int(
(
self.output_sizes[1]
+ self.paddings[1]
+ self.paddings[3]
- dkernel_w
)
/ self.strides[1]
)
+ 1
)
output = np.zeros(output_shape).astype(np.float64) output = np.zeros(output_shape).astype(np.float64)
############ calculate output ############## ############ calculate output ##############
for b in range(output_shape[0]): for b in range(output_shape[0]):
for c in range(self.input_channels): for c in range(self.input_channels):
w_offset = int(c % self.kernel_sizes[1]) w_offset = int(c % self.kernel_sizes[1])
h_offset = int( h_offset = int(
(c / self.kernel_sizes[1]) % self.kernel_sizes[0]) (c / self.kernel_sizes[1]) % self.kernel_sizes[0]
)
c_out = int(c / self.kernel_sizes[0] / self.kernel_sizes[1]) c_out = int(c / self.kernel_sizes[0] / self.kernel_sizes[1])
for h in range(col_height): for h in range(col_height):
h_out = int(h * self.strides[0] - self.paddings[0] + h_out = int(
h_offset * self.dilations[0]) h * self.strides[0]
- self.paddings[0]
+ h_offset * self.dilations[0]
)
for w in range(col_width): for w in range(col_width):
w_out = int(w * self.strides[1] - self.paddings[1] + w_out = int(
w_offset * self.dilations[1]) w * self.strides[1]
- self.paddings[1]
+ w_offset * self.dilations[1]
)
if (h_out >= 0 and h_out < self.output_sizes[0]) and ( if (h_out >= 0 and h_out < self.output_sizes[0]) and (
w_out >= 0 and w_out < self.output_sizes[1]): w_out >= 0 and w_out < self.output_sizes[1]
output[b, c_out, h_out, ):
w_out] += self.x[b, c, w + col_width * h] output[b, c_out, h_out, w_out] += self.x[
b, c, w + col_width * h
]
self.outputs = output self.outputs = output
...@@ -85,7 +116,7 @@ class TestFoldOp(OpTest): ...@@ -85,7 +116,7 @@ class TestFoldOp(OpTest):
'paddings': self.paddings, 'paddings': self.paddings,
'dilations': self.dilations, 'dilations': self.dilations,
'strides': self.strides, 'strides': self.strides,
'output_sizes': self.output_sizes 'output_sizes': self.output_sizes,
} }
self.outputs = {'Y': self.outputs} self.outputs = {'Y': self.outputs}
...@@ -101,9 +132,23 @@ class TestFoldOp(OpTest): ...@@ -101,9 +132,23 @@ class TestFoldOp(OpTest):
self.check_grad(['X'], 'Y', check_eager=True) self.check_grad(['X'], 'Y', check_eager=True)
class TestFoldshape(TestFoldOp):
def init_data(self):
self.batch_size = 8
self.input_channels = 3 * 3 * 3
self.length = 6
self.kernel_sizes = [3, 3]
self.strides = [1, 1]
self.paddings = [0, 0, 0, 0]
self.dilations = [1, 1]
self.output_sizes = [4, 5]
input_shape = [self.batch_size, self.input_channels, self.length]
self.x = np.random.rand(*input_shape).astype(np.float64)
class TestFoldAPI(TestFoldOp): class TestFoldAPI(TestFoldOp):
#This is for test on paddle.nn.Fold # This is for test on paddle.nn.Fold
def setUp(self): def setUp(self):
self.op_type = 'fold' self.op_type = 'fold'
...@@ -120,19 +165,19 @@ class TestFoldAPI(TestFoldOp): ...@@ -120,19 +165,19 @@ class TestFoldAPI(TestFoldOp):
m = paddle.nn.Fold(**self.attrs) m = paddle.nn.Fold(**self.attrs)
m.eval() m.eval()
result = m(input) result = m(input)
np.testing.assert_allclose(result.numpy(), np.testing.assert_allclose(
self.outputs['Y'], result.numpy(), self.outputs['Y'], rtol=1e-05
rtol=1e-05) )
def test_info(self): def test_info(self):
str(paddle.nn.Fold(**self.attrs)) str(paddle.nn.Fold(**self.attrs))
class TestFoldOpError(unittest.TestCase): class TestFoldOpError(unittest.TestCase):
def test_errors(self): def test_errors(self):
from paddle.nn.functional import fold from paddle.nn.functional import fold
from paddle.fluid.framework import Program, program_guard from paddle.fluid.framework import Program, program_guard
with program_guard(Program(), Program()): with program_guard(Program(), Program()):
def test_input_shape(): def test_input_shape():
...@@ -148,59 +193,67 @@ class TestFoldOpError(unittest.TestCase): ...@@ -148,59 +193,67 @@ class TestFoldOpError(unittest.TestCase):
def test_padding_shape(): def test_padding_shape():
# padding_size must be 2 or 4 # padding_size must be 2 or 4
x = paddle.randn(shape=[2, 6, 6], dtype="float32") x = paddle.randn(shape=[2, 6, 6], dtype="float32")
out = fold(x, out = fold(
output_sizes=[2, 3], x,
kernel_sizes=[2, 2], output_sizes=[2, 3],
paddings=[2, 2, 3]) kernel_sizes=[2, 2],
paddings=[2, 2, 3],
)
def test_dilations_shape(): def test_dilations_shape():
# dialtions_size must be 2 # dialtions_size must be 2
x = paddle.randn(shape=[2, 6, 6], dtype="float32") x = paddle.randn(shape=[2, 6, 6], dtype="float32")
out = fold(x, out = fold(
output_sizes=[2, 3], x,
kernel_sizes=[2, 2], output_sizes=[2, 3],
dilations=[2, 2, 3]) kernel_sizes=[2, 2],
dilations=[2, 2, 3],
)
def test_strides_shape(): def test_strides_shape():
# strids_size must be 2 # strids_size must be 2
x = paddle.randn(shape=[2, 6, 6], dtype="float32") x = paddle.randn(shape=[2, 6, 6], dtype="float32")
out = fold(x, out = fold(
output_sizes=[2, 3], x,
kernel_sizes=[2, 2], output_sizes=[2, 3],
strides=[2, 2, 3]) kernel_sizes=[2, 2],
strides=[2, 2, 3],
)
def test_output_size(): def test_output_size():
# im_h * im_w must be L # im_h * im_w must be L
x = paddle.randn(shape=[2, 6, 6], dtype="float32") x = paddle.randn(shape=[2, 6, 6], dtype="float32")
out = fold(x, out = fold(
output_sizes=[6, 6], x, output_sizes=[6, 6], kernel_sizes=[2, 2], strides=[1, 1]
kernel_sizes=[2, 2], )
strides=[1, 1])
def test_output_size_2(): def test_output_size_2():
# out_size must GT 1 # out_size must GT 1
x = paddle.randn(shape=[2, 6, 6], dtype="float32") x = paddle.randn(shape=[2, 6, 6], dtype="float32")
out = fold(x, out = fold(
output_sizes=[0.1, 0.2], x,
kernel_sizes=[2, 2], output_sizes=[0.1, 0.2],
strides=[1, 1]) kernel_sizes=[2, 2],
strides=[1, 1],
)
def test_block_h_w(): def test_block_h_w():
# test_block_h_w GT 0 # test_block_h_w GT 0
x = paddle.randn(shape=[2, 1, 1], dtype="float32") x = paddle.randn(shape=[2, 1, 1], dtype="float32")
out = fold(x, out = fold(
output_sizes=[1, 1], x, output_sizes=[1, 1], kernel_sizes=[2, 2], strides=1
kernel_sizes=[2, 2], )
strides=1)
def test_GT_0(): def test_GT_0():
x = paddle.randn(shape=[2, 1, 1], dtype="float32") x = paddle.randn(shape=[2, 1, 1], dtype="float32")
out = fold(x, out = fold(
output_sizes=[0, 0], x,
kernel_sizes=[0, 0], output_sizes=[0, 0],
dilations=0, kernel_sizes=[0, 0],
paddings=[0, 0], dilations=0,
strides=0) paddings=[0, 0],
strides=0,
)
self.assertRaises(AssertionError, test_input_shape) self.assertRaises(AssertionError, test_input_shape)
self.assertRaises(AssertionError, test_kernel_shape) self.assertRaises(AssertionError, test_kernel_shape)
......
...@@ -30,10 +30,10 @@ from paddle.fluid.framework import default_main_program ...@@ -30,10 +30,10 @@ from paddle.fluid.framework import default_main_program
from paddle.fluid import core from paddle.fluid import core
@unittest.skipIf(not core.is_compiled_with_cuda(), @unittest.skipIf(
"Paddle is not compiled with CUDA") not core.is_compiled_with_cuda(), "Paddle is not compiled with CUDA"
)
class TestFusedGateAttentionOp(OpTest): class TestFusedGateAttentionOp(OpTest):
def setUp(self): def setUp(self):
self.__class__.op_type = "fused_gate_attention" self.__class__.op_type = "fused_gate_attention"
# use autograd to check grad in this unittest. # use autograd to check grad in this unittest.
...@@ -57,7 +57,6 @@ class TestFusedGateAttentionOp(OpTest): ...@@ -57,7 +57,6 @@ class TestFusedGateAttentionOp(OpTest):
self.bias_attr = True self.bias_attr = True
def generate_input_data(self): def generate_input_data(self):
def _random(shape): def _random(shape):
if self.dtype == "bfloat16": if self.dtype == "bfloat16":
data = np.random.random(shape).astype("float32") data = np.random.random(shape).astype("float32")
...@@ -67,7 +66,8 @@ class TestFusedGateAttentionOp(OpTest): ...@@ -67,7 +66,8 @@ class TestFusedGateAttentionOp(OpTest):
np.random.seed(123) np.random.seed(123)
self.query = _random( self.query = _random(
(self.batch_size, self.msa_len, self.res_len, self.q_dim)) (self.batch_size, self.msa_len, self.res_len, self.q_dim)
)
self.q_weight = _random((self.q_dim, self.num_heads, self.head_dim)) self.q_weight = _random((self.q_dim, self.num_heads, self.head_dim))
self.k_weight = _random((self.kv_dim, self.num_heads, self.head_dim)) self.k_weight = _random((self.kv_dim, self.num_heads, self.head_dim))
self.v_weight = _random((self.kv_dim, self.num_heads, self.head_dim)) self.v_weight = _random((self.kv_dim, self.num_heads, self.head_dim))
...@@ -80,15 +80,18 @@ class TestFusedGateAttentionOp(OpTest): ...@@ -80,15 +80,18 @@ class TestFusedGateAttentionOp(OpTest):
self.qkv_weight = np.stack([q_weight_t, k_weight_t, v_weight_t]) self.qkv_weight = np.stack([q_weight_t, k_weight_t, v_weight_t])
else: else:
self.key = _random( self.key = _random(
(self.batch_size, self.msa_len, self.m_size, self.kv_dim)) (self.batch_size, self.msa_len, self.m_size, self.kv_dim)
)
self.qkv_weight = None self.qkv_weight = None
self.attn_mask = _random( self.attn_mask = _random(
(self.batch_size, self.msa_len, 1, 1, self.m_size)) (self.batch_size, self.msa_len, 1, 1, self.m_size)
)
if self.bias_attr: if self.bias_attr:
self.nonbatched_bias = _random( self.nonbatched_bias = _random(
(self.batch_size, 1, self.num_heads, self.res_len, self.m_size)) (self.batch_size, 1, self.num_heads, self.res_len, self.m_size)
)
if self.has_gating: if self.has_gating:
self.gating_w = _random((self.q_dim, self.num_heads, self.head_dim)) self.gating_w = _random((self.q_dim, self.num_heads, self.head_dim))
...@@ -98,12 +101,17 @@ class TestFusedGateAttentionOp(OpTest): ...@@ -98,12 +101,17 @@ class TestFusedGateAttentionOp(OpTest):
self.output_b = _random((self.out_dim)) self.output_b = _random((self.out_dim))
self.dout = _random( self.dout = _random(
(self.batch_size, self.msa_len, self.res_len, self.q_dim)) (self.batch_size, self.msa_len, self.res_len, self.q_dim)
)
def collect_outputs(self, query, key, softmax_out, fmha_out, gate_out, out): def collect_outputs(self, query, key, softmax_out, fmha_out, gate_out, out):
outputs = [ outputs = [
softmax_out, fmha_out, gate_out if self.has_gating else None, out, softmax_out,
query.grad, None if self.merge_qkv else key.grad fmha_out,
gate_out if self.has_gating else None,
out,
query.grad,
None if self.merge_qkv else key.grad,
] ]
return outputs return outputs
...@@ -111,14 +119,17 @@ class TestFusedGateAttentionOp(OpTest): ...@@ -111,14 +119,17 @@ class TestFusedGateAttentionOp(OpTest):
paddle.disable_static(place=paddle.CUDAPlace(0)) paddle.disable_static(place=paddle.CUDAPlace(0))
query = paddle.to_tensor(self.query, stop_gradient=False) query = paddle.to_tensor(self.query, stop_gradient=False)
key = query if self.merge_qkv else paddle.to_tensor(self.key, key = (
stop_gradient=False) query
if self.merge_qkv
else paddle.to_tensor(self.key, stop_gradient=False)
)
q_weight = paddle.to_tensor(self.q_weight, stop_gradient=False) q_weight = paddle.to_tensor(self.q_weight, stop_gradient=False)
k_weight = paddle.to_tensor(self.k_weight, stop_gradient=False) k_weight = paddle.to_tensor(self.k_weight, stop_gradient=False)
v_weight = paddle.to_tensor(self.v_weight, stop_gradient=False) v_weight = paddle.to_tensor(self.v_weight, stop_gradient=False)
src_mask = paddle.to_tensor(self.attn_mask, stop_gradient=True) src_mask = paddle.to_tensor(self.attn_mask, stop_gradient=True)
c = self.head_dim**(-0.5) c = self.head_dim ** (-0.5)
# [batch_size, msa_len, res_len, q_dim], [q_dim, num_heads, head_dim] # [batch_size, msa_len, res_len, q_dim], [q_dim, num_heads, head_dim]
# -> [batch_size, msa_len, res_len, num_heads, head_dim] # -> [batch_size, msa_len, res_len, num_heads, head_dim]
q = paddle.einsum('nbqa,ahc->nbqhc', query, q_weight) * c q = paddle.einsum('nbqa,ahc->nbqhc', query, q_weight) * c
...@@ -136,8 +147,9 @@ class TestFusedGateAttentionOp(OpTest): ...@@ -136,8 +147,9 @@ class TestFusedGateAttentionOp(OpTest):
# -> [batch_size, msa_len, num_heads, res_len, m_size] # -> [batch_size, msa_len, num_heads, res_len, m_size]
logits = logits + src_mask logits = logits + src_mask
if self.bias_attr: if self.bias_attr:
nonbatched_bias = paddle.to_tensor(self.nonbatched_bias, nonbatched_bias = paddle.to_tensor(
stop_gradient=False) self.nonbatched_bias, stop_gradient=False
)
# [batch_size, msa_len, num_heads, res_len, m_size], [batch_size, 1, num_heads, res_len, m_size] # [batch_size, msa_len, num_heads, res_len, m_size], [batch_size, 1, num_heads, res_len, m_size]
# -> [batch_size, msa_len, num_heads, res_len, m_size] # -> [batch_size, msa_len, num_heads, res_len, m_size]
logits = logits + nonbatched_bias logits = logits + nonbatched_bias
...@@ -159,14 +171,22 @@ class TestFusedGateAttentionOp(OpTest): ...@@ -159,14 +171,22 @@ class TestFusedGateAttentionOp(OpTest):
# gate_values = paddle.einsum('nbqc,chv->nbqhv', query, # gate_values = paddle.einsum('nbqc,chv->nbqhv', query,
# gating_w) + gating_b # gating_w) + gating_b
gating_w_2d = paddle.reshape( gating_w_2d = paddle.reshape(
gating_w, shape=[self.q_dim, self.num_heads * self.head_dim]) gating_w, shape=[self.q_dim, self.num_heads * self.head_dim]
)
gate_values_4d = paddle.matmul(query, gating_w_2d) gate_values_4d = paddle.matmul(query, gating_w_2d)
gate_values = paddle.reshape( gate_values = (
gate_values_4d, paddle.reshape(
shape=[ gate_values_4d,
self.batch_size, self.msa_len, self.res_len, self.num_heads, shape=[
self.head_dim self.batch_size,
]) + gating_b self.msa_len,
self.res_len,
self.num_heads,
self.head_dim,
],
)
+ gating_b
)
gate_values = nn.functional.sigmoid(gate_values) gate_values = nn.functional.sigmoid(gate_values)
gate_out = fmha_out * gate_values gate_out = fmha_out * gate_values
else: else:
...@@ -183,20 +203,32 @@ class TestFusedGateAttentionOp(OpTest): ...@@ -183,20 +203,32 @@ class TestFusedGateAttentionOp(OpTest):
gate_out, gate_out,
shape=[ shape=[
self.batch_size * self.msa_len * self.res_len, self.batch_size * self.msa_len * self.res_len,
self.num_heads * self.head_dim self.num_heads * self.head_dim,
]) ],
)
output_w_2d = paddle.reshape( output_w_2d = paddle.reshape(
output_w, shape=[self.num_heads * self.head_dim, self.out_dim]) output_w, shape=[self.num_heads * self.head_dim, self.out_dim]
)
out_2d = paddle.matmul(gate_out_2d, output_w_2d) out_2d = paddle.matmul(gate_out_2d, output_w_2d)
out = paddle.reshape( out = (
out_2d, paddle.reshape(
shape=[self.batch_size, self.msa_len, self.res_len, self.out_dim out_2d,
]) + output_b shape=[
self.batch_size,
paddle.autograd.backward([out], [paddle.to_tensor(self.dout)], self.msa_len,
retain_graph=True) self.res_len,
return self.collect_outputs(query, key, softmax_out, fmha_out, gate_out, self.out_dim,
out) ],
)
+ output_b
)
paddle.autograd.backward(
[out], [paddle.to_tensor(self.dout)], retain_graph=True
)
return self.collect_outputs(
query, key, softmax_out, fmha_out, gate_out, out
)
def get_fused_gate_attention_out(self): def get_fused_gate_attention_out(self):
paddle.disable_static(place=paddle.CUDAPlace(0)) paddle.disable_static(place=paddle.CUDAPlace(0))
...@@ -218,8 +250,9 @@ class TestFusedGateAttentionOp(OpTest): ...@@ -218,8 +250,9 @@ class TestFusedGateAttentionOp(OpTest):
src_mask = paddle.to_tensor(self.attn_mask, stop_gradient=True) src_mask = paddle.to_tensor(self.attn_mask, stop_gradient=True)
if self.bias_attr: if self.bias_attr:
nonbatched_bias = paddle.to_tensor(self.nonbatched_bias, nonbatched_bias = paddle.to_tensor(
stop_gradient=False) self.nonbatched_bias, stop_gradient=False
)
else: else:
nonbatched_bias = None nonbatched_bias = None
if self.has_gating: if self.has_gating:
...@@ -232,18 +265,42 @@ class TestFusedGateAttentionOp(OpTest): ...@@ -232,18 +265,42 @@ class TestFusedGateAttentionOp(OpTest):
output_w = paddle.to_tensor(self.output_w, stop_gradient=False) output_w = paddle.to_tensor(self.output_w, stop_gradient=False)
output_b = paddle.to_tensor(self.output_b, stop_gradient=False) output_b = paddle.to_tensor(self.output_b, stop_gradient=False)
_, _, _, _, softmax_out, fmha_out, gate_out, out = _legacy_C_ops.fused_gate_attention( (
query, key, q_weight, k_weight, v_weight, qkv_weight, _,
nonbatched_bias, src_mask, gating_w, gating_b, output_w, output_b, _,
'has_gating', self.has_gating, 'merge_qkv', self.merge_qkv) _,
_,
paddle.autograd.backward([out], [paddle.to_tensor(self.dout)], softmax_out,
retain_graph=True) fmha_out,
return self.collect_outputs(query, key, softmax_out, fmha_out, gate_out, gate_out,
out) out,
) = _legacy_C_ops.fused_gate_attention(
query,
key,
q_weight,
k_weight,
v_weight,
qkv_weight,
nonbatched_bias,
src_mask,
gating_w,
gating_b,
output_w,
output_b,
'has_gating',
self.has_gating,
'merge_qkv',
self.merge_qkv,
)
paddle.autograd.backward(
[out], [paddle.to_tensor(self.dout)], retain_graph=True
)
return self.collect_outputs(
query, key, softmax_out, fmha_out, gate_out, out
)
def check(self, ref, out, atol, rtol, check_equal, name): def check(self, ref, out, atol, rtol, check_equal, name):
def _convert(value): def _convert(value):
if self.dtype == "bfloat16": if self.dtype == "bfloat16":
return convert_uint16_to_float(value) return convert_uint16_to_float(value)
...@@ -252,19 +309,25 @@ class TestFusedGateAttentionOp(OpTest): ...@@ -252,19 +309,25 @@ class TestFusedGateAttentionOp(OpTest):
if check_equal: if check_equal:
self.assertTrue( self.assertTrue(
np.equal(_convert(ref), _convert(out)).all(), np.equal(_convert(ref), _convert(out)).all(),
"Checking < {} > failed!".format(name)) "Checking < {} > failed!".format(name),
)
else: else:
np.testing.assert_allclose( np.testing.assert_allclose(
_convert(ref), _convert(ref),
_convert(out), _convert(out),
atol=atol, atol=atol,
rtol=rtol, rtol=rtol,
err_msg="Checking < {} > failed!".format(name)) err_msg="Checking < {} > failed!".format(name),
)
def check_output_and_grad(self, atol, rtol): def check_output_and_grad(self, atol, rtol):
output_names = [ output_names = [
"softmax_out", "fmha_out", "gate_out", "out", "query_grad", "softmax_out",
"key_grad" "fmha_out",
"gate_out",
"out",
"query_grad",
"key_grad",
] ]
outputs_ref = self.get_reference_out() outputs_ref = self.get_reference_out()
outputs_fused = self.get_fused_gate_attention_out() outputs_fused = self.get_fused_gate_attention_out()
...@@ -280,22 +343,26 @@ class TestFusedGateAttentionOp(OpTest): ...@@ -280,22 +343,26 @@ class TestFusedGateAttentionOp(OpTest):
# that in fused ops, check_equal is set to False and we use allclose # that in fused ops, check_equal is set to False and we use allclose
# to check the correctness. # to check the correctness.
check_equal = False check_equal = False
self.check(ref_res.numpy(), fused_res.numpy(), atol, rtol, self.check(
check_equal, output_names[i]) ref_res.numpy(),
fused_res.numpy(),
atol,
rtol,
check_equal,
output_names[i],
)
def test_output_and_grad(self): def test_output_and_grad(self):
self.check_output_and_grad(atol=1e-5, rtol=1e-6) self.check_output_and_grad(atol=1e-5, rtol=1e-6)
class TestMergeQKVLargeBatchSizeCase(TestFusedGateAttentionOp): class TestMergeQKVLargeBatchSizeCase(TestFusedGateAttentionOp):
def config(self): def config(self):
super().config() super().config()
self.batch_size = 2 self.batch_size = 2
class TestSeparatedQKVCase(TestFusedGateAttentionOp): class TestSeparatedQKVCase(TestFusedGateAttentionOp):
def config(self): def config(self):
self.dtype = "float32" self.dtype = "float32"
self.has_gating = False self.has_gating = False
...@@ -312,7 +379,6 @@ class TestSeparatedQKVCase(TestFusedGateAttentionOp): ...@@ -312,7 +379,6 @@ class TestSeparatedQKVCase(TestFusedGateAttentionOp):
class TestMergeQKVNoBiasGatingCase(TestFusedGateAttentionOp): class TestMergeQKVNoBiasGatingCase(TestFusedGateAttentionOp):
def config(self): def config(self):
super().config() super().config()
self.has_gating = False self.has_gating = False
...@@ -320,7 +386,6 @@ class TestMergeQKVNoBiasGatingCase(TestFusedGateAttentionOp): ...@@ -320,7 +386,6 @@ class TestMergeQKVNoBiasGatingCase(TestFusedGateAttentionOp):
class TestMergeQKVFp16Case(TestFusedGateAttentionOp): class TestMergeQKVFp16Case(TestFusedGateAttentionOp):
def config(self): def config(self):
super().config() super().config()
self.dtype = "float16" self.dtype = "float16"
...@@ -332,18 +397,18 @@ class TestMergeQKVFp16Case(TestFusedGateAttentionOp): ...@@ -332,18 +397,18 @@ class TestMergeQKVFp16Case(TestFusedGateAttentionOp):
class TestMergeQKVLargeBatchSizeFp16Case(TestMergeQKVFp16Case): class TestMergeQKVLargeBatchSizeFp16Case(TestMergeQKVFp16Case):
def config(self): def config(self):
super().config() super().config()
self.batch_size = 2 self.batch_size = 2
@unittest.skipIf( @unittest.skipIf(
not core.is_compiled_with_cuda() or get_cuda_version() < 11000, not core.is_compiled_with_cuda()
"core is not compiled with CUDA and cuda version need larger than or equal to 11.3" or get_cuda_version() < 11000
or paddle.device.cuda.get_device_capability()[0] < 8,
"core is not compiled with CUDA and cuda version need larger than or equal to 11.3",
) )
class TestMergeQKVBF16Case(TestFusedGateAttentionOp): class TestMergeQKVBF16Case(TestFusedGateAttentionOp):
def config(self): def config(self):
super().config() super().config()
self.dtype = "bfloat16" self.dtype = "bfloat16"
...@@ -353,7 +418,6 @@ class TestMergeQKVBF16Case(TestFusedGateAttentionOp): ...@@ -353,7 +418,6 @@ class TestMergeQKVBF16Case(TestFusedGateAttentionOp):
class TestMergeQKVLargeBatchSizeBF16Case(TestMergeQKVBF16Case): class TestMergeQKVLargeBatchSizeBF16Case(TestMergeQKVBF16Case):
def config(self): def config(self):
super().config() super().config()
self.batch_size = 2 self.batch_size = 2
......
...@@ -20,19 +20,22 @@ from functools import partial ...@@ -20,19 +20,22 @@ from functools import partial
class TestResnetGPU(TestResnetBase): class TestResnetGPU(TestResnetBase):
def test_seresnext_with_learning_rate_decay(self): def test_seresnext_with_learning_rate_decay(self):
# NOTE(zcd): This test is compare the result of use parallel_executor # NOTE(zcd): This test is compare the result of use parallel_executor
# and executor, and the result of drop_out op and batch_norm op in # and executor, and the result of drop_out op and batch_norm op in
# this two executor have diff, so the two ops should be removed # this two executor have diff, so the two ops should be removed
# from the model. # from the model.
check_func = partial(self.check_network_convergence, check_func = partial(
optimizer=seresnext_net.optimizer, self.check_network_convergence,
use_parallel_executor=False) optimizer=seresnext_net.optimizer,
self._compare_result_with_origin_model(check_func, use_parallel_executor=False,
use_device=DeviceType.CUDA, )
delta2=1e-5, self._compare_result_with_origin_model(
compare_separately=False) check_func,
use_device=DeviceType.CUDA,
delta2=1e-3,
compare_separately=False,
)
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -93,14 +93,9 @@ def get_csr_value(mat, layout, nnz): ...@@ -93,14 +93,9 @@ def get_csr_value(mat, layout, nnz):
return value return value
def ref_sparse_attention(q, def ref_sparse_attention(
k, q, k, v, offset, columns, kp_mask=None, attn_mask=None, bsz=None
v, ):
offset,
columns,
kp_mask=None,
attn_mask=None,
bsz=None):
row, col, nnz = q.shape[0], q.shape[1], columns.shape[0] row, col, nnz = q.shape[0], q.shape[1], columns.shape[0]
mat = np.zeros((row, row)) mat = np.zeros((row, row))
for cur_row in range(row): for cur_row in range(row):
...@@ -111,7 +106,7 @@ def ref_sparse_attention(q, ...@@ -111,7 +106,7 @@ def ref_sparse_attention(q,
mat[cur_row][cur_col] = 1 mat[cur_row][cur_col] = 1
a = np.dot(q, k.T) * mat a = np.dot(q, k.T) * mat
a_value = get_csr_value(a, mat, nnz) a_value = get_csr_value(a, mat, nnz)
scaling = float(col)**-0.5 scaling = float(col) ** -0.5
a = scaling * a a = scaling * a
for i in range(row): for i in range(row):
for j in range(row): for j in range(row):
...@@ -127,13 +122,9 @@ def ref_sparse_attention(q, ...@@ -127,13 +122,9 @@ def ref_sparse_attention(q,
return result, a_value, b_value return result, a_value, b_value
def ref_batch_sparse_attention(q, def ref_batch_sparse_attention(
k, q, k, v, offset, columns, kp_mask=None, attn_mask=None
v, ):
offset,
columns,
kp_mask=None,
attn_mask=None):
batch_size, num_heads, row, col = q.shape batch_size, num_heads, row, col = q.shape
nnz = columns.shape[2] nnz = columns.shape[2]
result = np.zeros((batch_size, num_heads, row, col)) result = np.zeros((batch_size, num_heads, row, col))
...@@ -141,11 +132,16 @@ def ref_batch_sparse_attention(q, ...@@ -141,11 +132,16 @@ def ref_batch_sparse_attention(q,
result_softmax = np.zeros((batch_size, num_heads, nnz)) result_softmax = np.zeros((batch_size, num_heads, nnz))
for i in range(batch_size): for i in range(batch_size):
for j in range(num_heads): for j in range(num_heads):
cur_q, cur_k, cur_v, = q[i][j], k[i][j], v[i][j] cur_q, cur_k, cur_v, = (
q[i][j],
k[i][j],
v[i][j],
)
cur_offset, cur_columns = offset[i][j], columns[i][j] cur_offset, cur_columns = offset[i][j], columns[i][j]
if kp_mask is None and attn_mask is None: if kp_mask is None and attn_mask is None:
cur_result, cur_sdd, cur_softmax = ref_sparse_attention( cur_result, cur_sdd, cur_softmax = ref_sparse_attention(
cur_q, cur_k, cur_v, cur_offset, cur_columns) cur_q, cur_k, cur_v, cur_offset, cur_columns
)
else: else:
cur_result, cur_sdd, cur_softmax = ref_sparse_attention( cur_result, cur_sdd, cur_softmax = ref_sparse_attention(
cur_q, cur_q,
...@@ -155,7 +151,8 @@ def ref_batch_sparse_attention(q, ...@@ -155,7 +151,8 @@ def ref_batch_sparse_attention(q,
cur_columns, cur_columns,
kp_mask=kp_mask, kp_mask=kp_mask,
attn_mask=attn_mask, attn_mask=attn_mask,
bsz=i) bsz=i,
)
result[i][j] = cur_result result[i][j] = cur_result
result_sdd[i][j], result_softmax[i][j] = cur_sdd, cur_softmax result_sdd[i][j], result_softmax[i][j] = cur_sdd, cur_softmax
return result, result_sdd, result_softmax return result, result_sdd, result_softmax
...@@ -193,10 +190,9 @@ def init_csr_format(batch_size, num_heads, rows, blocksize): ...@@ -193,10 +190,9 @@ def init_csr_format(batch_size, num_heads, rows, blocksize):
@unittest.skipIf( @unittest.skipIf(
not core.is_compiled_with_cuda() or get_cuda_version() < 11030, not core.is_compiled_with_cuda() or get_cuda_version() < 11030,
"core is not compiled with CUDA and cuda version need larger than or equal to 11.3" "core is not compiled with CUDA and cuda version need larger than or equal to 11.3",
) )
class TestSparseAttentionOp(OpTest): class TestSparseAttentionOp(OpTest):
def config(self): def config(self):
self.shape = (1, 1, 16, 16) self.shape = (1, 1, 16, 16)
self.blocksize = 4 self.blocksize = 4
...@@ -212,8 +208,9 @@ class TestSparseAttentionOp(OpTest): ...@@ -212,8 +208,9 @@ class TestSparseAttentionOp(OpTest):
self.k = np.random.random(self.shape).astype(self.dtype) self.k = np.random.random(self.shape).astype(self.dtype)
self.v = np.random.random(self.shape).astype(self.dtype) self.v = np.random.random(self.shape).astype(self.dtype)
# init CSR tensor # init CSR tensor
offset, columns = init_csr_format(self.shape[0], self.shape[1], offset, columns = init_csr_format(
self.shape[2], self.blocksize) self.shape[0], self.shape[1], self.shape[2], self.blocksize
)
self.offset = offset.astype('int32') self.offset = offset.astype('int32')
self.columns = columns.astype('int32') self.columns = columns.astype('int32')
# init mask tensor # init mask tensor
...@@ -234,10 +231,12 @@ class TestSparseAttentionOp(OpTest): ...@@ -234,10 +231,12 @@ class TestSparseAttentionOp(OpTest):
self.offset, self.offset,
self.columns, self.columns,
kp_mask=self.key_padding_mask, kp_mask=self.key_padding_mask,
attn_mask=self.attn_mask) attn_mask=self.attn_mask,
)
else: else:
result, result_sdd, result_softmax = ref_batch_sparse_attention( result, result_sdd, result_softmax = ref_batch_sparse_attention(
self.q, self.k, self.v, self.offset, self.columns) self.q, self.k, self.v, self.offset, self.columns
)
if self.use_mask == True: if self.use_mask == True:
self.inputs = { self.inputs = {
...@@ -260,7 +259,7 @@ class TestSparseAttentionOp(OpTest): ...@@ -260,7 +259,7 @@ class TestSparseAttentionOp(OpTest):
self.outputs = { self.outputs = {
'Out': result.astype(self.dtype), 'Out': result.astype(self.dtype),
'SparseDotSdd': result_sdd.astype(self.dtype), 'SparseDotSdd': result_sdd.astype(self.dtype),
'Softmax': result_softmax.astype(self.dtype) 'Softmax': result_softmax.astype(self.dtype),
} }
def test_check_output(self): def test_check_output(self):
...@@ -273,7 +272,6 @@ class TestSparseAttentionOp(OpTest): ...@@ -273,7 +272,6 @@ class TestSparseAttentionOp(OpTest):
class TestSparseAttentionOpFp32Test(TestSparseAttentionOp): class TestSparseAttentionOpFp32Test(TestSparseAttentionOp):
def config(self): def config(self):
self.shape = (1, 1, 8, 16) self.shape = (1, 1, 8, 16)
self.blocksize = 2 self.blocksize = 2
...@@ -282,7 +280,6 @@ class TestSparseAttentionOpFp32Test(TestSparseAttentionOp): ...@@ -282,7 +280,6 @@ class TestSparseAttentionOpFp32Test(TestSparseAttentionOp):
class TestSparseAttentionOpShapeTest(TestSparseAttentionOp): class TestSparseAttentionOpShapeTest(TestSparseAttentionOp):
def config(self): def config(self):
self.shape = (2, 2, 32, 8) self.shape = (2, 2, 32, 8)
self.blocksize = 8 self.blocksize = 8
...@@ -292,10 +289,9 @@ class TestSparseAttentionOpShapeTest(TestSparseAttentionOp): ...@@ -292,10 +289,9 @@ class TestSparseAttentionOpShapeTest(TestSparseAttentionOp):
@unittest.skipIf( @unittest.skipIf(
not core.is_compiled_with_cuda() or get_cuda_version() < 11030, not core.is_compiled_with_cuda() or get_cuda_version() < 11030,
"core is not compiled with CUDA and cuda version need larger than or equal to 11.3" "core is not compiled with CUDA and cuda version need larger than or equal to 11.3",
) )
class TestSparseAttentionAPI(unittest.TestCase): class TestSparseAttentionAPI(unittest.TestCase):
def setUp(self): def setUp(self):
self.place = paddle.CUDAPlace(0) self.place = paddle.CUDAPlace(0)
self.shape = (1, 1, 8, 4) self.shape = (1, 1, 8, 4)
...@@ -310,54 +306,62 @@ class TestSparseAttentionAPI(unittest.TestCase): ...@@ -310,54 +306,62 @@ class TestSparseAttentionAPI(unittest.TestCase):
K = paddle.static.data(name="K", shape=self.shape, dtype=self.dtype) K = paddle.static.data(name="K", shape=self.shape, dtype=self.dtype)
V = paddle.static.data(name="V", shape=self.shape, dtype=self.dtype) V = paddle.static.data(name="V", shape=self.shape, dtype=self.dtype)
batch_size, num_heads, rows = self.shape[0], self.shape[ batch_size, num_heads, rows = (
1], self.shape[2] self.shape[0],
self.shape[1],
self.shape[2],
)
block_num = rows / self.blocksize block_num = rows / self.blocksize
block_last = rows % self.blocksize block_last = rows % self.blocksize
sparse_nnz_num = block_num * self.blocksize * self.blocksize + block_last * block_last sparse_nnz_num = (
block_num * self.blocksize * self.blocksize
+ block_last * block_last
)
offset_shape = (batch_size, num_heads, rows + 1) offset_shape = (batch_size, num_heads, rows + 1)
columns_shape = (batch_size, num_heads, int(sparse_nnz_num)) columns_shape = (batch_size, num_heads, int(sparse_nnz_num))
offset = paddle.static.data(name="Offset", offset = paddle.static.data(
shape=offset_shape, name="Offset", shape=offset_shape, dtype="int32"
dtype="int32") )
columns = paddle.static.data(name="Columns", columns = paddle.static.data(
shape=columns_shape, name="Columns", shape=columns_shape, dtype="int32"
dtype="int32") )
key_padding_mask_shape = (self.shape[0], self.shape[2]) key_padding_mask_shape = (self.shape[0], self.shape[2])
attn_mask_shape = (self.shape[2], self.shape[2]) attn_mask_shape = (self.shape[2], self.shape[2])
if self.use_mask == True: if self.use_mask == True:
key_padding_mask = paddle.static.data( key_padding_mask = paddle.static.data(
name="KeyPaddingMask", name="KeyPaddingMask",
shape=key_padding_mask_shape, shape=key_padding_mask_shape,
dtype=self.dtype) dtype=self.dtype,
attn_mask = paddle.static.data(name="AttnMask", )
shape=attn_mask_shape, attn_mask = paddle.static.data(
dtype=self.dtype) name="AttnMask", shape=attn_mask_shape, dtype=self.dtype
Out = F.sparse_attention(Q, )
K, Out = F.sparse_attention(
V, Q,
offset, K,
columns, V,
key_padding_mask=key_padding_mask, offset,
attn_mask=attn_mask) columns,
key_padding_mask=key_padding_mask,
attn_mask=attn_mask,
)
else: else:
Out = F.sparse_attention(Q, K, V, offset, columns) Out = F.sparse_attention(Q, K, V, offset, columns)
Q_np = np.random.random(self.shape).astype(self.dtype) Q_np = np.random.random(self.shape).astype(self.dtype)
K_np = np.random.random(self.shape).astype(self.dtype) K_np = np.random.random(self.shape).astype(self.dtype)
V_np = np.random.random(self.shape).astype(self.dtype) V_np = np.random.random(self.shape).astype(self.dtype)
offset_np, columns_np = init_csr_format(self.shape[0], offset_np, columns_np = init_csr_format(
self.shape[1], self.shape[0], self.shape[1], self.shape[2], self.blocksize
self.shape[2], )
self.blocksize)
offset_np = offset_np.astype('int32') offset_np = offset_np.astype('int32')
columns_np = columns_np.astype('int32') columns_np = columns_np.astype('int32')
# init mask tensor # init mask tensor
key_padding_mask_np = np.random.randint(0, key_padding_mask_np = np.random.randint(
2, 0, 2, size=key_padding_mask_shape
size=key_padding_mask_shape) )
attn_mask_np = np.random.randint(0, 2, size=attn_mask_shape) attn_mask_np = np.random.randint(0, 2, size=attn_mask_shape)
key_padding_mask_np = init_mask(key_padding_mask_np) key_padding_mask_np = init_mask(key_padding_mask_np)
attn_mask_np = init_mask(attn_mask_np) attn_mask_np = init_mask(attn_mask_np)
...@@ -366,16 +370,18 @@ class TestSparseAttentionAPI(unittest.TestCase): ...@@ -366,16 +370,18 @@ class TestSparseAttentionAPI(unittest.TestCase):
exe = fluid.Executor(self.place) exe = fluid.Executor(self.place)
if self.use_mask == True: if self.use_mask == True:
fetches_result = exe.run(feed={ fetches_result = exe.run(
"Q": Q_np, feed={
"K": K_np, "Q": Q_np,
"V": V_np, "K": K_np,
"Offset": offset_np, "V": V_np,
"Columns": columns_np, "Offset": offset_np,
'KeyPaddingMask': key_padding_mask_np, "Columns": columns_np,
'AttnMask': attn_mask_np 'KeyPaddingMask': key_padding_mask_np,
}, 'AttnMask': attn_mask_np,
fetch_list=[Out]) },
fetch_list=[Out],
)
expected_result, __, __ = ref_batch_sparse_attention( expected_result, __, __ = ref_batch_sparse_attention(
Q_np, Q_np,
K_np, K_np,
...@@ -383,28 +389,32 @@ class TestSparseAttentionAPI(unittest.TestCase): ...@@ -383,28 +389,32 @@ class TestSparseAttentionAPI(unittest.TestCase):
offset_np, offset_np,
columns_np, columns_np,
kp_mask=key_padding_mask_np, kp_mask=key_padding_mask_np,
attn_mask=attn_mask_np) attn_mask=attn_mask_np,
)
else: else:
fetches_result = exe.run(feed={ fetches_result = exe.run(
"Q": Q_np, feed={
"K": K_np, "Q": Q_np,
"V": V_np, "K": K_np,
"Offset": offset_np, "V": V_np,
"Columns": columns_np "Offset": offset_np,
}, "Columns": columns_np,
fetch_list=[Out]) },
fetch_list=[Out],
)
expected_result, __, __ = ref_batch_sparse_attention( expected_result, __, __ = ref_batch_sparse_attention(
Q_np, K_np, V_np, offset_np, columns_np) Q_np, K_np, V_np, offset_np, columns_np
)
np.testing.assert_allclose(fetches_result, np.testing.assert_allclose(
expected_result, fetches_result[0], expected_result, rtol=1e-05, atol=1e-05
rtol=1e-05, )
atol=1e-05)
def test_dygraph(self): def test_dygraph(self):
paddle.disable_static() paddle.disable_static()
offset, columns = init_csr_format(self.shape[0], self.shape[1], offset, columns = init_csr_format(
self.shape[2], self.blocksize) self.shape[0], self.shape[1], self.shape[2], self.blocksize
)
offset = offset.astype('int32') offset = offset.astype('int32')
columns = columns.astype('int32') columns = columns.astype('int32')
query = np.random.random(self.shape).astype(self.dtype) query = np.random.random(self.shape).astype(self.dtype)
...@@ -429,13 +439,15 @@ class TestSparseAttentionAPI(unittest.TestCase): ...@@ -429,13 +439,15 @@ class TestSparseAttentionAPI(unittest.TestCase):
paddle_attn_mask = paddle.to_tensor(attn_mask, place=self.place) paddle_attn_mask = paddle.to_tensor(attn_mask, place=self.place)
if self.use_mask == True: if self.use_mask == True:
paddle_result = F.sparse_attention(paddle_query, paddle_result = F.sparse_attention(
paddle_key, paddle_query,
paddle_value, paddle_key,
paddle_offset, paddle_value,
paddle_colunmns, paddle_offset,
key_padding_mask=paddle_kp_mask, paddle_colunmns,
attn_mask=paddle_attn_mask) key_padding_mask=paddle_kp_mask,
attn_mask=paddle_attn_mask,
)
numpy_result, __, __ = ref_batch_sparse_attention( numpy_result, __, __ = ref_batch_sparse_attention(
query, query,
...@@ -444,25 +456,29 @@ class TestSparseAttentionAPI(unittest.TestCase): ...@@ -444,25 +456,29 @@ class TestSparseAttentionAPI(unittest.TestCase):
offset, offset,
columns, columns,
kp_mask=key_padding_mask, kp_mask=key_padding_mask,
attn_mask=attn_mask) attn_mask=attn_mask,
)
numpy_result = numpy_result.astype(self.dtype) numpy_result = numpy_result.astype(self.dtype)
else: else:
paddle_result = F.sparse_attention(paddle_query, paddle_key, paddle_result = F.sparse_attention(
paddle_value, paddle_offset, paddle_query,
paddle_colunmns) paddle_key,
paddle_value,
paddle_offset,
paddle_colunmns,
)
numpy_result, __, __ = ref_batch_sparse_attention( numpy_result, __, __ = ref_batch_sparse_attention(
query, key, value, offset, columns) query, key, value, offset, columns
)
numpy_result = numpy_result.astype(self.dtype) numpy_result = numpy_result.astype(self.dtype)
np.testing.assert_allclose(paddle_result.numpy(), np.testing.assert_allclose(
numpy_result, paddle_result.numpy(), numpy_result, rtol=1e-05, atol=1e-05
rtol=1e-05, )
atol=1e-05)
class TestSparseAttentionAPITestFloat(TestSparseAttentionAPI): class TestSparseAttentionAPITestFloat(TestSparseAttentionAPI):
def setUp(self): def setUp(self):
self.place = paddle.CUDAPlace(0) self.place = paddle.CUDAPlace(0)
self.shape = (2, 2, 8, 4) self.shape = (2, 2, 8, 4)
...@@ -472,7 +488,6 @@ class TestSparseAttentionAPITestFloat(TestSparseAttentionAPI): ...@@ -472,7 +488,6 @@ class TestSparseAttentionAPITestFloat(TestSparseAttentionAPI):
class TestSparseAttentionAPITestShape1(TestSparseAttentionAPI): class TestSparseAttentionAPITestShape1(TestSparseAttentionAPI):
def setUp(self): def setUp(self):
self.place = paddle.CUDAPlace(0) self.place = paddle.CUDAPlace(0)
self.shape = (2, 2, 64, 32) self.shape = (2, 2, 64, 32)
...@@ -482,7 +497,6 @@ class TestSparseAttentionAPITestShape1(TestSparseAttentionAPI): ...@@ -482,7 +497,6 @@ class TestSparseAttentionAPITestShape1(TestSparseAttentionAPI):
class TestSparseAttentionAPITestShape2(TestSparseAttentionAPI): class TestSparseAttentionAPITestShape2(TestSparseAttentionAPI):
def setUp(self): def setUp(self):
self.place = paddle.CUDAPlace(0) self.place = paddle.CUDAPlace(0)
self.shape = (2, 1, 64, 32) self.shape = (2, 1, 64, 32)
...@@ -492,7 +506,6 @@ class TestSparseAttentionAPITestShape2(TestSparseAttentionAPI): ...@@ -492,7 +506,6 @@ class TestSparseAttentionAPITestShape2(TestSparseAttentionAPI):
class TestSparseAttentionAPITestShape3(TestSparseAttentionAPI): class TestSparseAttentionAPITestShape3(TestSparseAttentionAPI):
def setUp(self): def setUp(self):
self.place = paddle.CUDAPlace(0) self.place = paddle.CUDAPlace(0)
self.shape = (4, 4, 128, 32) self.shape = (4, 4, 128, 32)
...@@ -502,7 +515,6 @@ class TestSparseAttentionAPITestShape3(TestSparseAttentionAPI): ...@@ -502,7 +515,6 @@ class TestSparseAttentionAPITestShape3(TestSparseAttentionAPI):
class TestSparseAttentionAPITestShape4(TestSparseAttentionAPI): class TestSparseAttentionAPITestShape4(TestSparseAttentionAPI):
def setUp(self): def setUp(self):
self.place = paddle.CUDAPlace(0) self.place = paddle.CUDAPlace(0)
self.shape = (3, 3, 35, 15) self.shape = (3, 3, 35, 15)
......
...@@ -64,42 +64,50 @@ class TestSparseElementWiseAPI(unittest.TestCase): ...@@ -64,42 +64,50 @@ class TestSparseElementWiseAPI(unittest.TestCase):
csr_y = s_dense_y.to_sparse_csr() csr_y = s_dense_y.to_sparse_csr()
actual_res = get_actual_res(csr_x, csr_y, op) actual_res = get_actual_res(csr_x, csr_y, op)
actual_res.backward(actual_res)
expect_res = op(dense_x, dense_y) expect_res = op(dense_x, dense_y)
expect_res.backward(expect_res) expect_res.backward(expect_res)
np.testing.assert_allclose(expect_res.numpy(), np.testing.assert_allclose(
actual_res.to_dense().numpy(), expect_res.numpy(),
rtol=1e-05, actual_res.to_dense().numpy(),
equal_nan=True) rtol=1e-05,
equal_nan=True,
)
if not (op == __truediv__ and dtype in ['int32', 'int64']): if not (op == __truediv__ and dtype in ['int32', 'int64']):
np.testing.assert_allclose(dense_x.grad.numpy(), actual_res.backward(actual_res)
csr_x.grad.to_dense().numpy(), np.testing.assert_allclose(
rtol=1e-05, dense_x.grad.numpy(),
equal_nan=True) csr_x.grad.to_dense().numpy(),
np.testing.assert_allclose(dense_y.grad.numpy(), rtol=1e-05,
csr_y.grad.to_dense().numpy(), equal_nan=True,
rtol=1e-05, )
equal_nan=True) np.testing.assert_allclose(
dense_y.grad.numpy(),
csr_y.grad.to_dense().numpy(),
rtol=1e-05,
equal_nan=True,
)
def func_test_coo(self, op): def func_test_coo(self, op):
for sparse_dim in range(len(self.coo_shape) - 1, len(self.coo_shape)): for sparse_dim in range(len(self.coo_shape) - 1, len(self.coo_shape)):
for dtype in self.support_dtypes: for dtype in self.support_dtypes:
x = np.random.randint(-255, 255, x = np.random.randint(-255, 255, size=self.coo_shape).astype(
size=self.coo_shape).astype(dtype) dtype
y = np.random.randint(-255, 255, )
size=self.coo_shape).astype(dtype) y = np.random.randint(-255, 255, size=self.coo_shape).astype(
dtype
)
dense_x = paddle.to_tensor(x, dtype=dtype, stop_gradient=False) dense_x = paddle.to_tensor(x, dtype=dtype, stop_gradient=False)
dense_y = paddle.to_tensor(y, dtype=dtype, stop_gradient=False) dense_y = paddle.to_tensor(y, dtype=dtype, stop_gradient=False)
s_dense_x = paddle.to_tensor(x, s_dense_x = paddle.to_tensor(
dtype=dtype, x, dtype=dtype, stop_gradient=False
stop_gradient=False) )
s_dense_y = paddle.to_tensor(y, s_dense_y = paddle.to_tensor(
dtype=dtype, y, dtype=dtype, stop_gradient=False
stop_gradient=False) )
coo_x = s_dense_x.to_sparse_coo(sparse_dim) coo_x = s_dense_x.to_sparse_coo(sparse_dim)
coo_y = s_dense_y.to_sparse_coo(sparse_dim) coo_y = s_dense_y.to_sparse_coo(sparse_dim)
...@@ -109,18 +117,24 @@ class TestSparseElementWiseAPI(unittest.TestCase): ...@@ -109,18 +117,24 @@ class TestSparseElementWiseAPI(unittest.TestCase):
expect_res = op(dense_x, dense_y) expect_res = op(dense_x, dense_y)
expect_res.backward(expect_res) expect_res.backward(expect_res)
np.testing.assert_allclose(expect_res.numpy(), np.testing.assert_allclose(
actual_res.to_dense().numpy(), expect_res.numpy(),
rtol=1e-05, actual_res.to_dense().numpy(),
equal_nan=True) rtol=1e-05,
np.testing.assert_allclose(dense_x.grad.numpy(), equal_nan=True,
coo_x.grad.to_dense().numpy(), )
rtol=1e-05, np.testing.assert_allclose(
equal_nan=True) dense_x.grad.numpy(),
np.testing.assert_allclose(dense_y.grad.numpy(), coo_x.grad.to_dense().numpy(),
coo_y.grad.to_dense().numpy(), rtol=1e-05,
rtol=1e-05, equal_nan=True,
equal_nan=True) )
np.testing.assert_allclose(
dense_y.grad.numpy(),
coo_y.grad.to_dense().numpy(),
rtol=1e-05,
equal_nan=True,
)
def test_support_dtypes_csr(self): def test_support_dtypes_csr(self):
paddle.device.set_device('cpu') paddle.device.set_device('cpu')
...@@ -140,38 +154,37 @@ class TestSparseElementWiseAPI(unittest.TestCase): ...@@ -140,38 +154,37 @@ class TestSparseElementWiseAPI(unittest.TestCase):
values2_data = [[1.0], [2.0]] values2_data = [[1.0], [2.0]]
shape = [2, 4, 2] shape = [2, 4, 2]
sp_a = sparse.sparse_coo_tensor(indices_data, sp_a = sparse.sparse_coo_tensor(
values1_data, indices_data, values1_data, shape, stop_gradient=False
shape, )
stop_gradient=False) sp_b = sparse.sparse_coo_tensor(
sp_b = sparse.sparse_coo_tensor(indices_data, indices_data, values2_data, shape, stop_gradient=False
values2_data, )
shape,
stop_gradient=False)
values1 = paddle.to_tensor(values1_data, stop_gradient=False) values1 = paddle.to_tensor(values1_data, stop_gradient=False)
values2 = paddle.to_tensor(values2_data, stop_gradient=False) values2 = paddle.to_tensor(values2_data, stop_gradient=False)
#c.values() = a.values() + b.values() # c.values() = a.values() + b.values()
sp_c = sparse.add(sp_a, sp_b) sp_c = sparse.add(sp_a, sp_b)
sp_c.backward() sp_c.backward()
ref_c = values1 + values2 ref_c = values1 + values2
ref_c.backward() ref_c.backward()
np.testing.assert_allclose(sp_c.values().numpy(), ref_c.numpy()) np.testing.assert_allclose(sp_c.values().numpy(), ref_c.numpy())
np.testing.assert_allclose(sp_a.grad.values().numpy(), np.testing.assert_allclose(
values1.grad.numpy()) sp_a.grad.values().numpy(), values1.grad.numpy()
np.testing.assert_allclose(sp_b.grad.values().numpy(), )
values2.grad.numpy()) np.testing.assert_allclose(
sp_b.grad.values().numpy(), values2.grad.numpy()
)
def test_add_bias(self): def test_add_bias(self):
indices_data = [[0, 1], [0, 3]] indices_data = [[0, 1], [0, 3]]
values_data = [[1.0, 1.0], [2.0, 2.0]] values_data = [[1.0, 1.0], [2.0, 2.0]]
shape = [2, 4, 2] shape = [2, 4, 2]
sp_a = sparse.sparse_coo_tensor(indices_data, sp_a = sparse.sparse_coo_tensor(
values_data, indices_data, values_data, shape, stop_gradient=False
shape, )
stop_gradient=False)
bias_values = [1.0, 2.0] bias_values = [1.0, 2.0]
...@@ -179,14 +192,15 @@ class TestSparseElementWiseAPI(unittest.TestCase): ...@@ -179,14 +192,15 @@ class TestSparseElementWiseAPI(unittest.TestCase):
values2 = paddle.to_tensor(bias_values, stop_gradient=False) values2 = paddle.to_tensor(bias_values, stop_gradient=False)
values3 = paddle.to_tensor(bias_values, stop_gradient=False) values3 = paddle.to_tensor(bias_values, stop_gradient=False)
#c.values() = a.values() + b # c.values() = a.values() + b
sp_c = sparse.add(sp_a, values2) sp_c = sparse.add(sp_a, values2)
sp_c.backward() sp_c.backward()
ref_c = values1 + values3 ref_c = values1 + values3
ref_c.backward() ref_c.backward()
np.testing.assert_allclose(sp_c.values().numpy(), ref_c.numpy()) np.testing.assert_allclose(sp_c.values().numpy(), ref_c.numpy())
np.testing.assert_allclose(sp_a.grad.values().numpy(), np.testing.assert_allclose(
values1.grad.numpy()) sp_a.grad.values().numpy(), values1.grad.numpy()
)
np.testing.assert_allclose(values2.grad.numpy(), values3.grad.numpy()) np.testing.assert_allclose(values2.grad.numpy(), values3.grad.numpy())
......
...@@ -28,7 +28,6 @@ paddle.enable_static() ...@@ -28,7 +28,6 @@ paddle.enable_static()
# Correct: General. # Correct: General.
class TestSqueezeOp(OpTest): class TestSqueezeOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "squeeze2" self.op_type = "squeeze2"
self.python_api = paddle.squeeze self.python_api = paddle.squeeze
...@@ -40,7 +39,7 @@ class TestSqueezeOp(OpTest): ...@@ -40,7 +39,7 @@ class TestSqueezeOp(OpTest):
self.init_attrs() self.init_attrs()
self.outputs = { self.outputs = {
"Out": self.inputs["X"].reshape(self.new_shape), "Out": self.inputs["X"].reshape(self.new_shape),
"XShape": np.random.random(self.ori_shape).astype("float64") "XShape": np.random.random(self.ori_shape).astype("float64"),
} }
def test_check_output(self): def test_check_output(self):
...@@ -60,7 +59,6 @@ class TestSqueezeOp(OpTest): ...@@ -60,7 +59,6 @@ class TestSqueezeOp(OpTest):
# Correct: There is mins axis. # Correct: There is mins axis.
class TestSqueezeOp1(TestSqueezeOp): class TestSqueezeOp1(TestSqueezeOp):
def init_test_case(self): def init_test_case(self):
self.ori_shape = (1, 20, 1, 5) self.ori_shape = (1, 20, 1, 5)
self.axes = (0, -2) self.axes = (0, -2)
...@@ -69,7 +67,6 @@ class TestSqueezeOp1(TestSqueezeOp): ...@@ -69,7 +67,6 @@ class TestSqueezeOp1(TestSqueezeOp):
# Correct: No axes input. # Correct: No axes input.
class TestSqueezeOp2(TestSqueezeOp): class TestSqueezeOp2(TestSqueezeOp):
def init_test_case(self): def init_test_case(self):
self.ori_shape = (1, 20, 1, 5) self.ori_shape = (1, 20, 1, 5)
self.axes = () self.axes = ()
...@@ -78,7 +75,6 @@ class TestSqueezeOp2(TestSqueezeOp): ...@@ -78,7 +75,6 @@ class TestSqueezeOp2(TestSqueezeOp):
# Correct: Just part of axes be squeezed. # Correct: Just part of axes be squeezed.
class TestSqueezeOp3(TestSqueezeOp): class TestSqueezeOp3(TestSqueezeOp):
def init_test_case(self): def init_test_case(self):
self.ori_shape = (6, 1, 5, 1, 4, 1) self.ori_shape = (6, 1, 5, 1, 4, 1)
self.axes = (1, -1) self.axes = (1, -1)
...@@ -86,7 +82,6 @@ class TestSqueezeOp3(TestSqueezeOp): ...@@ -86,7 +82,6 @@ class TestSqueezeOp3(TestSqueezeOp):
class TestSqueeze2AxesTensor(UnittestBase): class TestSqueeze2AxesTensor(UnittestBase):
def init_info(self): def init_info(self):
self.shapes = [[2, 3, 4]] self.shapes = [[2, 3, 4]]
self.save_path = os.path.join(self.temp_dir.name, 'squeeze_tensor') self.save_path = os.path.join(self.temp_dir.name, 'squeeze_tensor')
...@@ -123,7 +118,6 @@ class TestSqueeze2AxesTensor(UnittestBase): ...@@ -123,7 +118,6 @@ class TestSqueeze2AxesTensor(UnittestBase):
class TestSqueeze2AxesTensorList(UnittestBase): class TestSqueeze2AxesTensorList(UnittestBase):
def init_info(self): def init_info(self):
self.shapes = [[2, 3, 4]] self.shapes = [[2, 3, 4]]
self.save_path = os.path.join(self.temp_dir.name, 'squeeze_tensor') self.save_path = os.path.join(self.temp_dir.name, 'squeeze_tensor')
...@@ -140,7 +134,7 @@ class TestSqueeze2AxesTensorList(UnittestBase): ...@@ -140,7 +134,7 @@ class TestSqueeze2AxesTensorList(UnittestBase):
# axes is a list[Variable] # axes is a list[Variable]
axes = [ axes = [
paddle.full([1], 0, dtype='int32'), paddle.full([1], 0, dtype='int32'),
paddle.full([1], 2, dtype='int32') paddle.full([1], 2, dtype='int32'),
] ]
out = paddle.squeeze(feat, axes) out = paddle.squeeze(feat, axes)
out2 = paddle.fluid.layers.squeeze(feat, axes) out2 = paddle.fluid.layers.squeeze(feat, axes)
...@@ -162,5 +156,37 @@ class TestSqueeze2AxesTensorList(UnittestBase): ...@@ -162,5 +156,37 @@ class TestSqueeze2AxesTensorList(UnittestBase):
self.assertEqual(infer_out.shape, (2, 3, 10)) self.assertEqual(infer_out.shape, (2, 3, 10))
# test api
class TestSqueezeAPI(unittest.TestCase):
def setUp(self):
self.executed_api()
def executed_api(self):
self.squeeze = paddle.squeeze
def test_api(self):
paddle.disable_static()
input_data = np.random.random([3, 2, 1]).astype("float32")
x = paddle.to_tensor(input_data)
out = self.squeeze(x, axis=2)
out.backward()
self.assertEqual(out.shape, [3, 2])
paddle.enable_static()
def test_error(self):
def test_axes_type():
x2 = paddle.static.data(name="x2", shape=[2, 1, 25], dtype="int32")
self.squeeze(x2, axis=2.1)
self.assertRaises(TypeError, test_axes_type)
class TestSqueezeInplaceAPI(TestSqueezeAPI):
def executed_api(self):
self.squeeze = paddle.squeeze_
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -12,16 +12,13 @@ ...@@ -12,16 +12,13 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from __future__ import print_function
import unittest
import re import re
import unittest
import paddle.version as fluid_version import paddle.version as fluid_version
class VersionTest(unittest.TestCase): class VersionTest(unittest.TestCase):
def setUp(self): def setUp(self):
self._major_regex = "[0-9]+" self._major_regex = "[0-9]+"
self._minor_regex = "[0-9]+" self._minor_regex = "[0-9]+"
...@@ -37,15 +34,20 @@ class VersionTest(unittest.TestCase): ...@@ -37,15 +34,20 @@ class VersionTest(unittest.TestCase):
# check version format # check version format
if fluid_version.istaged: if fluid_version.istaged:
self.assertEqual(fluid_version.major, 0)
self.assertEqual(fluid_version.minor, 0)
self.assertEqual(fluid_version.patch, "0")
self.assertEqual(fluid_version.rc, 0)
self.assertEqual(fluid_version.full_version, "0.0.0")
else:
self.assertTrue(re.match(self._major_regex, fluid_version.major)) self.assertTrue(re.match(self._major_regex, fluid_version.major))
self.assertTrue(re.match(self._minor_regex, fluid_version.minor)) self.assertTrue(re.match(self._minor_regex, fluid_version.minor))
self.assertTrue(re.match(self._patch_regex, fluid_version.patch)) self.assertTrue(re.match(self._patch_regex, fluid_version.patch))
self.assertTrue(re.match(self._rc_regex, fluid_version.rc)) self.assertTrue(re.match(self._rc_regex, fluid_version.rc))
self.assertTrue( self.assertTrue(
re.match(self._version_regex, fluid_version.full_version)) re.match(self._version_regex, fluid_version.full_version)
)
else:
self.assertEqual(fluid_version.major, "0")
self.assertEqual(fluid_version.minor, "0")
self.assertEqual(fluid_version.patch, "0")
self.assertEqual(fluid_version.rc, "0")
self.assertEqual(fluid_version.full_version, "0.0.0")
if __name__ == '__main__':
unittest.main()
...@@ -241,13 +241,13 @@ def send_ue_recv( ...@@ -241,13 +241,13 @@ def send_ue_recv(
src_index (Tensor): An 1-D tensor, and the available data type is int32, int64. src_index (Tensor): An 1-D tensor, and the available data type is int32, int64.
dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`. dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`.
The available data type is int32, int64. The available data type is int32, int64.
message_op (str): Different message ops for x and e, including `add`, `sub`, `mul`, `div`. message_op (str, optional): Different message ops for x and e, including `add`, `sub`, `mul`, `div`.
reduce_op (str): Different reduce ops, including `sum`, `mean`, `max`, `min`. reduce_op (str, optional): Different reduce ops, including `sum`, `mean`, `max`, `min`.
Default value is `sum`. Default value is `sum`.
out_size (int|Tensor|None): We can set `out_size` to get necessary output shape. If not set or out_size (int|Tensor, optional): We can set `out_size` to get necessary output shape. If not set or
out_size is smaller or equal to 0, then this input will not be used. out_size is smaller or equal to 0, then this input will not be used.
Otherwise, `out_size` should be equal with or larger than Otherwise, `out_size` should be equal with or larger than
max(dst_index) + 1. max(dst_index) + 1. Default value is `None`.
name (str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
......
...@@ -26,6 +26,7 @@ def reindex_graph( ...@@ -26,6 +26,7 @@ def reindex_graph(
x, neighbors, count, value_buffer=None, index_buffer=None, name=None x, neighbors, count, value_buffer=None, index_buffer=None, name=None
): ):
""" """
Reindex Graph API. Reindex Graph API.
This API is mainly used in Graph Learning domain, which should be used This API is mainly used in Graph Learning domain, which should be used
...@@ -49,12 +50,12 @@ def reindex_graph( ...@@ -49,12 +50,12 @@ def reindex_graph(
should be the same with `x`. should be the same with `x`.
count (Tensor): The neighbor count of the input nodes `x`. And the count (Tensor): The neighbor count of the input nodes `x`. And the
data type should be int32. data type should be int32.
value_buffer (Tensor|None): Value buffer for hashtable. The data type should be int32, value_buffer (Tensor, optional): Value buffer for hashtable. The data type should be int32,
and should be filled with -1. Only useful for gpu version. and should be filled with -1. Only useful for gpu version. Default is None.
index_buffer (Tensor|None): Index buffer for hashtable. The data type should be int32, index_buffer (Tensor, optional): Index buffer for hashtable. The data type should be int32,
and should be filled with -1. Only useful for gpu version. and should be filled with -1. Only useful for gpu version.
`value_buffer` and `index_buffer` should be both not None `value_buffer` and `index_buffer` should be both not None
if you want to speed up by using hashtable buffer. if you want to speed up by using hashtable buffer. Default is None.
name (str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
...@@ -69,6 +70,7 @@ def reindex_graph( ...@@ -69,6 +70,7 @@ def reindex_graph(
.. code-block:: python .. code-block:: python
import paddle import paddle
x = [0, 1, 2] x = [0, 1, 2]
neighbors = [8, 9, 0, 4, 7, 6, 7] neighbors = [8, 9, 0, 4, 7, 6, 7]
count = [2, 3, 2] count = [2, 3, 2]
...@@ -138,6 +140,7 @@ def reindex_heter_graph( ...@@ -138,6 +140,7 @@ def reindex_heter_graph(
x, neighbors, count, value_buffer=None, index_buffer=None, name=None x, neighbors, count, value_buffer=None, index_buffer=None, name=None
): ):
""" """
Reindex HeterGraph API. Reindex HeterGraph API.
This API is mainly used in Graph Learning domain, which should be used This API is mainly used in Graph Learning domain, which should be used
...@@ -161,12 +164,12 @@ def reindex_heter_graph( ...@@ -161,12 +164,12 @@ def reindex_heter_graph(
The data type should be the same with `x`. The data type should be the same with `x`.
count (list|tuple): The neighbor counts of the input nodes `x` from different graphs. count (list|tuple): The neighbor counts of the input nodes `x` from different graphs.
And the data type should be int32. And the data type should be int32.
value_buffer (Tensor|None): Value buffer for hashtable. The data type should be int32, value_buffer (Tensor, optional): Value buffer for hashtable. The data type should be int32,
and should be filled with -1. Only useful for gpu version. and should be filled with -1. Only useful for gpu version. Default is None.
index_buffer (Tensor|None): Index buffer for hashtable. The data type should be int32, index_buffer (Tensor, optional): Index buffer for hashtable. The data type should be int32,
and should be filled with -1. Only useful for gpu version. and should be filled with -1. Only useful for gpu version.
`value_buffer` and `index_buffer` should be both not None `value_buffer` and `index_buffer` should be both not None
if you want to speed up by using hashtable buffer. if you want to speed up by using hashtable buffer. Default is None.
name (str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
...@@ -183,6 +186,7 @@ def reindex_heter_graph( ...@@ -183,6 +186,7 @@ def reindex_heter_graph(
.. code-block:: python .. code-block:: python
import paddle import paddle
x = [0, 1, 2] x = [0, 1, 2]
neighbors_a = [8, 9, 0, 4, 7, 6, 7] neighbors_a = [8, 9, 0, 4, 7, 6, 7]
count_a = [2, 3, 2] count_a = [2, 3, 2]
......
...@@ -32,6 +32,7 @@ def sample_neighbors( ...@@ -32,6 +32,7 @@ def sample_neighbors(
name=None, name=None,
): ):
""" """
Graph Sample Neighbors API. Graph Sample Neighbors API.
This API is mainly used in Graph Learning domain, and the main purpose is to This API is mainly used in Graph Learning domain, and the main purpose is to
...@@ -52,16 +53,16 @@ def sample_neighbors( ...@@ -52,16 +53,16 @@ def sample_neighbors(
The data type should be the same with `row`. The data type should be the same with `row`.
input_nodes (Tensor): The input nodes we need to sample neighbors for, and the input_nodes (Tensor): The input nodes we need to sample neighbors for, and the
data type should be the same with `row`. data type should be the same with `row`.
sample_size (int): The number of neighbors we need to sample. Default value is -1, sample_size (int, optional): The number of neighbors we need to sample. Default value is -1,
which means returning all the neighbors of the input nodes. which means returning all the neighbors of the input nodes.
eids (Tensor): The eid information of the input graph. If return_eids is True, eids (Tensor, optional): The eid information of the input graph. If return_eids is True,
then `eids` should not be None. The data type should be the then `eids` should not be None. The data type should be the
same with `row`. Default is None. same with `row`. Default is None.
return_eids (bool): Whether to return eid information of sample edges. Default is False. return_eids (bool, optional): Whether to return eid information of sample edges. Default is False.
perm_buffer (Tensor): Permutation buffer for fisher-yates sampling. If `use_perm_buffer` perm_buffer (Tensor, optional): Permutation buffer for fisher-yates sampling. If `use_perm_buffer`
is True, then `perm_buffer` should not be None. The data type should is True, then `perm_buffer` should not be None. The data type should
be the same with `row`. If not None, we will use fiser-yates sampling be the same with `row`. If not None, we will use fiser-yates sampling
to speed up. Only useful for gpu version. to speed up. Only useful for gpu version. Default is None.
name (str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
...@@ -69,15 +70,16 @@ def sample_neighbors( ...@@ -69,15 +70,16 @@ def sample_neighbors(
- out_neighbors (Tensor), the sample neighbors of the input nodes. - out_neighbors (Tensor), the sample neighbors of the input nodes.
- out_count (Tensor), the number of sampling neighbors of each input node, and the shape - out_count (Tensor), the number of sampling neighbors of each input node, and the shape
should be the same with `input_nodes`. should be the same with `input_nodes`.
- out_eids (Tensor), if `return_eids` is True, we will return the eid information of the - out_eids (Tensor), if `return_eids` is True, we will return the eid information of the
sample edges. sample edges.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
# edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4), # edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4),
# (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8) # (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8)
row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7] row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7]
......
...@@ -69,8 +69,9 @@ def to_list(value): ...@@ -69,8 +69,9 @@ def to_list(value):
def to_numpy(var): def to_numpy(var):
assert isinstance(var, (Variable, fluid.core.VarBase, assert isinstance(
fluid.core.eager.Tensor)), "not a variable" var, (Variable, fluid.core.VarBase, fluid.core.eager.Tensor)
), "not a variable"
if isinstance(var, (fluid.core.VarBase, fluid.core.eager.Tensor)): if isinstance(var, (fluid.core.VarBase, fluid.core.eager.Tensor)):
return var.numpy() return var.numpy()
t = global_scope().find_var(var.name).get_tensor() t = global_scope().find_var(var.name).get_tensor()
...@@ -105,10 +106,9 @@ def extract_args(func): ...@@ -105,10 +106,9 @@ def extract_args(func):
def _all_gather(x, nranks, ring_id=0, use_calc_stream=True): def _all_gather(x, nranks, ring_id=0, use_calc_stream=True):
return collective._c_allgather(x, return collective._c_allgather(
nranks, x, nranks, ring_id=ring_id, use_calc_stream=use_calc_stream
ring_id=ring_id, )
use_calc_stream=use_calc_stream)
def wait_server_ready(endpoints): def wait_server_ready(endpoints):
...@@ -119,7 +119,8 @@ def wait_server_ready(endpoints): ...@@ -119,7 +119,8 @@ def wait_server_ready(endpoints):
for ep in endpoints: for ep in endpoints:
ip_port = ep.split(":") ip_port = ep.split(":")
with contextlib.closing( with contextlib.closing(
socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: socket.socket(socket.AF_INET, socket.SOCK_STREAM)
) as sock:
sock.settimeout(2) sock.settimeout(2)
result = sock.connect_ex((ip_port[0], int(ip_port[1]))) result = sock.connect_ex((ip_port[0], int(ip_port[1])))
if result != 0: if result != 0:
...@@ -131,8 +132,9 @@ def wait_server_ready(endpoints): ...@@ -131,8 +132,9 @@ def wait_server_ready(endpoints):
break break
def init_communicator(program, rank, nranks, wait_port, current_endpoint, def init_communicator(
endpoints): program, rank, nranks, wait_port, current_endpoint, endpoints
):
if nranks < 2: if nranks < 2:
return return
other_endpoints = endpoints[:] other_endpoints = endpoints[:]
...@@ -144,53 +146,66 @@ def init_communicator(program, rank, nranks, wait_port, current_endpoint, ...@@ -144,53 +146,66 @@ def init_communicator(program, rank, nranks, wait_port, current_endpoint,
nccl_id_var = block.create_var( nccl_id_var = block.create_var(
name=fluid.unique_name.generate('nccl_id'), name=fluid.unique_name.generate('nccl_id'),
persistable=True, persistable=True,
type=fluid.core.VarDesc.VarType.RAW) type=fluid.core.VarDesc.VarType.RAW,
)
block.append_op(type='c_gen_nccl_id',
inputs={}, block.append_op(
outputs={'Out': nccl_id_var}, type='c_gen_nccl_id',
attrs={ inputs={},
'rank': rank, outputs={'Out': nccl_id_var},
'endpoint': current_endpoint, attrs={
'other_endpoints': other_endpoints 'rank': rank,
}) 'endpoint': current_endpoint,
'other_endpoints': other_endpoints,
block.append_op(type='c_comm_init', },
inputs={'X': nccl_id_var}, )
outputs={},
attrs={ block.append_op(
'nranks': nranks, type='c_comm_init',
'rank': rank, inputs={'X': nccl_id_var},
'ring_id': 0, outputs={},
}) attrs={
'nranks': nranks,
'rank': rank,
'ring_id': 0,
},
)
elif core.is_compiled_with_npu(): elif core.is_compiled_with_npu():
hccl_id_var = block.create_var( hccl_id_var = block.create_var(
name=fluid.unique_name.generate('hccl_id'), name=fluid.unique_name.generate('hccl_id'),
persistable=True, persistable=True,
type=core.VarDesc.VarType.RAW) type=core.VarDesc.VarType.RAW,
block.append_op(type='c_gen_hccl_id', )
inputs={}, block.append_op(
outputs={'Out': hccl_id_var}, type='c_gen_hccl_id',
attrs={ inputs={},
'rank': rank, outputs={'Out': hccl_id_var},
'endpoint': current_endpoint, attrs={
'other_endpoints': other_endpoints 'rank': rank,
}) 'endpoint': current_endpoint,
block.append_op(type='c_comm_init_hccl', 'other_endpoints': other_endpoints,
inputs={'X': hccl_id_var}, },
outputs={}, )
attrs={ block.append_op(
'rank': rank, type='c_comm_init_hccl',
'ring_id': 0, inputs={'X': hccl_id_var},
'device_id': int(os.getenv("FLAGS_selected_npus")), outputs={},
'rank_ids': nranks attrs={
}) 'rank': rank,
'ring_id': 0,
'device_id': int(os.getenv("FLAGS_selected_npus")),
'rank_ids': nranks,
},
)
def prepare_distributed_context(place=None): def prepare_distributed_context(place=None):
if place is None: if place is None:
place = fluid.CUDAPlace(ParallelEnv().dev_id) if ParallelEnv().nranks > 1 \ place = (
fluid.CUDAPlace(ParallelEnv().dev_id)
if ParallelEnv().nranks > 1
else fluid.CUDAPlace(0) else fluid.CUDAPlace(0)
)
place = _get_paddle_place(place) place = _get_paddle_place(place)
strategy = fluid.dygraph.parallel.ParallelStrategy() strategy = fluid.dygraph.parallel.ParallelStrategy()
...@@ -208,9 +223,14 @@ def prepare_distributed_context(place=None): ...@@ -208,9 +223,14 @@ def prepare_distributed_context(place=None):
def _init_context(): def _init_context():
communicator_prog = fluid.Program() communicator_prog = fluid.Program()
init_communicator(communicator_prog, strategy.local_rank, init_communicator(
strategy.nranks, True, strategy.current_endpoint, communicator_prog,
strategy.trainer_endpoints) strategy.local_rank,
strategy.nranks,
True,
strategy.current_endpoint,
strategy.trainer_endpoints,
)
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(communicator_prog) exe.run(communicator_prog)
...@@ -220,7 +240,7 @@ def prepare_distributed_context(place=None): ...@@ -220,7 +240,7 @@ def prepare_distributed_context(place=None):
fluid.enable_dygraph(place) fluid.enable_dygraph(place)
else: else:
assert ("Only support CUDAPlace for now.") assert "Only support CUDAPlace for now."
_parallel_context_initialized = True _parallel_context_initialized = True
return strategy return strategy
...@@ -246,7 +266,9 @@ def _update_input_info(inputs): ...@@ -246,7 +266,9 @@ def _update_input_info(inputs):
class StaticGraphAdapter(object): class StaticGraphAdapter(object):
""" """
Model traning/inference with a static graph. Model traning/inference with a static graph.
""" """
def __init__(self, model): def __init__(self, model):
...@@ -269,7 +291,7 @@ class StaticGraphAdapter(object): ...@@ -269,7 +291,7 @@ class StaticGraphAdapter(object):
'eval_total': 0, 'eval_total': 0,
'test_total': 0, 'test_total': 0,
'eval_batch': 0, 'eval_batch': 0,
'test_batch': 0 'test_batch': 0,
} }
self._nranks = ParallelEnv().nranks self._nranks = ParallelEnv().nranks
...@@ -289,10 +311,13 @@ class StaticGraphAdapter(object): ...@@ -289,10 +311,13 @@ class StaticGraphAdapter(object):
self.model.mode = value self.model.mode = value
def train_batch(self, inputs, labels=None, update=True): def train_batch(self, inputs, labels=None, update=True):
assert self.model._optimizer, \ assert (
"model not ready, please call `model.prepare()` first" self.model._optimizer
), "model not ready, please call `model.prepare()` first"
self.mode = 'train' self.mode = 'train'
assert update is True, "Does not support `update == False` in static mode by now." assert (
update is True
), "Does not support `update == False` in static mode by now."
return self._run(inputs, labels) return self._run(inputs, labels)
def eval_batch(self, inputs, labels=None): def eval_batch(self, inputs, labels=None):
...@@ -307,7 +332,6 @@ class StaticGraphAdapter(object): ...@@ -307,7 +332,6 @@ class StaticGraphAdapter(object):
return self.model.network.parameters(*args, **kwargs) return self.model.network.parameters(*args, **kwargs)
def save(self, path): def save(self, path):
def _save(state, path): def _save(state, path):
if not state: if not state:
return return
...@@ -331,8 +355,7 @@ class StaticGraphAdapter(object): ...@@ -331,8 +355,7 @@ class StaticGraphAdapter(object):
# XXX `optimizer.state_dict()` only work in dygraph mode # XXX `optimizer.state_dict()` only work in dygraph mode
optim_path = path + ".pdopt" optim_path = path + ".pdopt"
optim = { optim = {
p.name: p p.name: p for p in filter(is_belong_to_optimizer, prog.list_vars())
for p in filter(is_belong_to_optimizer, prog.list_vars())
} }
if not optim: if not optim:
return return
...@@ -348,8 +371,10 @@ class StaticGraphAdapter(object): ...@@ -348,8 +371,10 @@ class StaticGraphAdapter(object):
# restore parameter states # restore parameter states
fluid.core._create_loaded_parameter( fluid.core._create_loaded_parameter(
[param for param, state in param_state_pairs], global_scope(), [param for param, state in param_state_pairs],
executor) global_scope(),
executor,
)
for param, state in param_state_pairs: for param, state in param_state_pairs:
self._set_var(param, state) self._set_var(param, state)
...@@ -377,9 +402,10 @@ class StaticGraphAdapter(object): ...@@ -377,9 +402,10 @@ class StaticGraphAdapter(object):
# static-graph, since the time of global_step to increase is # static-graph, since the time of global_step to increase is
# different. # different.
state_val = ( state_val = (
np.array(converted_state.pop("global_step")) - 1 (np.array(converted_state.pop("global_step")) - 1)
) if "global_step" in converted_state else converted_state.pop( if "global_step" in converted_state
"@LR_DECAY_COUNTER@", None) else converted_state.pop("@LR_DECAY_COUNTER@", None)
)
if state_val is not None: if state_val is not None:
converted_state[var.name] = state_val converted_state[var.name] = state_val
elif var.name.startswith("learning_rate_"): elif var.name.startswith("learning_rate_"):
...@@ -396,36 +422,61 @@ class StaticGraphAdapter(object): ...@@ -396,36 +422,61 @@ class StaticGraphAdapter(object):
opt_cls_name = self.model._optimizer.__class__.__name__ opt_cls_name = self.model._optimizer.__class__.__name__
opt_unq_name = None opt_unq_name = None
for name in self.model._optimizer._accumulators.keys(): for name in self.model._optimizer._accumulators.keys():
accum_name = name if opt_name is None else name[ accum_name = (
len(opt_name) + 1:] name
for param_name, state_var in self.model._optimizer._accumulators[ if opt_name is None
name].items(): else name[len(opt_name) + 1 :]
)
for (
param_name,
state_var,
) in self.model._optimizer._accumulators[name].items():
if opt_unq_name is None: if opt_unq_name is None:
# can not infer out the exact unique(opt_name), # can not infer out the exact unique(opt_name),
# thus try to extract rather than generate # thus try to extract rather than generate
for state_key in sorted(state.keys(), for state_key in sorted(
key=lambda x: len(x), state.keys(),
reverse=True): key=lambda x: len(x),
prefix = param_name + "_" + ( reverse=True,
opt_cls_name ):
if opt_name is None else opt_name) + "_" prefix = (
param_name
+ "_"
+ (
opt_cls_name
if opt_name is None
else opt_name
)
+ "_"
)
if state_key.startswith(prefix): if state_key.startswith(prefix):
prefix_offset = state_key[len( prefix_offset = state_key[
prefix):].find("_") + len(prefix) len(prefix) :
].find("_") + len(prefix)
opt_unq_name = state_key[ opt_unq_name = state_key[
len(param_name + "_"):prefix_offset] len(
param_name + "_"
) : prefix_offset
]
# TODO: assert # TODO: assert
# assert opt_unq_name is None # assert opt_unq_name is None
# gen(param.name + "_" + gen(opt_name) + "_" + accum_name) # gen(param.name + "_" + gen(opt_name) + "_" + accum_name)
# always end with "_0" since the unique optimizer._name # always end with "_0" since the unique optimizer._name
dy_state_name = (param_name + "_" + opt_unq_name + dy_state_name = (
"_" + accum_name + "_0") param_name
+ "_"
+ opt_unq_name
+ "_"
+ accum_name
+ "_0"
)
converted_state[ converted_state[
state_var.name] = converted_state.pop( state_var.name
dy_state_name) ] = converted_state.pop(dy_state_name)
assert var.name in converted_state, \ assert (
"variable [{}] is not in optimizer state file".format(var.name) var.name in converted_state
), "variable [{}] is not in optimizer state file".format(var.name)
self._set_var(var, converted_state[var.name]) self._set_var(var, converted_state[var.name])
def _set_var(self, var, ndarray): def _set_var(self, var, ndarray):
...@@ -444,15 +495,17 @@ class StaticGraphAdapter(object): ...@@ -444,15 +495,17 @@ class StaticGraphAdapter(object):
def _run(self, inputs, labels=None): def _run(self, inputs, labels=None):
compiled_prog = self._compiled_progs.get(self.mode, None) compiled_prog = self._compiled_progs.get(self.mode, None)
assert compiled_prog, \ assert (
"Model is not ready, please call `model.prepare()` first" compiled_prog
), "Model is not ready, please call `model.prepare()` first"
inputs = to_list(inputs) inputs = to_list(inputs)
if labels is not None: if labels is not None:
labels = to_list(labels) labels = to_list(labels)
assert len(inputs) == len(self._input_vars[self.mode]), \ assert len(inputs) == len(self._input_vars[self.mode]), (
"number of inputs" \ "number of inputs"
+ " does not match number of arguments of `forward` method" + " does not match number of arguments of `forward` method"
)
feed = {} feed = {}
input_names = [v.name for v in self._input_vars[self.mode]] input_names = [v.name for v in self._input_vars[self.mode]]
...@@ -462,8 +515,10 @@ class StaticGraphAdapter(object): ...@@ -462,8 +515,10 @@ class StaticGraphAdapter(object):
# train and test may take different arguments # train and test may take different arguments
if inputs[idx] is not None: if inputs[idx] is not None:
feed[n] = inputs[idx] feed[n] = inputs[idx]
if self._amp_level == 'O2' and input_dtypes[ if (
idx] == core.VarDesc.VarType.FP16: self._amp_level == 'O2'
and input_dtypes[idx] == core.VarDesc.VarType.FP16
):
if isinstance(feed[n], core.LoDTensor): if isinstance(feed[n], core.LoDTensor):
feed[n] = feed[n]._as_type(core.VarDesc.VarType.FP16) feed[n] = feed[n]._as_type(core.VarDesc.VarType.FP16)
elif isinstance(feed[n], np.array): elif isinstance(feed[n], np.array):
...@@ -491,10 +546,12 @@ class StaticGraphAdapter(object): ...@@ -491,10 +546,12 @@ class StaticGraphAdapter(object):
else: else:
pruned_fetch_list.append(fetch_var) pruned_fetch_list.append(fetch_var)
rets = self._executor.run(compiled_prog, rets = self._executor.run(
feed=feed, compiled_prog,
fetch_list=pruned_fetch_list, feed=feed,
return_numpy=False) fetch_list=pruned_fetch_list,
return_numpy=False,
)
# restore pruned fetch_list Variable from feeds # restore pruned fetch_list Variable from feeds
for i, name in enumerate(pruned_fetch_idx_name_map): for i, name in enumerate(pruned_fetch_idx_name_map):
...@@ -510,20 +567,24 @@ class StaticGraphAdapter(object): ...@@ -510,20 +567,24 @@ class StaticGraphAdapter(object):
metrics = [] metrics = []
for metric, state in zip(self.model._metrics, metric_states): for metric, state in zip(self.model._metrics, metric_states):
# cut off padding size # cut off padding size
if self.mode != 'train' and self.model._test_dataloader is not None \ if (
and isinstance(self.model._test_dataloader, DataLoader) \ self.mode != 'train'
and self._nranks > 1: and self.model._test_dataloader is not None
and isinstance(self.model._test_dataloader, DataLoader)
and self._nranks > 1
):
total_size = len(self.model._test_dataloader.dataset) total_size = len(self.model._test_dataloader.dataset)
# TODO: fixme if have better way to get batch size # TODO: fixme if have better way to get batch size
samples = state[0].shape[0] samples = state[0].shape[0]
current_count = self._merge_count.get(self.mode + '_total', 0) current_count = self._merge_count.get(self.mode + '_total', 0)
if current_count + samples >= total_size: if current_count + samples >= total_size:
state = [ state = [
s[:int(total_size - current_count), ...] for s in state s[: int(total_size - current_count), ...] for s in state
] ]
self._merge_count[self.mode + '_total'] = 0 self._merge_count[self.mode + '_total'] = 0
self._merge_count[self.mode + '_batch'] = int(total_size - self._merge_count[self.mode + '_batch'] = int(
current_count) total_size - current_count
)
else: else:
self._merge_count[self.mode + '_total'] += samples self._merge_count[self.mode + '_total'] += samples
self._merge_count[self.mode + '_batch'] = samples self._merge_count[self.mode + '_batch'] = samples
...@@ -555,8 +616,11 @@ class StaticGraphAdapter(object): ...@@ -555,8 +616,11 @@ class StaticGraphAdapter(object):
if mode != 'train': if mode != 'train':
for op in list(prog.global_block().ops): for op in list(prog.global_block().ops):
prog.global_block()._remove_op(0) prog.global_block()._remove_op(0)
if mode == 'train' and self.model._optimizer \ if (
and self.model._optimizer._learning_rate_map: mode == 'train'
and self.model._optimizer
and self.model._optimizer._learning_rate_map
):
# HACK workaround learning rate map issue # HACK workaround learning rate map issue
lr_var = self.model._optimizer._learning_rate_map[self._orig_prog] lr_var = self.model._optimizer._learning_rate_map[self._orig_prog]
new_lr_var = prog.global_block().vars[lr_var.name] new_lr_var = prog.global_block().vars[lr_var.name]
...@@ -594,20 +658,27 @@ class StaticGraphAdapter(object): ...@@ -594,20 +658,27 @@ class StaticGraphAdapter(object):
dist_strategy.amp = True dist_strategy.amp = True
dist_strategy.amp_configs = self._amp_configs.copy() dist_strategy.amp_configs = self._amp_configs.copy()
dist_strategy.amp_configs.update(self._amp_custom_lists) dist_strategy.amp_configs.update(self._amp_custom_lists)
dist_strategy.amp_configs[ dist_strategy.amp_configs['use_pure_fp16'] = (
'use_pure_fp16'] = self._amp_level == 'O2' self._amp_level == 'O2'
)
self.model._optimizer = fleet.distributed_optimizer( self.model._optimizer = fleet.distributed_optimizer(
self.model._optimizer, strategy=dist_strategy) self.model._optimizer, strategy=dist_strategy
)
elif self._amp_level != "O0" and core.is_compiled_with_cuda: elif self._amp_level != "O0" and core.is_compiled_with_cuda:
amp_lists = paddle.static.amp.AutoMixedPrecisionLists( amp_lists = (
**self._amp_custom_lists paddle.static.amp.AutoMixedPrecisionLists(
) if self._amp_custom_lists else None **self._amp_custom_lists
)
if self._amp_custom_lists
else None
)
self.model._optimizer = paddle.static.amp.decorate( self.model._optimizer = paddle.static.amp.decorate(
self.model._optimizer, self.model._optimizer,
amp_lists=amp_lists, amp_lists=amp_lists,
use_pure_fp16=self._amp_level == "O2", use_pure_fp16=self._amp_level == "O2",
use_fp16_guard=self._use_fp16_guard, use_fp16_guard=self._use_fp16_guard,
**self._amp_configs) **self._amp_configs
)
self.model._optimizer.minimize(self._loss_endpoint) self.model._optimizer.minimize(self._loss_endpoint)
...@@ -620,7 +691,7 @@ class StaticGraphAdapter(object): ...@@ -620,7 +691,7 @@ class StaticGraphAdapter(object):
self._endpoints[mode] = { self._endpoints[mode] = {
"output": outputs, "output": outputs,
"loss": to_list(losses), "loss": to_list(losses),
"metric": metrics "metric": metrics,
} }
def _compile_and_initialize(self, prog, mode): def _compile_and_initialize(self, prog, mode):
...@@ -628,8 +699,9 @@ class StaticGraphAdapter(object): ...@@ -628,8 +699,9 @@ class StaticGraphAdapter(object):
if compiled_prog is not None: if compiled_prog is not None:
return compiled_prog return compiled_prog
assert self.model._place is not None, \ assert (
"device is not set, please call `model.prepare()` first" self.model._place is not None
), "device is not set, please call `model.prepare()` first"
place = self.model._place place = self.model._place
...@@ -642,8 +714,11 @@ class StaticGraphAdapter(object): ...@@ -642,8 +714,11 @@ class StaticGraphAdapter(object):
uninitialized = [] uninitialized = []
for var_py in self._startup_prog.list_vars(): for var_py in self._startup_prog.list_vars():
var = fluid.global_scope().find_var(var_py.name) var = fluid.global_scope().find_var(var_py.name)
if not var_py.name.startswith('nccl_id') and var and \ if (
var.get_tensor()._is_initialized(): not var_py.name.startswith('nccl_id')
and var
and var.get_tensor()._is_initialized()
):
continue continue
uninitialized.append(var_py) uninitialized.append(var_py)
...@@ -651,7 +726,10 @@ class StaticGraphAdapter(object): ...@@ -651,7 +726,10 @@ class StaticGraphAdapter(object):
startup_prog = self._startup_prog._prune(uninitialized) startup_prog = self._startup_prog._prune(uninitialized)
self._executor.run(startup_prog) self._executor.run(startup_prog)
if self._amp_level == "O2" and mode == 'train' and core.is_compiled_with_cuda( if (
self._amp_level == "O2"
and mode == 'train'
and core.is_compiled_with_cuda()
): ):
self.model._optimizer.amp_init(place) self.model._optimizer.amp_init(place)
...@@ -664,7 +742,6 @@ class StaticGraphAdapter(object): ...@@ -664,7 +742,6 @@ class StaticGraphAdapter(object):
class DynamicGraphAdapter(object): class DynamicGraphAdapter(object):
def __init__(self, model): def __init__(self, model):
super(DynamicGraphAdapter, self).__init__() super(DynamicGraphAdapter, self).__init__()
self.model = model self.model = model
...@@ -674,7 +751,7 @@ class DynamicGraphAdapter(object): ...@@ -674,7 +751,7 @@ class DynamicGraphAdapter(object):
'eval_total': 0, 'eval_total': 0,
'test_total': 0, 'test_total': 0,
'eval_batch': 0, 'eval_batch': 0,
'test_batch': 0 'test_batch': 0,
} }
self._input_info = None self._input_info = None
...@@ -691,7 +768,8 @@ class DynamicGraphAdapter(object): ...@@ -691,7 +768,8 @@ class DynamicGraphAdapter(object):
stradegy.trainer_endpoints = ParallelEnv().trainer_endpoints stradegy.trainer_endpoints = ParallelEnv().trainer_endpoints
stradegy.current_endpoint = ParallelEnv().current_endpoint stradegy.current_endpoint = ParallelEnv().current_endpoint
self.ddp_model = fluid.dygraph.parallel.DataParallel( self.ddp_model = fluid.dygraph.parallel.DataParallel(
self.model.network, stradegy) self.model.network, stradegy
)
@property @property
def mode(self): def mode(self):
...@@ -703,8 +781,9 @@ class DynamicGraphAdapter(object): ...@@ -703,8 +781,9 @@ class DynamicGraphAdapter(object):
# TODO multi device in dygraph mode not implemented at present time # TODO multi device in dygraph mode not implemented at present time
def train_batch(self, inputs, labels=None, update=True): def train_batch(self, inputs, labels=None, update=True):
assert self.model._optimizer, \ assert (
"model not ready, please call `model.prepare()` first" self.model._optimizer
), "model not ready, please call `model.prepare()` first"
self.model.network.train() self.model.network.train()
self.mode = 'train' self.mode = 'train'
inputs = to_list(inputs) inputs = to_list(inputs)
...@@ -716,9 +795,11 @@ class DynamicGraphAdapter(object): ...@@ -716,9 +795,11 @@ class DynamicGraphAdapter(object):
if self._amp_level != "O0" and self.model._scaler is None: if self._amp_level != "O0" and self.model._scaler is None:
self.model._scaler = paddle.amp.GradScaler(**self._amp_configs) self.model._scaler = paddle.amp.GradScaler(**self._amp_configs)
with paddle.amp.auto_cast(enable=self._amp_level != 'O0', with paddle.amp.auto_cast(
**self._amp_custom_lists, enable=self._amp_level != 'O0',
level=self._amp_level): **self._amp_custom_lists,
level=self._amp_level
):
if self._nranks > 1: if self._nranks > 1:
outputs = self.ddp_model(*[to_variable(x) for x in inputs]) outputs = self.ddp_model(*[to_variable(x) for x in inputs])
else: else:
...@@ -746,8 +827,11 @@ class DynamicGraphAdapter(object): ...@@ -746,8 +827,11 @@ class DynamicGraphAdapter(object):
m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)]) m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)])
metrics.append(m) metrics.append(m)
return ([to_numpy(l) for l in losses], metrics) \ return (
if len(metrics) > 0 else [to_numpy(l) for l in losses] ([to_numpy(l) for l in losses], metrics)
if len(metrics) > 0
else [to_numpy(l) for l in losses]
)
def eval_batch(self, inputs, labels=None): def eval_batch(self, inputs, labels=None):
self.model.network.eval() self.model.network.eval()
...@@ -777,21 +861,25 @@ class DynamicGraphAdapter(object): ...@@ -777,21 +861,25 @@ class DynamicGraphAdapter(object):
metrics = [] metrics = []
for metric in self.model._metrics: for metric in self.model._metrics:
# cut off padding value. # cut off padding value.
if self.model._test_dataloader is not None and self._nranks > 1 \ if (
and isinstance(self.model._test_dataloader, DataLoader): self.model._test_dataloader is not None
and self._nranks > 1
and isinstance(self.model._test_dataloader, DataLoader)
):
total_size = len(self.model._test_dataloader.dataset) total_size = len(self.model._test_dataloader.dataset)
samples = outputs[0].shape[0] samples = outputs[0].shape[0]
current_count = self._merge_count.get(self.mode + '_total', 0) current_count = self._merge_count.get(self.mode + '_total', 0)
if current_count + samples >= total_size: if current_count + samples >= total_size:
outputs = [ outputs = [
o[:int(total_size - current_count)] for o in outputs o[: int(total_size - current_count)] for o in outputs
] ]
labels = [ labels = [
l[:int(total_size - current_count)] for l in labels l[: int(total_size - current_count)] for l in labels
] ]
self._merge_count[self.mode + '_total'] = 0 self._merge_count[self.mode + '_total'] = 0
self._merge_count[self.mode + '_batch'] = int(total_size - self._merge_count[self.mode + '_batch'] = int(
current_count) total_size - current_count
)
else: else:
self._merge_count[self.mode + '_total'] += samples self._merge_count[self.mode + '_total'] += samples
self._merge_count[self.mode + '_batch'] = samples self._merge_count[self.mode + '_batch'] = samples
...@@ -858,38 +946,48 @@ class DynamicGraphAdapter(object): ...@@ -858,38 +946,48 @@ class DynamicGraphAdapter(object):
opt_unq_name = '' opt_unq_name = ''
opt_cls_name = self.model._optimizer.__class__.__name__ opt_cls_name = self.model._optimizer.__class__.__name__
opt_name = opt_unq_name[:opt_unq_name.rfind("_")] # remove suffix idx opt_name = opt_unq_name[: opt_unq_name.rfind("_")] # remove suffix idx
param_names = [param.name for param in self.model.network.parameters()] param_names = [param.name for param in self.model.network.parameters()]
for var_name, state_var in sorted(optim_state.items(), for var_name, state_var in sorted(
key=lambda x: len(x[0]), optim_state.items(), key=lambda x: len(x[0]), reverse=True
reverse=True): ):
if var_name in ["@LR_DECAY_COUNTER@", "global_step"]: if var_name in ["@LR_DECAY_COUNTER@", "global_step"]:
# NOTE: dygraph saved global_step is 1 larger than that in # NOTE: dygraph saved global_step is 1 larger than that in
# static-graph, since the time of global_step to increase is # static-graph, since the time of global_step to increase is
# different. # different.
if var_name == "@LR_DECAY_COUNTER@": if var_name == "@LR_DECAY_COUNTER@":
converted_state["global_step"] = np.array( converted_state["global_step"] = (
converted_state.pop("@LR_DECAY_COUNTER@")) + 1 np.array(converted_state.pop("@LR_DECAY_COUNTER@")) + 1
)
else: else:
# moment and other accumulators # moment and other accumulators
# extend state dict to include promising dygraph names # extend state dict to include promising dygraph names
for param_name in param_names: for param_name in param_names:
if var_name.startswith(param_name + "_" + opt_name): if var_name.startswith(param_name + "_" + opt_name):
# when init optimizer with name # when init optimizer with name
accum_name = var_name[len(param_name + "_" + opt_name + accum_name = var_name[
"_"):] len(param_name + "_" + opt_name + "_") :
elif var_name.startswith(param_name + ]
"_") and opt_name == opt_cls_name: elif (
var_name.startswith(param_name + "_")
and opt_name == opt_cls_name
):
# when init optimizer without name # when init optimizer without name
accum_name = var_name[len(param_name + "_"):] accum_name = var_name[len(param_name + "_") :]
else: else:
continue continue
# remove suffix idx # remove suffix idx
accum_name = accum_name[:accum_name.rfind("_")] accum_name = accum_name[: accum_name.rfind("_")]
# state names always end with "_0" in dygraph because of the # state names always end with "_0" in dygraph because of the
# unique optimizer._name # unique optimizer._name
dy_state_name = (param_name + "_" + opt_unq_name + "_" + dy_state_name = (
accum_name + "_0") param_name
+ "_"
+ opt_unq_name
+ "_"
+ accum_name
+ "_0"
)
converted_state[dy_state_name] = state_var converted_state[dy_state_name] = state_var
if not hasattr(self.model._optimizer, 'set_state_dict'): if not hasattr(self.model._optimizer, 'set_state_dict'):
...@@ -901,18 +999,23 @@ class DynamicGraphAdapter(object): ...@@ -901,18 +999,23 @@ class DynamicGraphAdapter(object):
self.model._optimizer.set_state_dict(converted_state) self.model._optimizer.set_state_dict(converted_state)
def prepare(self): def prepare(self):
if self._amp_level == "O2" and self.model.mode == 'train' and core.is_compiled_with_cuda( if (
self._amp_level == "O2"
and self.model.mode == 'train'
and core.is_compiled_with_cuda()
): ):
self.model.network, self.model._optimizer = paddle.amp.decorate( self.model.network, self.model._optimizer = paddle.amp.decorate(
models=self.model.network, models=self.model.network,
optimizers=self.model._optimizer, optimizers=self.model._optimizer,
level='O2') level='O2',
)
if self._amp_level != "O0": if self._amp_level != "O0":
self.model._scaler = None self.model._scaler = None
class Model(object): class Model(object):
""" """
An Model object is network with training and inference features. An Model object is network with training and inference features.
Dynamic graph and static graph are supported at the same time, Dynamic graph and static graph are supported at the same time,
switched by `paddle.enable_static()`. The usage is as follows. switched by `paddle.enable_static()`. The usage is as follows.
...@@ -920,7 +1023,7 @@ class Model(object): ...@@ -920,7 +1023,7 @@ class Model(object):
instantiating a Model. The input description, i.e, paddle.static.InputSpec, instantiating a Model. The input description, i.e, paddle.static.InputSpec,
must be required for static graph. must be required for static graph.
When training on GPU, auto mixed precision (AMP O1) and pure float16 When training on GPU, auto mixed precision (AMP O1) and pure float16
(AMP O2) training are both supported in static mode and dynamic mode. (AMP O2) training are both supported in static mode and dynamic mode.
In static graph mode, before training with pure float16 (AMP O2), In static graph mode, before training with pure float16 (AMP O2),
`multi_precision` could be set to True when creating optimizer, which can `multi_precision` could be set to True when creating optimizer, which can
...@@ -965,7 +1068,7 @@ class Model(object): ...@@ -965,7 +1068,7 @@ class Model(object):
# inputs and labels are not required for dynamic graph. # inputs and labels are not required for dynamic graph.
input = InputSpec([None, 784], 'float32', 'x') input = InputSpec([None, 784], 'float32', 'x')
label = InputSpec([None, 1], 'int64', 'label') label = InputSpec([None, 1], 'int64', 'label')
model = paddle.Model(net, input, label) model = paddle.Model(net, input, label)
optim = paddle.optimizer.SGD(learning_rate=1e-3, optim = paddle.optimizer.SGD(learning_rate=1e-3,
parameters=model.parameters()) parameters=model.parameters())
...@@ -1053,16 +1156,17 @@ class Model(object): ...@@ -1053,16 +1156,17 @@ class Model(object):
def train_batch(self, inputs, labels=None, update=True): def train_batch(self, inputs, labels=None, update=True):
""" """
Run one training step on one batch of data. And using `update` indicates Run one training step on one batch of data. And using `update` indicates
whether optimizer update gradients computing by this batch. whether optimizer update gradients computing by this batch.
Args: Args:
inputs (numpy.ndarray|Tensor|list): Batch of input data. It could inputs (numpy.ndarray|Tensor|list): Batch of input data. It could
be a numpy array or paddle.Tensor, or a list of arrays or be a numpy array or paddle.Tensor, or a list of arrays or
tensors (in case the model has multiple inputs). tensors (in case the model has multiple inputs).
labels (numpy.ndarray|Tensor|list, optional): Batch of labels. It could be labels (numpy.ndarray|Tensor|list, optional): Batch of labels. It could be
a numpy array or paddle.Tensor, or a list of arrays or tensors a numpy array or paddle.Tensor, or a list of arrays or tensors
(in case the model has multiple labels). If has no labels, (in case the model has multiple labels). If has no labels,
set None. Default: None. set None. Default: None.
update (bool, optional): Whether update parameters after loss.backward() computing. update (bool, optional): Whether update parameters after loss.backward() computing.
Set it to False to accumulate gradients. Default: True. Set it to False to accumulate gradients. Default: True.
...@@ -1075,7 +1179,7 @@ class Model(object): ...@@ -1075,7 +1179,7 @@ class Model(object):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
import paddle.nn as nn import paddle.nn as nn
from paddle.static import InputSpec from paddle.static import InputSpec
...@@ -1098,6 +1202,7 @@ class Model(object): ...@@ -1098,6 +1202,7 @@ class Model(object):
loss = model.train_batch([data], [label]) loss = model.train_batch([data], [label])
print(loss) print(loss)
# [array([2.192784], dtype=float32)] # [array([2.192784], dtype=float32)]
""" """
loss = self._adapter.train_batch(inputs, labels, update) loss = self._adapter.train_batch(inputs, labels, update)
if fluid._non_static_mode() and self._input_info is None: if fluid._non_static_mode() and self._input_info is None:
...@@ -1107,15 +1212,16 @@ class Model(object): ...@@ -1107,15 +1212,16 @@ class Model(object):
@no_grad() @no_grad()
def eval_batch(self, inputs, labels=None): def eval_batch(self, inputs, labels=None):
""" """
Run one evaluating step on a batch of data. Run one evaluating step on a batch of data.
Args: Args:
inputs (numpy.ndarray|Tensor|list): Batch of input data. It could inputs (numpy.ndarray|Tensor|list): Batch of input data. It could
be a numpy array or paddle.Tensor, or a list of arrays or be a numpy array or paddle.Tensor, or a list of arrays or
tensors (in case the model has multiple inputs). tensors (in case the model has multiple inputs).
labels (numpy.ndarray|Tensor|list, optional): Batch of labels. It could be labels (numpy.ndarray|Tensor|list, optional): Batch of labels. It could be
a numpy array or paddle.Tensor, or a list of arrays or tensors a numpy array or paddle.Tensor, or a list of arrays or tensors
(in case the model has multiple labels). If has no labels, (in case the model has multiple labels). If has no labels,
set None. Default: None. set None. Default: None.
Returns: Returns:
...@@ -1150,6 +1256,7 @@ class Model(object): ...@@ -1150,6 +1256,7 @@ class Model(object):
loss, acc = model.eval_batch([data], [label]) loss, acc = model.eval_batch([data], [label])
print(loss, acc) print(loss, acc)
# [array([2.8825705], dtype=float32)] [0.0] # [array([2.8825705], dtype=float32)] [0.0]
""" """
loss = self._adapter.eval_batch(inputs, labels) loss = self._adapter.eval_batch(inputs, labels)
if fluid._non_static_mode() and self._input_info is None: if fluid._non_static_mode() and self._input_info is None:
...@@ -1159,11 +1266,12 @@ class Model(object): ...@@ -1159,11 +1266,12 @@ class Model(object):
@no_grad() @no_grad()
def predict_batch(self, inputs): def predict_batch(self, inputs):
""" """
Run one predicting step on a batch of data. Run one predicting step on a batch of data.
Args: Args:
inputs (numpy.ndarray|Tensor|list): Batch of input data. It could inputs (numpy.ndarray|Tensor|list): Batch of input data. It could
be a numpy array or paddle.Tensor, or a list of arrays or be a numpy array or paddle.Tensor, or a list of arrays or
tensors (in case the model has multiple inputs). tensors (in case the model has multiple inputs).
Returns: Returns:
...@@ -1179,7 +1287,7 @@ class Model(object): ...@@ -1179,7 +1287,7 @@ class Model(object):
from paddle.static import InputSpec from paddle.static import InputSpec
device = paddle.set_device('cpu') # or 'gpu' device = paddle.set_device('cpu') # or 'gpu'
input = InputSpec([None, 784], 'float32', 'x') input = InputSpec([None, 784], 'float32', 'x')
label = InputSpec([None, 1], 'int64', 'label') label = InputSpec([None, 1], 'int64', 'label')
...@@ -1197,6 +1305,7 @@ class Model(object): ...@@ -1197,6 +1305,7 @@ class Model(object):
# [array([[0.08189095, 0.16740078, 0.06889386, 0.05085445, 0.10729759, # [array([[0.08189095, 0.16740078, 0.06889386, 0.05085445, 0.10729759,
# 0.02217775, 0.14518553, 0.1591538 , 0.01808308, 0.17906217]], # 0.02217775, 0.14518553, 0.1591538 , 0.01808308, 0.17906217]],
# dtype=float32)] # dtype=float32)]
""" """
loss = self._adapter.predict_batch(inputs) loss = self._adapter.predict_batch(inputs)
if fluid._non_static_mode() and self._input_info is None: if fluid._non_static_mode() and self._input_info is None:
...@@ -1204,12 +1313,13 @@ class Model(object): ...@@ -1204,12 +1313,13 @@ class Model(object):
return loss return loss
def save(self, path, training=True): def save(self, path, training=True):
""" """
This function saves parameters, optimizer information or model and
This function saves parameters, optimizer information or model and
paramters only for inference to path. It depends on the parameter paramters only for inference to path. It depends on the parameter
`training`. `training`.
If `training` is set to True, the parameters saved contain all If `training` is set to True, the parameters saved contain all
the trainable Variable, will save to a file with suffix ".pdparams". the trainable Variable, will save to a file with suffix ".pdparams".
The optimizer information contains all the variable used by optimizer. The optimizer information contains all the variable used by optimizer.
For Adam optimizer, contains beta1, beta2, momentum etc. All the For Adam optimizer, contains beta1, beta2, momentum etc. All the
...@@ -1268,10 +1378,11 @@ class Model(object): ...@@ -1268,10 +1378,11 @@ class Model(object):
T.Normalize([127.5], [127.5]) T.Normalize([127.5], [127.5])
]) ])
data = paddle.vision.datasets.MNIST(mode='train', transform=transform) data = paddle.vision.datasets.MNIST(mode='train', transform=transform)
model.fit(data, epochs=1, batch_size=32, verbose=0) model.fit(data, epochs=1, batch_size=32, verbose=0)
model.save('checkpoint/test') # save for training model.save('checkpoint/test') # save for training
model.save('inference_model', False) # save for inference model.save('inference_model', False) # save for inference
""" """
if ParallelEnv().local_rank == 0: if ParallelEnv().local_rank == 0:
...@@ -1282,6 +1393,7 @@ class Model(object): ...@@ -1282,6 +1393,7 @@ class Model(object):
def load(self, path, skip_mismatch=False, reset_optimizer=False): def load(self, path, skip_mismatch=False, reset_optimizer=False):
""" """
Load from files storing the model states and optimizer states. The file Load from files storing the model states and optimizer states. The file
for optimizer states is not necessary if no need to restore the optimizer. for optimizer states is not necessary if no need to restore the optimizer.
...@@ -1329,6 +1441,7 @@ class Model(object): ...@@ -1329,6 +1441,7 @@ class Model(object):
model.save('checkpoint/test') model.save('checkpoint/test')
model.load('checkpoint/test') model.load('checkpoint/test')
""" """
def _load_state_from_path(path): def _load_state_from_path(path):
...@@ -1341,17 +1454,24 @@ class Model(object): ...@@ -1341,17 +1454,24 @@ class Model(object):
state = param_state.get(key, None) state = param_state.get(key, None)
if state is None: if state is None:
raise ValueError( raise ValueError(
"{} is not found in the providing file.".format(key)) "{} is not found in the providing file.".format(key)
)
if list(state.shape) != list(param.shape): if list(state.shape) != list(param.shape):
raise ValueError( raise ValueError(
"{} receives a shape {}, but the expected shape is {}.". "{} receives a shape {}, but the expected shape is {}.".format(
format(key, list(state.shape), list(param.shape))) key, list(state.shape), list(param.shape)
)
)
return param, state return param, state
def _strip_postfix(path): def _strip_postfix(path):
path, ext = os.path.splitext(path) path, ext = os.path.splitext(path)
assert ext in ['', '.pdparams', '.pdopt', '.pdmodel'], \ assert ext in [
"Unknown postfix {} from weights".format(ext) '',
'.pdparams',
'.pdopt',
'.pdmodel',
], "Unknown postfix {} from weights".format(ext)
return path return path
path = _strip_postfix(path) path = _strip_postfix(path)
...@@ -1365,15 +1485,17 @@ class Model(object): ...@@ -1365,15 +1485,17 @@ class Model(object):
except ValueError as err: except ValueError as err:
if skip_mismatch: if skip_mismatch:
warnings.warn( warnings.warn(
("Skip loading for {}. ".format(key) + str(err))) ("Skip loading for {}. ".format(key) + str(err))
)
# reset optimizer when mismatch happens # reset optimizer when mismatch happens
reset_optimizer = True reset_optimizer = True
else: else:
raise err raise err
matched_param_state.append(match_res) matched_param_state.append(match_res)
optim_state = None if reset_optimizer else _load_state_from_path( optim_state = (
path + ".pdopt") None if reset_optimizer else _load_state_from_path(path + ".pdopt")
)
# TODO: support save/load scaler state in static graph # TODO: support save/load scaler state in static graph
if _non_static_mode(): if _non_static_mode():
...@@ -1382,13 +1504,15 @@ class Model(object): ...@@ -1382,13 +1504,15 @@ class Model(object):
if os.path.exists(path + '.pdscaler'): if os.path.exists(path + '.pdscaler'):
scaler_state = paddle.load(path + '.pdscaler') scaler_state = paddle.load(path + '.pdscaler')
return self._adapter.load(matched_param_state, optim_state, return self._adapter.load(
scaler_state) matched_param_state, optim_state, scaler_state
)
else: else:
return self._adapter.load(matched_param_state, optim_state) return self._adapter.load(matched_param_state, optim_state)
def parameters(self, *args, **kwargs): def parameters(self, *args, **kwargs):
""" """
Returns a list of parameters of the model. Returns a list of parameters of the model.
Returns: Returns:
...@@ -1398,30 +1522,32 @@ class Model(object): ...@@ -1398,30 +1522,32 @@ class Model(object):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
import paddle.nn as nn import paddle.nn as nn
from paddle.static import InputSpec from paddle.static import InputSpec
input = InputSpec([None, 784], 'float32', 'x') input = InputSpec([None, 784], 'float32', 'x')
model = paddle.Model(nn.Sequential( model = paddle.Model(nn.Sequential(
nn.Linear(784, 200), nn.Linear(784, 200),
nn.Tanh(), nn.Tanh(),
nn.Linear(200, 10)), input) nn.Linear(200, 10)), input)
params = model.parameters() params = model.parameters()
""" """
return self._adapter.parameters() return self._adapter.parameters()
def _prepare_amp(self, amp_configs): def _prepare_amp(self, amp_configs):
def _check_pure_fp16_configs(): def _check_pure_fp16_configs():
# pure float16 training has some restricts now # pure float16 training has some restricts now
if self._adapter._amp_level == "O2" and self._optimizer._grad_clip: if self._adapter._amp_level == "O2" and self._optimizer._grad_clip:
# clip by value is not supported # clip by value is not supported
assert isinstance(self._optimizer._grad_clip, (paddle.nn.ClipGradByGlobalNorm, paddle.nn.ClipGradByNorm)), \ assert isinstance(
"Only GradientClipByNorm and GradientClipByGlobalNorm are supported in amp training with level=O2 currently." self._optimizer._grad_clip,
(paddle.nn.ClipGradByGlobalNorm, paddle.nn.ClipGradByNorm),
), "Only GradientClipByNorm and GradientClipByGlobalNorm are supported in amp training with level=O2 currently."
self._adapter._amp_custom_lists = {} self._adapter._amp_custom_lists = {}
self._adapter._amp_configs = {} self._adapter._amp_configs = {}
...@@ -1433,7 +1559,8 @@ class Model(object): ...@@ -1433,7 +1559,8 @@ class Model(object):
elif isinstance(amp_configs, str): elif isinstance(amp_configs, str):
if amp_configs not in ('O0', 'O1', 'O2'): if amp_configs not in ('O0', 'O1', 'O2'):
raise ValueError( raise ValueError(
"The level of amp_configs should be 'O0', 'O1' or 'O2'.") "The level of amp_configs should be 'O0', 'O1' or 'O2'."
)
self._adapter._amp_level = amp_configs self._adapter._amp_level = amp_configs
_check_pure_fp16_configs() _check_pure_fp16_configs()
return return
...@@ -1442,7 +1569,8 @@ class Model(object): ...@@ -1442,7 +1569,8 @@ class Model(object):
self._adapter._amp_level = 'O1' self._adapter._amp_level = 'O1'
elif amp_configs['level'] not in ('O0', 'O1', 'O2'): elif amp_configs['level'] not in ('O0', 'O1', 'O2'):
raise ValueError( raise ValueError(
"amp_configs['level'] should be 'O0', 'O1' or 'O2'.") "amp_configs['level'] should be 'O0', 'O1' or 'O2'."
)
else: else:
self._adapter._amp_level = amp_configs['level'] self._adapter._amp_level = amp_configs['level']
amp_config_key_set = set(amp_configs.keys()) - {'level'} amp_config_key_set = set(amp_configs.keys()) - {'level'}
...@@ -1459,12 +1587,14 @@ class Model(object): ...@@ -1459,12 +1587,14 @@ class Model(object):
# construct amp_custom_lists # construct amp_custom_lists
if self._adapter._amp_level != 'O0' and amp_config_key_set: if self._adapter._amp_level != 'O0' and amp_config_key_set:
for param_name in [ for param_name in [
'custom_white_list', 'custom_black_list', 'custom_white_list',
'custom_black_varnames' 'custom_black_list',
'custom_black_varnames',
]: ]:
if param_name in amp_config_key_set: if param_name in amp_config_key_set:
self._adapter._amp_custom_lists[param_name] = amp_configs[ self._adapter._amp_custom_lists[param_name] = amp_configs[
param_name] param_name
]
amp_config_key_set -= {param_name} amp_config_key_set -= {param_name}
def _check_amp_configs(amp_config_key_set): def _check_amp_configs(amp_config_key_set):
...@@ -1479,13 +1609,16 @@ class Model(object): ...@@ -1479,13 +1609,16 @@ class Model(object):
} }
if amp_config_key_set - accepted_param_set: if amp_config_key_set - accepted_param_set:
raise ValueError( raise ValueError(
"Except for 'level', the keys of 'amp_configs' must be accepted by mixed precision APIs, but {} could not be recognized." "Except for 'level', the keys of 'amp_configs' must be accepted by mixed precision APIs, but {} could not be recognized.".format(
.format(tuple(amp_config_key_set - accepted_param_set))) tuple(amp_config_key_set - accepted_param_set)
)
)
if 'use_fp16_guard' in amp_config_key_set: if 'use_fp16_guard' in amp_config_key_set:
if _non_static_mode(): if _non_static_mode():
raise ValueError( raise ValueError(
"'use_fp16_guard' is supported in static mode only.") "'use_fp16_guard' is supported in static mode only."
)
self._adapter._use_fp16_guard = amp_configs['use_fp16_guard'] self._adapter._use_fp16_guard = amp_configs['use_fp16_guard']
amp_config_key_set.remove('use_fp16_guard') amp_config_key_set.remove('use_fp16_guard')
...@@ -1495,12 +1628,11 @@ class Model(object): ...@@ -1495,12 +1628,11 @@ class Model(object):
for key in amp_configs_set: for key in amp_configs_set:
self._adapter._amp_configs[key] = amp_configs[key] self._adapter._amp_configs[key] = amp_configs[key]
def prepare(self, def prepare(
optimizer=None, self, optimizer=None, loss=None, metrics=None, amp_configs=None
loss=None, ):
metrics=None,
amp_configs=None):
""" """
Configures the model before runing. Configures the model before runing.
Args: Args:
...@@ -1532,6 +1664,7 @@ class Model(object): ...@@ -1532,6 +1664,7 @@ class Model(object):
Returns: Returns:
None None
""" """
self._place = _get_device() self._place = _get_device()
if isinstance(self._place, fluid.CUDAPlace): if isinstance(self._place, fluid.CUDAPlace):
...@@ -1539,15 +1672,17 @@ class Model(object): ...@@ -1539,15 +1672,17 @@ class Model(object):
if ParallelEnv().nranks > 1 and not _parallel_context_initialized: if ParallelEnv().nranks > 1 and not _parallel_context_initialized:
if fluid._non_static_mode(): if fluid._non_static_mode():
main_prog_seed = fluid.default_main_program().random_seed main_prog_seed = fluid.default_main_program().random_seed
startup_prog_seed = fluid.default_startup_program( startup_prog_seed = (
).random_seed fluid.default_startup_program().random_seed
)
fluid.disable_dygraph() fluid.disable_dygraph()
paddle.disable_static(self._place) paddle.disable_static(self._place)
# enable_dygraph would create and switch to a new program, # enable_dygraph would create and switch to a new program,
# thus also copy seed to the new program # thus also copy seed to the new program
fluid.default_main_program().random_seed = main_prog_seed fluid.default_main_program().random_seed = main_prog_seed
fluid.default_startup_program( fluid.default_startup_program().random_seed = (
).random_seed = startup_prog_seed startup_prog_seed
)
else: else:
prepare_distributed_context(self._place) prepare_distributed_context(self._place)
_parallel_context_initialized = True _parallel_context_initialized = True
...@@ -1562,43 +1697,46 @@ class Model(object): ...@@ -1562,43 +1697,46 @@ class Model(object):
metrics = metrics or [] metrics = metrics or []
for metric in to_list(metrics): for metric in to_list(metrics):
assert isinstance(metric, Metric), \ assert isinstance(
"{} is not sub class of Metric".format( metric, Metric
metric.__class__.__name__) ), "{} is not sub class of Metric".format(metric.__class__.__name__)
self._metrics = to_list(metrics) self._metrics = to_list(metrics)
self._prepare_amp(amp_configs) self._prepare_amp(amp_configs)
self._adapter.prepare() self._adapter.prepare()
def fit(self, def fit(
train_data=None, self,
eval_data=None, train_data=None,
batch_size=1, eval_data=None,
epochs=1, batch_size=1,
eval_freq=1, epochs=1,
log_freq=10, eval_freq=1,
save_dir=None, log_freq=10,
save_freq=1, save_dir=None,
verbose=2, save_freq=1,
drop_last=False, verbose=2,
shuffle=True, drop_last=False,
num_workers=0, shuffle=True,
callbacks=None, num_workers=0,
accumulate_grad_batches=1, callbacks=None,
num_iters=None): accumulate_grad_batches=1,
num_iters=None,
):
""" """
Trains the model for a fixed number of epochs. If `eval_data` is set, Trains the model for a fixed number of epochs. If `eval_data` is set,
evaluation will be done at the end of each epoch. evaluation will be done at the end of each epoch.
Args: Args:
train_data (Dataset|DataLoader, optional): An iterable data loader is used for train_data (Dataset|DataLoader, optional): An iterable data loader is used for
train. An instance of paddle paddle.io.Dataset or train. An instance of paddle paddle.io.Dataset or
paddle.io.Dataloader is recomended. Default: None. paddle.io.Dataloader is recomended. Default: None.
eval_data (Dataset|DataLoader, optional): An iterable data loader is used for eval_data (Dataset|DataLoader, optional): An iterable data loader is used for
evaluation at the end of epoch. If None, will not do evaluation. evaluation at the end of epoch. If None, will not do evaluation.
An instance of paddle.io.Dataset or paddle.io.Dataloader An instance of paddle.io.Dataset or paddle.io.Dataloader
is recomended. Default: None. is recomended. Default: None.
batch_size (int, optional): The batch size of train_data and eval_data. When batch_size (int, optional): The batch size of train_data and eval_data. When
train_data and eval_data are both the instance of Dataloader, this train_data and eval_data are both the instance of Dataloader, this
parameter will be ignored. Default: 1. parameter will be ignored. Default: 1.
epochs (int, optional): The number of epochs to train the model. Default: 1. epochs (int, optional): The number of epochs to train the model. Default: 1.
...@@ -1626,7 +1764,7 @@ class Model(object): ...@@ -1626,7 +1764,7 @@ class Model(object):
callbacks (Callback|None, optional): A list of `Callback` instances to apply callbacks (Callback|None, optional): A list of `Callback` instances to apply
during training. If None, :ref:`api_paddle_callbacks_ProgBarLogger` and during training. If None, :ref:`api_paddle_callbacks_ProgBarLogger` and
:ref:`api_paddle_callbacks_ModelCheckpoint` are automatically inserted. Default: None. :ref:`api_paddle_callbacks_ModelCheckpoint` are automatically inserted. Default: None.
accumulate_grad_batches (int, optional): The number of batches to accumulate gradident accumulate_grad_batches (int, optional): The number of batches to accumulate gradident
during training process before optimizer updates. It can mimic large batch during training process before optimizer updates. It can mimic large batch
size. Default: 1. size. Default: 1.
num_iters (int|None, optional): The number of iterations to evaluate the model. num_iters (int|None, optional): The number of iterations to evaluate the model.
...@@ -1641,7 +1779,7 @@ class Model(object): ...@@ -1641,7 +1779,7 @@ class Model(object):
How to make a batch is done internally. How to make a batch is done internally.
.. code-block:: python .. code-block:: python
:name: code-example1 :name: code-example3
import paddle import paddle
import paddle.vision.transforms as T import paddle.vision.transforms as T
...@@ -1681,7 +1819,7 @@ class Model(object): ...@@ -1681,7 +1819,7 @@ class Model(object):
DataLoader. DataLoader.
.. code-block:: python .. code-block:: python
:name: code-example2 :name: code-example4
import paddle import paddle
import paddle.vision.transforms as T import paddle.vision.transforms as T
...@@ -1691,7 +1829,7 @@ class Model(object): ...@@ -1691,7 +1829,7 @@ class Model(object):
dynamic = True dynamic = True
if not dynamic: if not dynamic:
paddle.enable_static() paddle.enable_static()
transform = T.Compose([ transform = T.Compose([
T.Transpose(), T.Transpose(),
T.Normalize([127.5], [127.5]) T.Normalize([127.5], [127.5])
...@@ -1718,31 +1856,38 @@ class Model(object): ...@@ -1718,31 +1856,38 @@ class Model(object):
val_loader, val_loader,
epochs=2, epochs=2,
save_dir='mnist_checkpoint') save_dir='mnist_checkpoint')
""" """
assert train_data is not None, \ assert train_data is not None, "train_data must be given!"
"train_data must be given!"
if isinstance(train_data, Dataset): if isinstance(train_data, Dataset):
train_sampler = DistributedBatchSampler(train_data, train_sampler = DistributedBatchSampler(
batch_size=batch_size, train_data,
shuffle=shuffle, batch_size=batch_size,
drop_last=drop_last) shuffle=shuffle,
train_loader = DataLoader(train_data, drop_last=drop_last,
batch_sampler=train_sampler, )
places=self._place, train_loader = DataLoader(
num_workers=num_workers, train_data,
return_list=True) batch_sampler=train_sampler,
places=self._place,
num_workers=num_workers,
return_list=True,
)
else: else:
train_loader = train_data train_loader = train_data
if eval_data is not None and isinstance(eval_data, Dataset): if eval_data is not None and isinstance(eval_data, Dataset):
eval_sampler = DistributedBatchSampler(eval_data, eval_sampler = DistributedBatchSampler(
batch_size=batch_size) eval_data, batch_size=batch_size
eval_loader = DataLoader(eval_data, )
batch_sampler=eval_sampler, eval_loader = DataLoader(
places=self._place, eval_data,
num_workers=num_workers, batch_sampler=eval_sampler,
return_list=True) places=self._place,
num_workers=num_workers,
return_list=True,
)
elif eval_data is not None: elif eval_data is not None:
eval_loader = eval_data eval_loader = eval_data
else: else:
...@@ -1755,8 +1900,11 @@ class Model(object): ...@@ -1755,8 +1900,11 @@ class Model(object):
steps = self._len_data_loader(train_loader) steps = self._len_data_loader(train_loader)
self.num_iters = num_iters self.num_iters = num_iters
if num_iters is not None and isinstance(num_iters, int) and isinstance( if (
steps, int): num_iters is not None
and isinstance(num_iters, int)
and isinstance(steps, int)
):
assert num_iters > 0, "num_iters must be greater than 0!" assert num_iters > 0, "num_iters must be greater than 0!"
epochs = (num_iters // steps) + 1 epochs = (num_iters // steps) + 1
steps = min(num_iters, steps) steps = min(num_iters, steps)
...@@ -1784,10 +1932,10 @@ class Model(object): ...@@ -1784,10 +1932,10 @@ class Model(object):
if do_eval and epoch % eval_freq == 0: if do_eval and epoch % eval_freq == 0:
eval_steps = self._len_data_loader(eval_loader) eval_steps = self._len_data_loader(eval_loader)
cbks.on_begin('eval', { cbks.on_begin(
'steps': eval_steps, 'eval',
'metrics': self._metrics_name() {'steps': eval_steps, 'metrics': self._metrics_name()},
}) )
eval_logs = self._run_one_epoch(eval_loader, cbks, 'eval') eval_logs = self._run_one_epoch(eval_loader, cbks, 'eval')
...@@ -1798,20 +1946,22 @@ class Model(object): ...@@ -1798,20 +1946,22 @@ class Model(object):
cbks.on_end('train', logs) cbks.on_end('train', logs)
self._test_dataloader = None self._test_dataloader = None
def evaluate(self, def evaluate(
eval_data, self,
batch_size=1, eval_data,
log_freq=10, batch_size=1,
verbose=2, log_freq=10,
num_workers=0, verbose=2,
callbacks=None, num_workers=0,
num_iters=None): callbacks=None,
num_iters=None,
):
""" """
Evaluate the loss and metrics of the model on input dataset. Evaluate the loss and metrics of the model on input dataset.
Args: Args:
eval_data (Dataset|DataLoader): An iterable data loader is used for eval_data (Dataset|DataLoader): An iterable data loader is used for
evaluation. An instance of paddle.io.Dataset or evaluation. An instance of paddle.io.Dataset or
paddle.io.Dataloader is recomended. paddle.io.Dataloader is recomended.
batch_size (int, optional): The batch size of train_data and eval_data. batch_size (int, optional): The batch size of train_data and eval_data.
When eval_data is the instance of Dataloader, this argument will be When eval_data is the instance of Dataloader, this argument will be
...@@ -1859,13 +2009,16 @@ class Model(object): ...@@ -1859,13 +2009,16 @@ class Model(object):
""" """
if eval_data is not None and isinstance(eval_data, Dataset): if eval_data is not None and isinstance(eval_data, Dataset):
eval_sampler = DistributedBatchSampler(eval_data, eval_sampler = DistributedBatchSampler(
batch_size=batch_size) eval_data, batch_size=batch_size
eval_loader = DataLoader(eval_data, )
batch_sampler=eval_sampler, eval_loader = DataLoader(
places=self._place, eval_data,
num_workers=num_workers, batch_sampler=eval_sampler,
return_list=True) places=self._place,
num_workers=num_workers,
return_list=True,
)
else: else:
eval_loader = eval_data eval_loader = eval_data
...@@ -1881,15 +2034,17 @@ class Model(object): ...@@ -1881,15 +2034,17 @@ class Model(object):
eval_steps = self._len_data_loader(eval_loader) eval_steps = self._len_data_loader(eval_loader)
self.num_iters = num_iters self.num_iters = num_iters
if num_iters is not None and isinstance(num_iters, int) and isinstance( if (
eval_steps, int): num_iters is not None
and isinstance(num_iters, int)
and isinstance(eval_steps, int)
):
assert num_iters > 0, "num_iters must be greater than 0!" assert num_iters > 0, "num_iters must be greater than 0!"
eval_steps = min(num_iters, eval_steps) eval_steps = min(num_iters, eval_steps)
self.num_iters = eval_steps self.num_iters = eval_steps
cbks.on_begin('eval', { cbks.on_begin(
'steps': eval_steps, 'eval', {'steps': eval_steps, 'metrics': self._metrics_name()}
'metrics': self._metrics_name() )
})
logs = self._run_one_epoch(eval_loader, cbks, 'eval') logs = self._run_one_epoch(eval_loader, cbks, 'eval')
...@@ -1903,13 +2058,15 @@ class Model(object): ...@@ -1903,13 +2058,15 @@ class Model(object):
return eval_result return eval_result
def predict(self, def predict(
test_data, self,
batch_size=1, test_data,
num_workers=0, batch_size=1,
stack_outputs=False, num_workers=0,
verbose=1, stack_outputs=False,
callbacks=None): verbose=1,
callbacks=None,
):
""" """
Compute the output predictions on testing data. Compute the output predictions on testing data.
...@@ -1919,7 +2076,7 @@ class Model(object): ...@@ -1919,7 +2076,7 @@ class Model(object):
is recomended. is recomended.
batch_size (int, optional): The batch size of test_data. When test_data is the batch_size (int, optional): The batch size of test_data. When test_data is the
instance of Dataloader, this argument will be ignored. Default: 1. instance of Dataloader, this argument will be ignored. Default: 1.
num_workers (int, optional): The number of subprocess to load data, 0 for no subprocess num_workers (int, optional): The number of subprocess to load data, 0 for no subprocess
used and loading data in main process. When test_data is the instance of Dataloader, used and loading data in main process. When test_data is the instance of Dataloader,
this argument will be ignored. Default: 0. this argument will be ignored. Default: 0.
stack_outputs (bool, optional): Whether stack output field like a batch, as for an output stack_outputs (bool, optional): Whether stack output field like a batch, as for an output
...@@ -1980,13 +2137,16 @@ class Model(object): ...@@ -1980,13 +2137,16 @@ class Model(object):
""" """
if test_data is not None and isinstance(test_data, Dataset): if test_data is not None and isinstance(test_data, Dataset):
test_sampler = DistributedBatchSampler(test_data, test_sampler = DistributedBatchSampler(
batch_size=batch_size) test_data, batch_size=batch_size
test_loader = DataLoader(test_data, )
batch_sampler=test_sampler, test_loader = DataLoader(
places=self._place, test_data,
num_workers=num_workers, batch_sampler=test_sampler,
return_list=True) places=self._place,
num_workers=num_workers,
return_list=True,
)
else: else:
test_loader = test_data test_loader = test_data
...@@ -2036,7 +2196,8 @@ class Model(object): ...@@ -2036,7 +2196,8 @@ class Model(object):
if self._is_shape_inferred: if self._is_shape_inferred:
warnings.warn( warnings.warn(
"'inputs' was not specified when Model initialization, so the input shape to be saved will be the shape derived from the user's actual inputs. The input shape to be saved is %s. For saving correct input shapes, please provide 'inputs' for Model initialization." "'inputs' was not specified when Model initialization, so the input shape to be saved will be the shape derived from the user's actual inputs. The input shape to be saved is %s. For saving correct input shapes, please provide 'inputs' for Model initialization."
% self._input_info[0]) % self._input_info[0]
)
paddle.jit.save(layer, path, input_spec=self._inputs) paddle.jit.save(layer, path, input_spec=self._inputs)
...@@ -2047,7 +2208,8 @@ class Model(object): ...@@ -2047,7 +2208,8 @@ class Model(object):
raise ValueError( raise ValueError(
"The input path MUST be format of dirname/file_prefix " "The input path MUST be format of dirname/file_prefix "
"[dirname\\file_prefix in Windows system], but received " "[dirname\\file_prefix in Windows system], but received "
"file_prefix is empty string.") "file_prefix is empty string."
)
dirname = os.path.dirname(path) dirname = os.path.dirname(path)
if dirname and not os.path.exists(dirname): if dirname and not os.path.exists(dirname):
...@@ -2058,21 +2220,24 @@ class Model(object): ...@@ -2058,21 +2220,24 @@ class Model(object):
params_filename = file_prefix + INFER_PARAMS_SUFFIX params_filename = file_prefix + INFER_PARAMS_SUFFIX
prog = self._adapter._progs.get('test', None) prog = self._adapter._progs.get('test', None)
assert prog, \ assert (
"Model is not ready, please call `model.prepare()` first" prog
), "Model is not ready, please call `model.prepare()` first"
infer_prog = prog.clone(for_test=True) infer_prog = prog.clone(for_test=True)
input_names = [v.name for v in self._adapter._input_vars['test']] input_names = [v.name for v in self._adapter._input_vars['test']]
endpoints = self._adapter._endpoints['test']['output'] endpoints = self._adapter._endpoints['test']['output']
fluid.io.save_inference_model(model_path, fluid.io.save_inference_model(
input_names, model_path,
endpoints, input_names,
self._adapter._executor, endpoints,
main_program=infer_prog, self._adapter._executor,
model_filename=model_filename, main_program=infer_prog,
params_filename=params_filename) model_filename=model_filename,
params_filename=params_filename,
)
def _run_one_epoch( def _run_one_epoch(
self, self,
...@@ -2098,16 +2263,21 @@ class Model(object): ...@@ -2098,16 +2263,21 @@ class Model(object):
# LoDTensor.shape is callable, where LoDTensor comes from # LoDTensor.shape is callable, where LoDTensor comes from
# DataLoader in static graph # DataLoader in static graph
batch_size = data[0].shape()[0] if callable( batch_size = (
data[0].shape) else data[0].shape[0] data[0].shape()[0]
if callable(data[0].shape)
else data[0].shape[0]
)
callbacks.on_batch_begin(mode, step, logs) callbacks.on_batch_begin(mode, step, logs)
if mode != 'predict': if mode != 'predict':
_inputs = [data[:len(self._inputs)], data[len(self._inputs):]] _inputs = [data[: len(self._inputs)], data[len(self._inputs) :]]
if mode == 'train': if mode == 'train':
_inputs.append((step + 1) % self._accumulate == 0 _inputs.append(
or step + 1 == len(data_loader)) (step + 1) % self._accumulate == 0
or step + 1 == len(data_loader)
)
outs = getattr(self, mode + '_batch')(*_inputs) outs = getattr(self, mode + '_batch')(*_inputs)
...@@ -2128,15 +2298,17 @@ class Model(object): ...@@ -2128,15 +2298,17 @@ class Model(object):
logs[k] = v logs[k] = v
else: else:
if self._inputs is not None: if self._inputs is not None:
outs = self.predict_batch(data[:len(self._inputs)]) outs = self.predict_batch(data[: len(self._inputs)])
else: else:
outs = self.predict_batch(data) outs = self.predict_batch(data)
outputs.append(outs) outputs.append(outs)
logs['step'] = step logs['step'] = step
if mode == 'train' or self._adapter._merge_count.get( if (
mode + '_batch', 0) <= 0: mode == 'train'
or self._adapter._merge_count.get(mode + '_batch', 0) <= 0
):
logs['batch_size'] = batch_size * ParallelEnv().nranks logs['batch_size'] = batch_size * ParallelEnv().nranks
else: else:
logs['batch_size'] = self._adapter._merge_count[mode + '_batch'] logs['batch_size'] = self._adapter._merge_count[mode + '_batch']
...@@ -2158,10 +2330,10 @@ class Model(object): ...@@ -2158,10 +2330,10 @@ class Model(object):
"""Prints a string summary of the network. """Prints a string summary of the network.
Args: Args:
input_size (tuple|InputSpec|list[tuple|InputSpec], optional): size of input tensor. input_size (tuple|InputSpec|list[tuple|InputSpec], optional): size of input tensor.
if not set, input_size will get from ``self._inputs`` if network only have if not set, input_size will get from ``self._inputs`` if network only have
one input, input_size can be tuple or InputSpec. if model have multiple one input, input_size can be tuple or InputSpec. if model have multiple
input, input_size must be a list which contain every input's shape. input, input_size must be a list which contain every input's shape.
Default: None. Default: None.
dtype (str, optional): if dtype is None, 'float32' will be used, Default: None. dtype (str, optional): if dtype is None, 'float32' will be used, Default: None.
...@@ -2190,8 +2362,9 @@ class Model(object): ...@@ -2190,8 +2362,9 @@ class Model(object):
# {'total_params': 61610, 'trainable_params': 61610} # {'total_params': 61610, 'trainable_params': 61610}
""" """
assert (input_size is not None or self._inputs assert (
is not None), "'input_size' or 'self._input' must be set" input_size is not None or self._inputs is not None
), "'input_size' or 'self._input' must be set"
if input_size is not None: if input_size is not None:
_input_size = input_size _input_size = input_size
else: else:
...@@ -2208,7 +2381,10 @@ class Model(object): ...@@ -2208,7 +2381,10 @@ class Model(object):
if is_input: if is_input:
arg_names = extract_args(self.network.forward)[1:] arg_names = extract_args(self.network.forward)[1:]
# While Saving inference model in dygraph, and providing inputs only in running. # While Saving inference model in dygraph, and providing inputs only in running.
if shapes is not None and dtypes is not None and fluid._non_static_mode( if (
shapes is not None
and dtypes is not None
and fluid._non_static_mode()
): ):
out_specs = [ out_specs = [
Input(name=n, dtype=dtypes[i], shape=shapes[i]) Input(name=n, dtype=dtypes[i], shape=shapes[i])
...@@ -2221,7 +2397,8 @@ class Model(object): ...@@ -2221,7 +2397,8 @@ class Model(object):
elif isinstance(specs, dict): elif isinstance(specs, dict):
assert is_input is False assert is_input is False
out_specs = [ out_specs = [
specs[n] for n in extract_args(self.network.forward) specs[n]
for n in extract_args(self.network.forward)
if n != 'self' if n != 'self'
] ]
else: else:
...@@ -2232,8 +2409,10 @@ class Model(object): ...@@ -2232,8 +2409,10 @@ class Model(object):
assert isinstance(spec, Input) assert isinstance(spec, Input)
if spec.name is None: if spec.name is None:
raise ValueError( raise ValueError(
"Requires Input[{}].name != None, but receive `None` with {}." "Requires Input[{}].name != None, but receive `None` with {}.".format(
.format(i, spec)) i, spec
)
)
return out_specs return out_specs
...@@ -2258,6 +2437,7 @@ class Model(object): ...@@ -2258,6 +2437,7 @@ class Model(object):
"Update self._inputs according to given inputs." "Update self._inputs according to given inputs."
self._input_info = self._adapter._input_info self._input_info = self._adapter._input_info
if self._input_info is not None and len(self._input_info) == 2: if self._input_info is not None and len(self._input_info) == 2:
self._inputs = self._verify_spec(None, self._input_info[0], self._inputs = self._verify_spec(
self._input_info[1], True) None, self._input_info[0], self._input_info[1], True
)
self._is_shape_inferred = True self._is_shape_inferred = True
...@@ -284,9 +284,11 @@ def fused_bias_dropout_residual_layer_norm( ...@@ -284,9 +284,11 @@ def fused_bias_dropout_residual_layer_norm(
name=None, name=None,
): ):
r""" r"""
The fused_bias_dropout_residual_layer_norm operator. The pseudo code is as follows: The fused_bias_dropout_residual_layer_norm operator. The pseudo code is as follows:
.. code-block:: python .. code-block:: python
y = layer_norm(residual + dropout(bias + x)) y = layer_norm(residual + dropout(bias + x))
Parameters: Parameters:
...@@ -315,10 +317,9 @@ def fused_bias_dropout_residual_layer_norm( ...@@ -315,10 +317,9 @@ def fused_bias_dropout_residual_layer_norm(
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
Tensor: The output Tensor, the data type and shape is same as `x`. Tensor, The output Tensor, the data type and shape is same as `x`.
Examples: Examples:
.. code-block:: python .. code-block:: python
# required: gpu # required: gpu
...@@ -336,6 +337,7 @@ def fused_bias_dropout_residual_layer_norm( ...@@ -336,6 +337,7 @@ def fused_bias_dropout_residual_layer_norm(
x, residual, bias) x, residual, bias)
# [2, 4, 128] # [2, 4, 128]
print(output.shape) print(output.shape)
""" """
seed = None seed = None
if mode not in ('downscale_in_infer', 'upscale_in_train'): if mode not in ('downscale_in_infer', 'upscale_in_train'):
......
...@@ -16,7 +16,10 @@ from paddle.incubate.nn import functional as incubate_f ...@@ -16,7 +16,10 @@ from paddle.incubate.nn import functional as incubate_f
from paddle.nn import Layer from paddle.nn import Layer
from paddle.framework import ParamAttr from paddle.framework import ParamAttr
import paddle import paddle
from paddle.nn.layer.transformer import _convert_attention_mask, _convert_param_attr_to_list from paddle.nn.layer.transformer import (
_convert_attention_mask,
_convert_param_attr_to_list,
)
from paddle.nn.initializer import Constant from paddle.nn.initializer import Constant
from paddle.fluid.dygraph import no_grad from paddle.fluid.dygraph import no_grad
from paddle.fluid.framework import convert_np_dtype_to_dtype_, _non_static_mode from paddle.fluid.framework import convert_np_dtype_to_dtype_, _non_static_mode
...@@ -51,7 +54,8 @@ def _to_dtype(t, dtype): ...@@ -51,7 +54,8 @@ def _to_dtype(t, dtype):
if t.place.is_gpu_place(): if t.place.is_gpu_place():
size_dtype = core.size_of_dtype(dtype) size_dtype = core.size_of_dtype(dtype)
waiting_alloc_memory = ( waiting_alloc_memory = (
(np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2 ((np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2
)
gpu_memory_available = core.gpu_memory_available() gpu_memory_available = core.gpu_memory_available()
if gpu_memory_available < waiting_alloc_memory: if gpu_memory_available < waiting_alloc_memory:
t_used = t._copy_to(paddle.CPUPlace(), False) t_used = t._copy_to(paddle.CPUPlace(), False)
...@@ -106,31 +110,38 @@ class FusedBiasDropoutResidualLayerNorm(Layer): ...@@ -106,31 +110,38 @@ class FusedBiasDropoutResidualLayerNorm(Layer):
output = fused_bias_dropout_residual_ln(x, residual) # [2, 4, 128] output = fused_bias_dropout_residual_ln(x, residual) # [2, 4, 128]
""" """
def __init__(self, def __init__(
embed_dim, self,
dropout_rate=0.5, embed_dim,
weight_attr=None, dropout_rate=0.5,
bias_attr=None, weight_attr=None,
epsilon=1e-5, bias_attr=None,
name=None): epsilon=1e-5,
name=None,
):
super(FusedBiasDropoutResidualLayerNorm, self).__init__() super(FusedBiasDropoutResidualLayerNorm, self).__init__()
assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " assert embed_dim > 0, (
"but recieved {}".format(embed_dim)) "Expected embed_dim to be greater than 0, "
"but recieved {}".format(embed_dim)
)
self._dtype = self._helper.get_default_dtype() self._dtype = self._helper.get_default_dtype()
self._bias_attr = bias_attr self._bias_attr = bias_attr
self._weight_attr = weight_attr self._weight_attr = weight_attr
self.embed_dim = embed_dim self.embed_dim = embed_dim
self.linear_bias = self.create_parameter(shape=[embed_dim], self.linear_bias = self.create_parameter(
attr=self._bias_attr, shape=[embed_dim],
dtype=self._dtype, attr=self._bias_attr,
is_bias=True) dtype=self._dtype,
is_bias=True,
)
self.ln_scale = self.create_parameter( self.ln_scale = self.create_parameter(
attr=self._weight_attr, attr=self._weight_attr,
shape=[embed_dim], shape=[embed_dim],
default_initializer=Constant(value=1.0)) default_initializer=Constant(value=1.0),
self.ln_bias = self.create_parameter(attr=self._bias_attr, )
shape=[embed_dim], self.ln_bias = self.create_parameter(
is_bias=True) attr=self._bias_attr, shape=[embed_dim], is_bias=True
)
self.dropout_rate = dropout_rate self.dropout_rate = dropout_rate
self._epsilon = epsilon self._epsilon = epsilon
...@@ -163,14 +174,20 @@ class FusedBiasDropoutResidualLayerNorm(Layer): ...@@ -163,14 +174,20 @@ class FusedBiasDropoutResidualLayerNorm(Layer):
ln_epsilon=self._epsilon, ln_epsilon=self._epsilon,
training=self.training, training=self.training,
mode='upscale_in_train', mode='upscale_in_train',
name=self.name) name=self.name,
)
return out return out
def extra_repr(self): def extra_repr(self):
name_str = ', name={}'.format(self.name) if self.name else '' name_str = ', name={}'.format(self.name) if self.name else ''
return 'embed_dim={}, seq_len={}, dropout_rate={}, epsilon={}, dtype={}{}'.format( return 'embed_dim={}, seq_len={}, dropout_rate={}, epsilon={}, dtype={}{}'.format(
self.embed_dim, self.seq_len, self.dropout_rate, self._epsilon, self.embed_dim,
self._dtype, name_str) self.seq_len,
self.dropout_rate,
self._epsilon,
self._dtype,
name_str,
)
class FusedMultiHeadAttention(Layer): class FusedMultiHeadAttention(Layer):
...@@ -246,33 +263,40 @@ class FusedMultiHeadAttention(Layer): ...@@ -246,33 +263,40 @@ class FusedMultiHeadAttention(Layer):
output = multi_head_attn(query, None, None, attn_mask=attn_mask) # [2, 4, 128] output = multi_head_attn(query, None, None, attn_mask=attn_mask) # [2, 4, 128]
""" """
def __init__(self, def __init__(
embed_dim, self,
num_heads, embed_dim,
dropout_rate=0.5, num_heads,
attn_dropout_rate=0.5, dropout_rate=0.5,
kdim=None, attn_dropout_rate=0.5,
vdim=None, kdim=None,
normalize_before=False, vdim=None,
need_weights=False, normalize_before=False,
qkv_weight_attr=None, need_weights=False,
qkv_bias_attr=None, qkv_weight_attr=None,
linear_weight_attr=None, qkv_bias_attr=None,
linear_bias_attr=None, linear_weight_attr=None,
pre_ln_scale_attr=None, linear_bias_attr=None,
pre_ln_bias_attr=None, pre_ln_scale_attr=None,
ln_scale_attr=None, pre_ln_bias_attr=None,
ln_bias_attr=None, ln_scale_attr=None,
epsilon=1e-5, ln_bias_attr=None,
nranks=1, epsilon=1e-5,
ring_id=-1, nranks=1,
name=None): ring_id=-1,
name=None,
):
super(FusedMultiHeadAttention, self).__init__() super(FusedMultiHeadAttention, self).__init__()
assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " assert embed_dim > 0, (
"but received {}".format(embed_dim)) "Expected embed_dim to be greater than 0, "
assert num_heads > 0, ("Expected nhead to be greater than 0, " "but received {}".format(embed_dim)
"but received {}".format(num_heads)) )
assert (
num_heads > 0
), "Expected nhead to be greater than 0, " "but received {}".format(
num_heads
)
self.normalize_before = normalize_before self.normalize_before = normalize_before
self._dtype = self._helper.get_default_dtype() self._dtype = self._helper.get_default_dtype()
...@@ -285,7 +309,9 @@ class FusedMultiHeadAttention(Layer): ...@@ -285,7 +309,9 @@ class FusedMultiHeadAttention(Layer):
self.kdim = kdim self.kdim = kdim
self.vdim = vdim self.vdim = vdim
self.need_weights = need_weights self.need_weights = need_weights
assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" assert (
self.head_dim * num_heads == embed_dim
), "embed_dim must be divisible by num_heads"
assert need_weights is False, "Only support need_weight is False now." assert need_weights is False, "Only support need_weight is False now."
# tensor model parallel # tensor model parallel
...@@ -296,21 +322,26 @@ class FusedMultiHeadAttention(Layer): ...@@ -296,21 +322,26 @@ class FusedMultiHeadAttention(Layer):
shape=[3, num_heads, self.head_dim, embed_dim], shape=[3, num_heads, self.head_dim, embed_dim],
attr=qkv_weight_attr, attr=qkv_weight_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False,
)
self.qkv_bias = self.create_parameter( self.qkv_bias = self.create_parameter(
shape=[3, num_heads, self.head_dim], shape=[3, num_heads, self.head_dim],
attr=qkv_bias_attr, attr=qkv_bias_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=True) is_bias=True,
)
self.linear_weight = self.create_parameter( self.linear_weight = self.create_parameter(
shape=[num_heads * self.head_dim, embed_dim], shape=[num_heads * self.head_dim, embed_dim],
attr=linear_weight_attr, attr=linear_weight_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False,
self.linear_bias = self.create_parameter(shape=[embed_dim], )
attr=linear_bias_attr, self.linear_bias = self.create_parameter(
dtype=self._dtype, shape=[embed_dim],
is_bias=True) attr=linear_bias_attr,
dtype=self._dtype,
is_bias=True,
)
# tensor model parallel # tensor model parallel
if nranks > 1: if nranks > 1:
...@@ -325,10 +356,11 @@ class FusedMultiHeadAttention(Layer): ...@@ -325,10 +356,11 @@ class FusedMultiHeadAttention(Layer):
self.pre_ln_scale = self.create_parameter( self.pre_ln_scale = self.create_parameter(
attr=pre_ln_scale_attr, attr=pre_ln_scale_attr,
shape=[embed_dim], shape=[embed_dim],
default_initializer=Constant(value=1.0)) default_initializer=Constant(value=1.0),
self.pre_ln_bias = self.create_parameter(attr=pre_ln_bias_attr, )
shape=[embed_dim], self.pre_ln_bias = self.create_parameter(
is_bias=True) attr=pre_ln_bias_attr, shape=[embed_dim], is_bias=True
)
self.ln_scale = None self.ln_scale = None
self.ln_bias = None self.ln_bias = None
else: else:
...@@ -337,10 +369,11 @@ class FusedMultiHeadAttention(Layer): ...@@ -337,10 +369,11 @@ class FusedMultiHeadAttention(Layer):
self.ln_scale = self.create_parameter( self.ln_scale = self.create_parameter(
attr=ln_scale_attr, attr=ln_scale_attr,
shape=[embed_dim], shape=[embed_dim],
default_initializer=Constant(value=1.0)) default_initializer=Constant(value=1.0),
self.ln_bias = self.create_parameter(attr=ln_bias_attr, )
shape=[embed_dim], self.ln_bias = self.create_parameter(
is_bias=True) attr=ln_bias_attr, shape=[embed_dim], is_bias=True
)
self.dropout_rate = dropout_rate self.dropout_rate = dropout_rate
self.attn_dropout_rate = attn_dropout_rate self.attn_dropout_rate = attn_dropout_rate
...@@ -404,15 +437,25 @@ class FusedMultiHeadAttention(Layer): ...@@ -404,15 +437,25 @@ class FusedMultiHeadAttention(Layer):
ln_epsilon=self._epsilon, ln_epsilon=self._epsilon,
training=self.training, training=self.training,
ring_id=self._ring_id, ring_id=self._ring_id,
name=self.name) name=self.name,
)
return out return out
def extra_repr(self): def extra_repr(self):
name_str = ', name={}'.format(self.name) if self.name else '' name_str = ', name={}'.format(self.name) if self.name else ''
return 'embed_dim={}, num_heads={}, dropout_rate={}, attn_dropout_rate={}, epsilon={}, kdim={}, vdim={}, normalize_before={}, need_weights={}, dtype={}{}'.format( return 'embed_dim={}, num_heads={}, dropout_rate={}, attn_dropout_rate={}, epsilon={}, kdim={}, vdim={}, normalize_before={}, need_weights={}, dtype={}{}'.format(
self.embed_dim, self.num_heads, self.dropout_rate, self.embed_dim,
self.attn_dropout_rate, self._epsilon, self.kdim, self.vdim, self.num_heads,
self.normalize_before, self.need_weights, self._dtype, name_str) self.dropout_rate,
self.attn_dropout_rate,
self._epsilon,
self.kdim,
self.vdim,
self.normalize_before,
self.need_weights,
self._dtype,
name_str,
)
def _amp_decorate(self, dtype): def _amp_decorate(self, dtype):
# tmp fix for amp.decorator(O2) # tmp fix for amp.decorator(O2)
...@@ -495,33 +538,39 @@ class FusedFeedForward(Layer): ...@@ -495,33 +538,39 @@ class FusedFeedForward(Layer):
# (1, 8, 8) # (1, 8, 8)
""" """
def __init__(self, def __init__(
d_model, self,
dim_feedforward, d_model,
dropout_rate=0.1, dim_feedforward,
epsilon=1e-05, dropout_rate=0.1,
activation="relu", epsilon=1e-05,
act_dropout_rate=None, activation="relu",
normalize_before=False, act_dropout_rate=None,
linear1_weight_attr=None, normalize_before=False,
linear1_bias_attr=None, linear1_weight_attr=None,
linear2_weight_attr=None, linear1_bias_attr=None,
linear2_bias_attr=None, linear2_weight_attr=None,
ln1_scale_attr=None, linear2_bias_attr=None,
ln1_bias_attr=None, ln1_scale_attr=None,
ln2_scale_attr=None, ln1_bias_attr=None,
ln2_bias_attr=None, ln2_scale_attr=None,
nranks=1, ln2_bias_attr=None,
ring_id=-1, nranks=1,
name=None): ring_id=-1,
name=None,
):
super(FusedFeedForward, self).__init__() super(FusedFeedForward, self).__init__()
assert d_model > 0, ( assert (
"Expected d_model to be greater than 0, but received {}".format( d_model > 0
d_model)) ), "Expected d_model to be greater than 0, but received {}".format(
assert dim_feedforward > 0, ( d_model
"Expected dim_feedforward to be greater than 0, but received {}". )
format(dim_feedforward)) assert (
dim_feedforward > 0
), "Expected dim_feedforward to be greater than 0, but received {}".format(
dim_feedforward
)
self._dtype = self._helper.get_default_dtype() self._dtype = self._helper.get_default_dtype()
self._d_model = d_model self._d_model = d_model
...@@ -530,7 +579,9 @@ class FusedFeedForward(Layer): ...@@ -530,7 +579,9 @@ class FusedFeedForward(Layer):
dim_feedforward = dim_feedforward // nranks dim_feedforward = dim_feedforward // nranks
self._dim_feedforward = dim_feedforward self._dim_feedforward = dim_feedforward
self._dropout_rate = dropout_rate self._dropout_rate = dropout_rate
self._act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate self._act_dropout_rate = (
dropout_rate if act_dropout_rate is None else act_dropout_rate
)
self._act_method = activation self._act_method = activation
self._normalize_before = normalize_before self._normalize_before = normalize_before
self._epsilon = epsilon self._epsilon = epsilon
...@@ -540,22 +591,28 @@ class FusedFeedForward(Layer): ...@@ -540,22 +591,28 @@ class FusedFeedForward(Layer):
shape=[d_model, dim_feedforward], shape=[d_model, dim_feedforward],
attr=linear1_weight_attr, attr=linear1_weight_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False,
self._linear1_bias = self.create_parameter(shape=[dim_feedforward], )
attr=linear1_bias_attr, self._linear1_bias = self.create_parameter(
dtype=self._dtype, shape=[dim_feedforward],
is_bias=True) attr=linear1_bias_attr,
dtype=self._dtype,
is_bias=True,
)
self._linear2_weight = self.create_parameter( self._linear2_weight = self.create_parameter(
shape=[dim_feedforward, d_model], shape=[dim_feedforward, d_model],
attr=linear2_weight_attr, attr=linear2_weight_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False,
)
self._linear2_bias = self.create_parameter(shape=[d_model], self._linear2_bias = self.create_parameter(
attr=linear2_bias_attr, shape=[d_model],
dtype=self._dtype, attr=linear2_bias_attr,
is_bias=True) dtype=self._dtype,
is_bias=True,
)
if nranks > 1: if nranks > 1:
assert ring_id != -1 assert ring_id != -1
...@@ -569,10 +626,11 @@ class FusedFeedForward(Layer): ...@@ -569,10 +626,11 @@ class FusedFeedForward(Layer):
shape=[d_model], shape=[d_model],
attr=ln1_scale_attr, attr=ln1_scale_attr,
is_bias=False, is_bias=False,
default_initializer=Constant(1.0)) default_initializer=Constant(1.0),
self._ln1_bias = self.create_parameter(shape=[d_model], )
attr=ln1_bias_attr, self._ln1_bias = self.create_parameter(
is_bias=True) shape=[d_model], attr=ln1_bias_attr, is_bias=True
)
self._ln2_scale = None self._ln2_scale = None
self._ln2_bias = None self._ln2_bias = None
else: else:
...@@ -582,10 +640,11 @@ class FusedFeedForward(Layer): ...@@ -582,10 +640,11 @@ class FusedFeedForward(Layer):
shape=[d_model], shape=[d_model],
attr=ln2_scale_attr, attr=ln2_scale_attr,
is_bias=False, is_bias=False,
default_initializer=Constant(1.0)) default_initializer=Constant(1.0),
self._ln2_bias = self.create_parameter(shape=[d_model], )
attr=ln2_bias_attr, self._ln2_bias = self.create_parameter(
is_bias=True) shape=[d_model], attr=ln2_bias_attr, is_bias=True
)
self.name = name self.name = name
...@@ -608,15 +667,23 @@ class FusedFeedForward(Layer): ...@@ -608,15 +667,23 @@ class FusedFeedForward(Layer):
pre_layer_norm=self._normalize_before, pre_layer_norm=self._normalize_before,
training=self.training, training=self.training,
ring_id=self._ring_id, ring_id=self._ring_id,
name=self.name) name=self.name,
)
return out return out
def extra_repr(self): def extra_repr(self):
name_str = ', name={}'.format(self.name) if self.name else '' name_str = ', name={}'.format(self.name) if self.name else ''
return 'd_model={}, dim_feedforward={}, dropout_rate={}, epsilon={}, activation={}, act_dropout_rate={}, normalize_before={}, dtype={}{}'.format( return 'd_model={}, dim_feedforward={}, dropout_rate={}, epsilon={}, activation={}, act_dropout_rate={}, normalize_before={}, dtype={}{}'.format(
self._d_model, self._dim_feedforward, self._dropout_rate, self._d_model,
self._epsilon, self._act_method, self._act_dropout_rate, self._dim_feedforward,
self._normalize_before, self._dtype, name_str) self._dropout_rate,
self._epsilon,
self._act_method,
self._act_dropout_rate,
self._normalize_before,
self._dtype,
name_str,
)
def _amp_decorate(self, dtype): def _amp_decorate(self, dtype):
# tmp fix for amp.decorator(O2) # tmp fix for amp.decorator(O2)
...@@ -640,6 +707,7 @@ class FusedFeedForward(Layer): ...@@ -640,6 +707,7 @@ class FusedFeedForward(Layer):
class FusedTransformerEncoderLayer(Layer): class FusedTransformerEncoderLayer(Layer):
""" """
FusedTransformerEncoderLayer is composed of two sub-layers which are self (multi-head) FusedTransformerEncoderLayer is composed of two sub-layers which are self (multi-head)
attention and feedforward network. Before and after each sub-layer, pre-process attention and feedforward network. Before and after each sub-layer, pre-process
and post-precess would be applied on the input and output accordingly. If and post-precess would be applied on the input and output accordingly. If
...@@ -681,10 +749,9 @@ class FusedTransformerEncoderLayer(Layer): ...@@ -681,10 +749,9 @@ class FusedTransformerEncoderLayer(Layer):
Examples: Examples:
.. code-block:: python .. code-block:: python
# required: gpu # required: gpu
import paddle import paddle
from paddle.incubate.nn import FusedTransformerEncoderLayer from paddle.incubate.nn import FusedTransformerEncoderLayer
...@@ -694,33 +761,47 @@ class FusedTransformerEncoderLayer(Layer): ...@@ -694,33 +761,47 @@ class FusedTransformerEncoderLayer(Layer):
attn_mask = paddle.rand((2, 2, 4, 4)) attn_mask = paddle.rand((2, 2, 4, 4))
encoder_layer = FusedTransformerEncoderLayer(128, 2, 512) encoder_layer = FusedTransformerEncoderLayer(128, 2, 512)
enc_output = encoder_layer(enc_input, attn_mask) # [2, 4, 128] enc_output = encoder_layer(enc_input, attn_mask) # [2, 4, 128]
""" """
def __init__(self, def __init__(
d_model, self,
nhead, d_model,
dim_feedforward, nhead,
dropout_rate=0.1, dim_feedforward,
activation="relu", dropout_rate=0.1,
attn_dropout_rate=None, activation="relu",
act_dropout_rate=None, attn_dropout_rate=None,
normalize_before=False, act_dropout_rate=None,
weight_attr=None, normalize_before=False,
bias_attr=None): weight_attr=None,
bias_attr=None,
):
self._config = locals() self._config = locals()
self._config.pop("self") self._config.pop("self")
self._config.pop("__class__", None) # py3 self._config.pop("__class__", None) # py3
super(FusedTransformerEncoderLayer, self).__init__() super(FusedTransformerEncoderLayer, self).__init__()
assert d_model > 0, ("Expected d_model to be greater than 0, " assert (
"but received {}".format(d_model)) d_model > 0
assert nhead > 0, ("Expected nhead to be greater than 0, " ), "Expected d_model to be greater than 0, " "but received {}".format(
"but received {}".format(nhead)) d_model
)
assert (
nhead > 0
), "Expected nhead to be greater than 0, " "but received {}".format(
nhead
)
assert dim_feedforward > 0, ( assert dim_feedforward > 0, (
"Expected dim_feedforward to be greater than 0, " "Expected dim_feedforward to be greater than 0, "
"but received {}".format(dim_feedforward)) "but received {}".format(dim_feedforward)
attn_dropout_rate = dropout_rate if attn_dropout_rate is None else attn_dropout_rate )
act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate attn_dropout_rate = (
dropout_rate if attn_dropout_rate is None else attn_dropout_rate
)
act_dropout_rate = (
dropout_rate if act_dropout_rate is None else act_dropout_rate
)
self.normalize_before = normalize_before self.normalize_before = normalize_before
weight_attrs = _convert_param_attr_to_list(weight_attr, 2) weight_attrs = _convert_param_attr_to_list(weight_attr, 2)
...@@ -739,22 +820,27 @@ class FusedTransformerEncoderLayer(Layer): ...@@ -739,22 +820,27 @@ class FusedTransformerEncoderLayer(Layer):
pre_ln_scale_attr=weight_attrs[0], pre_ln_scale_attr=weight_attrs[0],
pre_ln_bias_attr=bias_attrs[0], pre_ln_bias_attr=bias_attrs[0],
ln_scale_attr=weight_attrs[0], ln_scale_attr=weight_attrs[0],
ln_bias_attr=bias_attrs[0]) ln_bias_attr=bias_attrs[0],
)
self.ffn = FusedFeedForward(d_model,
dim_feedforward, self.ffn = FusedFeedForward(
dropout_rate=dropout_rate, d_model,
activation=activation, dim_feedforward,
act_dropout_rate=act_dropout_rate, dropout_rate=dropout_rate,
normalize_before=self.normalize_before, activation=activation,
linear1_weight_attr=weight_attrs[1], act_dropout_rate=act_dropout_rate,
linear1_bias_attr=bias_attrs[1], normalize_before=self.normalize_before,
linear2_weight_attr=weight_attrs[1], linear1_weight_attr=weight_attrs[1],
linear2_bias_attr=bias_attrs[1]) linear1_bias_attr=bias_attrs[1],
linear2_weight_attr=weight_attrs[1],
linear2_bias_attr=bias_attrs[1],
)
def forward(self, src, src_mask=None, cache=None): def forward(self, src, src_mask=None, cache=None):
""" """
Applies a Transformer encoder layer on the input. Applies a Transformer encoder layer on the input.
Parameters: Parameters:
src (Tensor): The input of Transformer encoder layer. It is src (Tensor): The input of Transformer encoder layer. It is
a tensor with shape `[batch_size, sequence_length, d_model]`. a tensor with shape `[batch_size, sequence_length, d_model]`.
...@@ -770,25 +856,27 @@ class FusedTransformerEncoderLayer(Layer): ...@@ -770,25 +856,27 @@ class FusedTransformerEncoderLayer(Layer):
`-INF` values and the others have 0 values. It can be None when `-INF` values and the others have 0 values. It can be None when
nothing wanted or needed to be prevented attention to. Default None. nothing wanted or needed to be prevented attention to. Default None.
cache (Tensor, optional): It is an instance of `MultiHeadAttention.Cache`. cache (Tensor, optional): It is an instance of `MultiHeadAttention.Cache`.
See `TransformerEncoderLayer.gen_cache` for more details. It is See :ref:`api_paddle_nn_TransformerEncoderLayer`.gen_cache for more details. It is
only used for inference and should be None for training. Default only used for inference and should be None for training. Default
None. None.
Returns: Returns:
Tensor|tuple: It is a tensor that has the same shape and data type \ Tensor|tuple, It is a tensor that has the same shape and data type \
as `enc_input`, representing the output of Transformer encoder \ as `enc_input`, representing the output of Transformer encoder \
layer. Or a tuple if `cache` is not None, except for encoder \ layer. Or a tuple if `cache` is not None, except for encoder \
layer output, the tuple includes the new cache which is same \ layer output, the tuple includes the new cache which is same \
as input `cache` argument but `incremental_cache` has an \ as input `cache` argument but `incremental_cache` has an \
incremental length. See `MultiHeadAttention.gen_cache` and \ incremental length. See `MultiHeadAttention.gen_cache` and \
`MultiHeadAttention.forward` for more details. `MultiHeadAttention.forward` for more details.
""" """
src_mask = _convert_attention_mask(src_mask, src.dtype) src_mask = _convert_attention_mask(src_mask, src.dtype)
if cache is None: if cache is None:
attn_out = self.fused_attn(src, attn_mask=src_mask) attn_out = self.fused_attn(src, attn_mask=src_mask)
else: else:
attn_out, incremental_cache = self.fused_attn(src, attn_out, incremental_cache = self.fused_attn(
attn_mask=src_mask, src, attn_mask=src_mask, cache=cache
cache=cache) )
ffn_out = self.ffn(attn_out) ffn_out = self.ffn(attn_out)
...@@ -889,21 +977,23 @@ class FusedTransformer(Layer): ...@@ -889,21 +977,23 @@ class FusedTransformer(Layer):
cross_attn_mask) # [2, 6, 128] cross_attn_mask) # [2, 6, 128]
""" """
def __init__(self, def __init__(
d_model=512, self,
nhead=8, d_model=512,
num_encoder_layers=6, nhead=8,
num_decoder_layers=6, num_encoder_layers=6,
dim_feedforward=2048, num_decoder_layers=6,
dropout=0.1, dim_feedforward=2048,
activation="relu", dropout=0.1,
attn_dropout=None, activation="relu",
act_dropout=None, attn_dropout=None,
normalize_before=False, act_dropout=None,
weight_attr=None, normalize_before=False,
bias_attr=None, weight_attr=None,
custom_encoder=None, bias_attr=None,
custom_decoder=None): custom_encoder=None,
custom_decoder=None,
):
super(fusedTransformer, self).__init__() super(fusedTransformer, self).__init__()
raise NotImplementedError() raise NotImplementedError()
...@@ -1071,40 +1161,49 @@ class FusedMultiTransformer(Layer): ...@@ -1071,40 +1161,49 @@ class FusedMultiTransformer(Layer):
enc_output = encoder_layers(enc_input, attn_mask) # [2, 4, 128] enc_output = encoder_layers(enc_input, attn_mask) # [2, 4, 128]
""" """
def __init__(self, def __init__(
embed_dim, self,
num_heads, embed_dim,
dim_feedforward, num_heads,
dropout_rate=0.0, dim_feedforward,
activation="gelu", dropout_rate=0.0,
normalize_before=True, activation="gelu",
ln_scale_attrs=None, normalize_before=True,
ln_bias_attrs=None, ln_scale_attrs=None,
qkv_weight_attrs=None, ln_bias_attrs=None,
qkv_bias_attrs=None, qkv_weight_attrs=None,
linear_weight_attrs=None, qkv_bias_attrs=None,
linear_bias_attrs=None, linear_weight_attrs=None,
ffn_ln_scale_attrs=None, linear_bias_attrs=None,
ffn_ln_bias_attrs=None, ffn_ln_scale_attrs=None,
ffn1_weight_attrs=None, ffn_ln_bias_attrs=None,
ffn1_bias_attrs=None, ffn1_weight_attrs=None,
ffn2_weight_attrs=None, ffn1_bias_attrs=None,
ffn2_bias_attrs=None, ffn2_weight_attrs=None,
epsilon=1e-5, ffn2_bias_attrs=None,
num_layers=-1, epsilon=1e-5,
nranks=1, num_layers=-1,
trans_qkvw=True, nranks=1,
ring_id=-1, trans_qkvw=True,
name=None): ring_id=-1,
name=None,
):
super(FusedMultiTransformer, self).__init__() super(FusedMultiTransformer, self).__init__()
assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " assert embed_dim > 0, (
"but received {}".format(embed_dim)) "Expected embed_dim to be greater than 0, "
assert num_heads > 0, ("Expected nhead to be greater than 0, " "but received {}".format(embed_dim)
"but received {}".format(num_heads)) )
assert dim_feedforward > 0, ( assert (
"Expected dim_feedforward to be greater than 0, but received {}". num_heads > 0
format(dim_feedforward)) ), "Expected nhead to be greater than 0, " "but received {}".format(
num_heads
)
assert (
dim_feedforward > 0
), "Expected dim_feedforward to be greater than 0, but received {}".format(
dim_feedforward
)
self.normalize_before = normalize_before self.normalize_before = normalize_before
self._dtype = self._helper.get_default_dtype() self._dtype = self._helper.get_default_dtype()
...@@ -1115,7 +1214,9 @@ class FusedMultiTransformer(Layer): ...@@ -1115,7 +1214,9 @@ class FusedMultiTransformer(Layer):
self.embed_dim = embed_dim self.embed_dim = embed_dim
self.num_heads = num_heads self.num_heads = num_heads
self.head_dim = embed_dim // num_heads self.head_dim = embed_dim // num_heads
assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" assert (
self.head_dim * num_heads == embed_dim
), "embed_dim must be divisible by num_heads"
# tensor model parallel # tensor model parallel
if nranks > 1: if nranks > 1:
...@@ -1161,57 +1262,71 @@ class FusedMultiTransformer(Layer): ...@@ -1161,57 +1262,71 @@ class FusedMultiTransformer(Layer):
ln_scale = self.create_parameter( ln_scale = self.create_parameter(
attr=ln_scale_attr, attr=ln_scale_attr,
shape=[embed_dim], shape=[embed_dim],
default_initializer=Constant(value=1.0)) default_initializer=Constant(value=1.0),
ln_bias = self.create_parameter(attr=ln_bias_attr, )
shape=[embed_dim], ln_bias = self.create_parameter(
is_bias=True) attr=ln_bias_attr, shape=[embed_dim], is_bias=True
)
qkv_weight = self.create_parameter( qkv_weight = self.create_parameter(
shape=[3, num_heads, self.head_dim, embed_dim] shape=[3, num_heads, self.head_dim, embed_dim]
if trans_qkvw else [embed_dim, 3, num_heads, self.head_dim], if trans_qkvw
else [embed_dim, 3, num_heads, self.head_dim],
attr=qkv_weight_attr, attr=qkv_weight_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False,
)
qkv_bias = self.create_parameter( qkv_bias = self.create_parameter(
shape=[3, num_heads, self.head_dim], shape=[3, num_heads, self.head_dim],
attr=qkv_bias_attr, attr=qkv_bias_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=True) is_bias=True,
)
linear_weight = self.create_parameter( linear_weight = self.create_parameter(
shape=[num_heads * self.head_dim, embed_dim], shape=[num_heads * self.head_dim, embed_dim],
attr=linear_weight_attr, attr=linear_weight_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False,
linear_bias = self.create_parameter(shape=[embed_dim], )
attr=linear_bias_attr, linear_bias = self.create_parameter(
dtype=self._dtype, shape=[embed_dim],
is_bias=True) attr=linear_bias_attr,
dtype=self._dtype,
is_bias=True,
)
ffn_ln_scale = self.create_parameter( ffn_ln_scale = self.create_parameter(
shape=[embed_dim], shape=[embed_dim],
attr=ffn_ln_scale_attr, attr=ffn_ln_scale_attr,
is_bias=False, is_bias=False,
default_initializer=Constant(1.0)) default_initializer=Constant(1.0),
ffn_ln_bias = self.create_parameter(shape=[embed_dim], )
attr=ffn_ln_bias_attr, ffn_ln_bias = self.create_parameter(
is_bias=True) shape=[embed_dim], attr=ffn_ln_bias_attr, is_bias=True
)
ffn1_weight = self.create_parameter( ffn1_weight = self.create_parameter(
shape=[embed_dim, dim_feedforward], shape=[embed_dim, dim_feedforward],
attr=ffn1_weight_attr, attr=ffn1_weight_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False,
ffn1_bias = self.create_parameter(shape=[dim_feedforward], )
attr=ffn1_bias_attr, ffn1_bias = self.create_parameter(
dtype=self._dtype, shape=[dim_feedforward],
is_bias=True) attr=ffn1_bias_attr,
dtype=self._dtype,
is_bias=True,
)
ffn2_weight = self.create_parameter( ffn2_weight = self.create_parameter(
shape=[dim_feedforward, embed_dim], shape=[dim_feedforward, embed_dim],
attr=ffn2_weight_attr, attr=ffn2_weight_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False,
ffn2_bias = self.create_parameter(shape=[embed_dim], )
attr=ffn2_bias_attr, ffn2_bias = self.create_parameter(
dtype=self._dtype, shape=[embed_dim],
is_bias=True) attr=ffn2_bias_attr,
dtype=self._dtype,
is_bias=True,
)
# tensor model parallel # tensor model parallel
if nranks > 1: if nranks > 1:
...@@ -1300,5 +1415,6 @@ class FusedMultiTransformer(Layer): ...@@ -1300,5 +1415,6 @@ class FusedMultiTransformer(Layer):
mode='upscale_in_train', mode='upscale_in_train',
trans_qkvw=self._trans_qkvw, trans_qkvw=self._trans_qkvw,
ring_id=self._ring_id, ring_id=self._ring_id,
name=self.name) name=self.name,
)
return out return out
...@@ -20,104 +20,134 @@ from paddle.fluid import core ...@@ -20,104 +20,134 @@ from paddle.fluid import core
from paddle import _C_ops, _legacy_C_ops from paddle import _C_ops, _legacy_C_ops
def graph_khop_sampler(row, def graph_khop_sampler(
colptr, row,
input_nodes, colptr,
sample_sizes, input_nodes,
sorted_eids=None, sample_sizes,
return_eids=False, sorted_eids=None,
name=None): return_eids=False,
name=None,
):
""" """
Graph Khop Sampler API. Graph Khop Sampler API.
This API is mainly used in Graph Learning domain, and the main purpose is to This API is mainly used in Graph Learning domain, and the main purpose is to
provide high performance graph khop sampling method with subgraph reindex step. provide high performance graph khop sampling method with subgraph reindex step.
For example, we get the CSC(Compressed Sparse Column) format of the input graph For example, we get the CSC(Compressed Sparse Column) format of the input graph
edges as `row` and `colptr`, so as to covert graph data into a suitable format edges as `row` and `colptr`, so as to covert graph data into a suitable format
for sampling. And the `input_nodes` means the nodes we need to sample neighbors, for sampling. And the `input_nodes` means the nodes we need to sample neighbors,
and `sample_sizes` means the number of neighbors and number of layers we want and `sample_sizes` means the number of neighbors and number of layers we want
to sample. to sample.
Args: Args:
row (Tensor): One of the components of the CSC format of the input graph, and row (Tensor): One of the components of the CSC format of the input graph, and
the shape should be [num_edges, 1] or [num_edges]. The available the shape should be [num_edges, 1] or [num_edges]. The available
data type is int32, int64. data type is int32, int64.
colptr (Tensor): One of the components of the CSC format of the input graph, colptr (Tensor): One of the components of the CSC format of the input graph,
and the shape should be [num_nodes + 1, 1] or [num_nodes]. and the shape should be [num_nodes + 1, 1] or [num_nodes].
The data type should be the same with `row`. The data type should be the same with `row`.
input_nodes (Tensor): The input nodes we need to sample neighbors for, and the input_nodes (Tensor): The input nodes we need to sample neighbors for, and the
data type should be the same with `row`. data type should be the same with `row`.
sample_sizes (list|tuple): The number of neighbors and number of layers we want sample_sizes (list|tuple): The number of neighbors and number of layers we want
to sample. The data type should be int, and the shape to sample. The data type should be int, and the shape
should only have one dimension. should only have one dimension.
sorted_eids (Tensor): The sorted edge ids, should not be None when `return_eids` sorted_eids (Tensor, optional): The sorted edge ids, should not be None when `return_eids`
is True. The shape should be [num_edges, 1], and the data is True. The shape should be [num_edges, 1], and the data
type should be the same with `row`. type should be the same with `row`. Default is None.
return_eids (bool): Whether to return the id of the sample edges. Default is False. return_eids (bool, optional): Whether to return the id of the sample edges. Default is False.
name (str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
edge_src (Tensor): The src index of the output edges, also means the first column of - edge_src (Tensor), The src index of the output edges, also means the first column of
the edges. The shape is [num_sample_edges, 1] currently. the edges. The shape is [num_sample_edges, 1] currently.
edge_dst (Tensor): The dst index of the output edges, also means the second column - edge_dst (Tensor), The dst index of the output edges, also means the second column
of the edges. The shape is [num_sample_edges, 1] currently. of the edges. The shape is [num_sample_edges, 1] currently.
sample_index (Tensor): The original id of the input nodes and sampled neighbor nodes. - sample_index (Tensor), The original id of the input nodes and sampled neighbor nodes.
reindex_nodes (Tensor): The reindex id of the input nodes. - reindex_nodes (Tensor), The reindex id of the input nodes.
edge_eids (Tensor): Return the id of the sample edges if `return_eids` is True. - edge_eids (Tensor), Return the id of the sample edges if `return_eids` is True.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7]
colptr = [0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13]
nodes = [0, 8, 1, 2]
sample_sizes = [2, 2]
row = paddle.to_tensor(row, dtype="int64")
colptr = paddle.to_tensor(colptr, dtype="int64")
nodes = paddle.to_tensor(nodes, dtype="int64")
row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7] edge_src, edge_dst, sample_index, reindex_nodes = paddle.incubate.graph_khop_sampler(row, colptr, nodes, sample_sizes, False)
colptr = [0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13]
nodes = [0, 8, 1, 2]
sample_sizes = [2, 2]
row = paddle.to_tensor(row, dtype="int64")
colptr = paddle.to_tensor(colptr, dtype="int64")
nodes = paddle.to_tensor(nodes, dtype="int64")
edge_src, edge_dst, sample_index, reindex_nodes = \
paddle.incubate.graph_khop_sampler(row, colptr, nodes, sample_sizes, False)
""" """
if _non_static_mode(): if _non_static_mode():
if return_eids: if return_eids:
if sorted_eids is None: if sorted_eids is None:
raise ValueError(f"`sorted_eid` should not be None " raise ValueError(
f"if return_eids is True.") f"`sorted_eid` should not be None "
edge_src, edge_dst, sample_index, reindex_nodes, edge_eids = \ f"if return_eids is True."
_legacy_C_ops.graph_khop_sampler(row, sorted_eids, )
colptr, input_nodes, (
"sample_sizes", sample_sizes, edge_src,
"return_eids", True) edge_dst,
sample_index,
reindex_nodes,
edge_eids,
) = _legacy_C_ops.graph_khop_sampler(
row,
sorted_eids,
colptr,
input_nodes,
"sample_sizes",
sample_sizes,
"return_eids",
True,
)
return edge_src, edge_dst, sample_index, reindex_nodes, edge_eids return edge_src, edge_dst, sample_index, reindex_nodes, edge_eids
else: else:
edge_src, edge_dst, sample_index, reindex_nodes, _ = \ (
_legacy_C_ops.graph_khop_sampler(row, None, edge_src,
colptr, input_nodes, edge_dst,
"sample_sizes", sample_sizes, sample_index,
"return_eids", False) reindex_nodes,
_,
) = _legacy_C_ops.graph_khop_sampler(
row,
None,
colptr,
input_nodes,
"sample_sizes",
sample_sizes,
"return_eids",
False,
)
return edge_src, edge_dst, sample_index, reindex_nodes return edge_src, edge_dst, sample_index, reindex_nodes
check_variable_and_dtype(row, "Row", ("int32", "int64"), check_variable_and_dtype(
"graph_khop_sampler") row, "Row", ("int32", "int64"), "graph_khop_sampler"
)
if return_eids: if return_eids:
if sorted_eids is None: if sorted_eids is None:
raise ValueError(f"`sorted_eid` should not be None " raise ValueError(
f"if return_eids is True.") f"`sorted_eid` should not be None " f"if return_eids is True."
check_variable_and_dtype(sorted_eids, "Eids", ("int32", "int64"), )
"graph_khop_sampler") check_variable_and_dtype(
sorted_eids, "Eids", ("int32", "int64"), "graph_khop_sampler"
)
check_variable_and_dtype(colptr, "Col_Ptr", ("int32", "int64"), check_variable_and_dtype(
"graph_khop_sampler") colptr, "Col_Ptr", ("int32", "int64"), "graph_khop_sampler"
check_variable_and_dtype(input_nodes, "X", ("int32", "int64"), )
"graph_khop_sampler") check_variable_and_dtype(
input_nodes, "X", ("int32", "int64"), "graph_khop_sampler"
)
helper = LayerHelper("graph_khop_sampler", **locals()) helper = LayerHelper("graph_khop_sampler", **locals())
edge_src = helper.create_variable_for_type_inference(dtype=row.dtype) edge_src = helper.create_variable_for_type_inference(dtype=row.dtype)
...@@ -125,24 +155,23 @@ def graph_khop_sampler(row, ...@@ -125,24 +155,23 @@ def graph_khop_sampler(row,
sample_index = helper.create_variable_for_type_inference(dtype=row.dtype) sample_index = helper.create_variable_for_type_inference(dtype=row.dtype)
reindex_nodes = helper.create_variable_for_type_inference(dtype=row.dtype) reindex_nodes = helper.create_variable_for_type_inference(dtype=row.dtype)
edge_eids = helper.create_variable_for_type_inference(dtype=row.dtype) edge_eids = helper.create_variable_for_type_inference(dtype=row.dtype)
helper.append_op(type="graph_khop_sampler", helper.append_op(
inputs={ type="graph_khop_sampler",
"Row": row, inputs={
"Eids": sorted_eids, "Row": row,
"Col_Ptr": colptr, "Eids": sorted_eids,
"X": input_nodes "Col_Ptr": colptr,
}, "X": input_nodes,
outputs={ },
"Out_Src": edge_src, outputs={
"Out_Dst": edge_dst, "Out_Src": edge_src,
"Sample_Index": sample_index, "Out_Dst": edge_dst,
"Reindex_X": reindex_nodes, "Sample_Index": sample_index,
"Out_Eids": edge_eids "Reindex_X": reindex_nodes,
}, "Out_Eids": edge_eids,
attrs={ },
"sample_sizes": sample_sizes, attrs={"sample_sizes": sample_sizes, "return_eids": return_eids},
"return_eids": return_eids )
})
if return_eids: if return_eids:
return edge_src, edge_dst, sample_index, reindex_nodes, edge_eids return edge_src, edge_dst, sample_index, reindex_nodes, edge_eids
else: else:
......
...@@ -21,18 +21,23 @@ from paddle import _C_ops, _legacy_C_ops ...@@ -21,18 +21,23 @@ from paddle import _C_ops, _legacy_C_ops
import paddle.utils.deprecated as deprecated import paddle.utils.deprecated as deprecated
@deprecated(since="2.4.0", @deprecated(
update_to="paddle.geometric.reindex_graph", since="2.4.0",
level=1, update_to="paddle.geometric.reindex_graph",
reason="paddle.incubate.graph_reindex will be removed in future") level=1,
def graph_reindex(x, reason="paddle.incubate.graph_reindex will be removed in future",
neighbors, )
count, def graph_reindex(
value_buffer=None, x,
index_buffer=None, neighbors,
flag_buffer_hashtable=False, count,
name=None): value_buffer=None,
index_buffer=None,
flag_buffer_hashtable=False,
name=None,
):
""" """
Graph Reindex API. Graph Reindex API.
This API is mainly used in Graph Learning domain, which should be used This API is mainly used in Graph Learning domain, which should be used
...@@ -40,11 +45,11 @@ def graph_reindex(x, ...@@ -40,11 +45,11 @@ def graph_reindex(x,
is to reindex the ids information of the input nodes, and return the is to reindex the ids information of the input nodes, and return the
corresponding graph edges after reindex. corresponding graph edges after reindex.
**Notes**: Notes:
The number in x should be unique, otherwise it would cause potential errors. The number in x should be unique, otherwise it would cause potential errors.
Besides, we also support multi-edge-types neighbors reindexing. If we have different Besides, we also support multi-edge-types neighbors reindexing. If we have different
edge_type neighbors for x, we should concatenate all the neighbors and count of x. edge_type neighbors for x, we should concatenate all the neighbors and count of x.
We will reindex all the nodes from 0. We will reindex all the nodes from 0.
Take input nodes x = [0, 1, 2] as an example. Take input nodes x = [0, 1, 2] as an example.
If we have neighbors = [8, 9, 0, 4, 7, 6, 7], and count = [2, 3, 2], If we have neighbors = [8, 9, 0, 4, 7, 6, 7], and count = [2, 3, 2],
...@@ -58,98 +63,105 @@ def graph_reindex(x, ...@@ -58,98 +63,105 @@ def graph_reindex(x,
should be the same with `x`. should be the same with `x`.
count (Tensor): The neighbor count of the input nodes `x`. And the count (Tensor): The neighbor count of the input nodes `x`. And the
data type should be int32. data type should be int32.
value_buffer (Tensor|None): Value buffer for hashtable. The data type should value_buffer (Tensor, optional): Value buffer for hashtable. The data type should
be int32, and should be filled with -1. be int32, and should be filled with -1. Default is None.
index_buffer (Tensor|None): Index buffer for hashtable. The data type should index_buffer (Tensor, optional): Index buffer for hashtable. The data type should
be int32, and should be filled with -1. be int32, and should be filled with -1. Default is None.
flag_buffer_hashtable (bool): Whether to use buffer for hashtable to speed up. flag_buffer_hashtable (bool, optional): Whether to use buffer for hashtable to speed up.
Default is False. Only useful for gpu version currently. Default is False. Only useful for gpu version currently.
name (str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
reindex_src (Tensor): The source node index of graph edges after reindex. - reindex_src (Tensor), The source node index of graph edges after reindex.
reindex_dst (Tensor): The destination node index of graph edges after reindex. - reindex_dst (Tensor), The destination node index of graph edges after reindex.
out_nodes (Tensor): The index of unique input nodes and neighbors before reindex, - out_nodes (Tensor), The index of unique input nodes and neighbors before reindex,
where we put the input nodes `x` in the front, and put neighbor where we put the input nodes `x` in the front, and put neighbor
nodes in the back. nodes in the back.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
x = [0, 1, 2] x = [0, 1, 2]
neighbors_e1 = [8, 9, 0, 4, 7, 6, 7] neighbors_e1 = [8, 9, 0, 4, 7, 6, 7]
count_e1 = [2, 3, 2] count_e1 = [2, 3, 2]
x = paddle.to_tensor(x, dtype="int64") x = paddle.to_tensor(x, dtype="int64")
neighbors_e1 = paddle.to_tensor(neighbors_e1, dtype="int64") neighbors_e1 = paddle.to_tensor(neighbors_e1, dtype="int64")
count_e1 = paddle.to_tensor(count_e1, dtype="int32") count_e1 = paddle.to_tensor(count_e1, dtype="int32")
reindex_src, reindex_dst, out_nodes = \ reindex_src, reindex_dst, out_nodes = \
paddle.incubate.graph_reindex(x, neighbors_e1, count_e1) paddle.incubate.graph_reindex(x, neighbors_e1, count_e1)
# reindex_src: [3, 4, 0, 5, 6, 7, 6] # reindex_src: [3, 4, 0, 5, 6, 7, 6]
# reindex_dst: [0, 0, 1, 1, 1, 2, 2] # reindex_dst: [0, 0, 1, 1, 1, 2, 2]
# out_nodes: [0, 1, 2, 8, 9, 4, 7, 6] # out_nodes: [0, 1, 2, 8, 9, 4, 7, 6]
neighbors_e2 = [0, 2, 3, 5, 1] neighbors_e2 = [0, 2, 3, 5, 1]
count_e2 = [1, 3, 1] count_e2 = [1, 3, 1]
neighbors_e2 = paddle.to_tensor(neighbors_e2, dtype="int64") neighbors_e2 = paddle.to_tensor(neighbors_e2, dtype="int64")
count_e2 = paddle.to_tensor(count_e2, dtype="int32") count_e2 = paddle.to_tensor(count_e2, dtype="int32")
neighbors = paddle.concat([neighbors_e1, neighbors_e2]) neighbors = paddle.concat([neighbors_e1, neighbors_e2])
count = paddle.concat([count_e1, count_e2]) count = paddle.concat([count_e1, count_e2])
reindex_src, reindex_dst, out_nodes = \ reindex_src, reindex_dst, out_nodes = \
paddle.incubate.graph_reindex(x, neighbors, count) paddle.incubate.graph_reindex(x, neighbors, count)
# reindex_src: [3, 4, 0, 5, 6, 7, 6, 0, 2, 8, 9, 1] # reindex_src: [3, 4, 0, 5, 6, 7, 6, 0, 2, 8, 9, 1]
# reindex_dst: [0, 0, 1, 1, 1, 2, 2, 0, 1, 1, 1, 2] # reindex_dst: [0, 0, 1, 1, 1, 2, 2, 0, 1, 1, 1, 2]
# out_nodes: [0, 1, 2, 8, 9, 4, 7, 6, 3, 5] # out_nodes: [0, 1, 2, 8, 9, 4, 7, 6, 3, 5]
""" """
if flag_buffer_hashtable: if flag_buffer_hashtable:
if value_buffer is None or index_buffer is None: if value_buffer is None or index_buffer is None:
raise ValueError(f"`value_buffer` and `index_buffer` should not" raise ValueError(
"be None if `flag_buffer_hashtable` is True.") f"`value_buffer` and `index_buffer` should not"
"be None if `flag_buffer_hashtable` is True."
)
if _non_static_mode(): if _non_static_mode():
reindex_src, reindex_dst, out_nodes = \ reindex_src, reindex_dst, out_nodes = _legacy_C_ops.graph_reindex(
_legacy_C_ops.graph_reindex(x, neighbors, count, value_buffer, index_buffer, x,
"flag_buffer_hashtable", flag_buffer_hashtable) neighbors,
count,
value_buffer,
index_buffer,
"flag_buffer_hashtable",
flag_buffer_hashtable,
)
return reindex_src, reindex_dst, out_nodes return reindex_src, reindex_dst, out_nodes
check_variable_and_dtype(x, "X", ("int32", "int64"), "graph_reindex") check_variable_and_dtype(x, "X", ("int32", "int64"), "graph_reindex")
check_variable_and_dtype(neighbors, "Neighbors", ("int32", "int64"), check_variable_and_dtype(
"graph_reindex") neighbors, "Neighbors", ("int32", "int64"), "graph_reindex"
)
check_variable_and_dtype(count, "Count", ("int32"), "graph_reindex") check_variable_and_dtype(count, "Count", ("int32"), "graph_reindex")
if flag_buffer_hashtable: if flag_buffer_hashtable:
check_variable_and_dtype(value_buffer, "HashTable_Value", ("int32"), check_variable_and_dtype(
"graph_reindex") value_buffer, "HashTable_Value", ("int32"), "graph_reindex"
check_variable_and_dtype(index_buffer, "HashTable_Index", ("int32"), )
"graph_reindex") check_variable_and_dtype(
index_buffer, "HashTable_Index", ("int32"), "graph_reindex"
)
helper = LayerHelper("graph_reindex", **locals()) helper = LayerHelper("graph_reindex", **locals())
reindex_src = helper.create_variable_for_type_inference(dtype=x.dtype) reindex_src = helper.create_variable_for_type_inference(dtype=x.dtype)
reindex_dst = helper.create_variable_for_type_inference(dtype=x.dtype) reindex_dst = helper.create_variable_for_type_inference(dtype=x.dtype)
out_nodes = helper.create_variable_for_type_inference(dtype=x.dtype) out_nodes = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op(type="graph_reindex", helper.append_op(
inputs={ type="graph_reindex",
"X": inputs={
x, "X": x,
"Neighbors": "Neighbors": neighbors,
neighbors, "Count": count,
"Count": "HashTable_Value": value_buffer if flag_buffer_hashtable else None,
count, "HashTable_Index": index_buffer if flag_buffer_hashtable else None,
"HashTable_Value": },
value_buffer if flag_buffer_hashtable else None, outputs={
"HashTable_Index": "Reindex_Src": reindex_src,
index_buffer if flag_buffer_hashtable else None, "Reindex_Dst": reindex_dst,
}, "Out_Nodes": out_nodes,
outputs={ },
"Reindex_Src": reindex_src, attrs={"flag_buffer_hashtable": flag_buffer_hashtable},
"Reindex_Dst": reindex_dst, )
"Out_Nodes": out_nodes
},
attrs={"flag_buffer_hashtable": flag_buffer_hashtable})
return reindex_src, reindex_dst, out_nodes return reindex_src, reindex_dst, out_nodes
...@@ -25,17 +25,21 @@ import paddle.utils.deprecated as deprecated ...@@ -25,17 +25,21 @@ import paddle.utils.deprecated as deprecated
since="2.4.0", since="2.4.0",
update_to="paddle.geometric.sample_neighbors", update_to="paddle.geometric.sample_neighbors",
level=1, level=1,
reason="paddle.incubate.graph_sample_neighbors will be removed in future") reason="paddle.incubate.graph_sample_neighbors will be removed in future",
def graph_sample_neighbors(row, )
colptr, def graph_sample_neighbors(
input_nodes, row,
eids=None, colptr,
perm_buffer=None, input_nodes,
sample_size=-1, eids=None,
return_eids=False, perm_buffer=None,
flag_perm_buffer=False, sample_size=-1,
name=None): return_eids=False,
flag_perm_buffer=False,
name=None,
):
""" """
Graph Sample Neighbors API. Graph Sample Neighbors API.
This API is mainly used in Graph Learning domain, and the main purpose is to This API is mainly used in Graph Learning domain, and the main purpose is to
...@@ -71,86 +75,109 @@ def graph_sample_neighbors(row, ...@@ -71,86 +75,109 @@ def graph_sample_neighbors(row,
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
out_neighbors (Tensor): The sample neighbors of the input nodes. - out_neighbors (Tensor), The sample neighbors of the input nodes.
out_count (Tensor): The number of sampling neighbors of each input node, and the shape - out_count (Tensor), The number of sampling neighbors of each input node, and the shape should be the same with `input_nodes`.
should be the same with `input_nodes`. - out_eids (Tensor), If `return_eids` is True, we will return the eid information of the sample edges.
out_eids (Tensor): If `return_eids` is True, we will return the eid information of the
sample edges.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle
# edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4), import paddle
# (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8) # edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4),
row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7] # (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8)
colptr = [0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13] row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7]
nodes = [0, 8, 1, 2] colptr = [0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13]
sample_size = 2 nodes = [0, 8, 1, 2]
row = paddle.to_tensor(row, dtype="int64") sample_size = 2
colptr = paddle.to_tensor(colptr, dtype="int64") row = paddle.to_tensor(row, dtype="int64")
nodes = paddle.to_tensor(nodes, dtype="int64") colptr = paddle.to_tensor(colptr, dtype="int64")
out_neighbors, out_count = \ nodes = paddle.to_tensor(nodes, dtype="int64")
paddle.incubate.graph_sample_neighbors(row, colptr, nodes, out_neighbors, out_count = \
sample_size=sample_size) paddle.incubate.graph_sample_neighbors(row, colptr, nodes,
sample_size=sample_size)
""" """
if return_eids: if return_eids:
if eids is None: if eids is None:
raise ValueError( raise ValueError(
f"`eids` should not be None if `return_eids` is True.") f"`eids` should not be None if `return_eids` is True."
)
if flag_perm_buffer: if flag_perm_buffer:
if perm_buffer is None: if perm_buffer is None:
raise ValueError( raise ValueError(
f"`perm_buffer` should not be None if `flag_perm_buffer`" f"`perm_buffer` should not be None if `flag_perm_buffer`"
"is True.") "is True."
)
if _non_static_mode(): if _non_static_mode():
out_neighbors, out_count, out_eids = _legacy_C_ops.graph_sample_neighbors( (
row, colptr, input_nodes, eids, perm_buffer, "sample_size", out_neighbors,
sample_size, "return_eids", return_eids, "flag_perm_buffer", out_count,
flag_perm_buffer) out_eids,
) = _legacy_C_ops.graph_sample_neighbors(
row,
colptr,
input_nodes,
eids,
perm_buffer,
"sample_size",
sample_size,
"return_eids",
return_eids,
"flag_perm_buffer",
flag_perm_buffer,
)
if return_eids: if return_eids:
return out_neighbors, out_count, out_eids return out_neighbors, out_count, out_eids
return out_neighbors, out_count return out_neighbors, out_count
check_variable_and_dtype(row, "Row", ("int32", "int64"), check_variable_and_dtype(
"graph_sample_neighbors") row, "Row", ("int32", "int64"), "graph_sample_neighbors"
check_variable_and_dtype(colptr, "Col_Ptr", ("int32", "int64"), )
"graph_sample_neighbors") check_variable_and_dtype(
check_variable_and_dtype(input_nodes, "X", ("int32", "int64"), colptr, "Col_Ptr", ("int32", "int64"), "graph_sample_neighbors"
"graph_sample_neighbors") )
check_variable_and_dtype(
input_nodes, "X", ("int32", "int64"), "graph_sample_neighbors"
)
if return_eids: if return_eids:
check_variable_and_dtype(eids, "Eids", ("int32", "int64"), check_variable_and_dtype(
"graph_sample_neighbors") eids, "Eids", ("int32", "int64"), "graph_sample_neighbors"
)
if flag_perm_buffer: if flag_perm_buffer:
check_variable_and_dtype(perm_buffer, "Perm_Buffer", ("int32", "int64"), check_variable_and_dtype(
"graph_sample_neighbors") perm_buffer,
"Perm_Buffer",
("int32", "int64"),
"graph_sample_neighbors",
)
helper = LayerHelper("graph_sample_neighbors", **locals()) helper = LayerHelper("graph_sample_neighbors", **locals())
out_neighbors = helper.create_variable_for_type_inference(dtype=row.dtype) out_neighbors = helper.create_variable_for_type_inference(dtype=row.dtype)
out_count = helper.create_variable_for_type_inference(dtype=row.dtype) out_count = helper.create_variable_for_type_inference(dtype=row.dtype)
out_eids = helper.create_variable_for_type_inference(dtype=row.dtype) out_eids = helper.create_variable_for_type_inference(dtype=row.dtype)
helper.append_op(type="graph_sample_neighbors", helper.append_op(
inputs={ type="graph_sample_neighbors",
"Row": row, inputs={
"Col_Ptr": colptr, "Row": row,
"X": input_nodes, "Col_Ptr": colptr,
"Eids": eids if return_eids else None, "X": input_nodes,
"Perm_Buffer": "Eids": eids if return_eids else None,
perm_buffer if flag_perm_buffer else None "Perm_Buffer": perm_buffer if flag_perm_buffer else None,
}, },
outputs={ outputs={
"Out": out_neighbors, "Out": out_neighbors,
"Out_Count": out_count, "Out_Count": out_count,
"Out_Eids": out_eids "Out_Eids": out_eids,
}, },
attrs={ attrs={
"sample_size": sample_size, "sample_size": sample_size,
"return_eids": return_eids, "return_eids": return_eids,
"flag_perm_buffer": flag_perm_buffer "flag_perm_buffer": flag_perm_buffer,
}) },
)
if return_eids: if return_eids:
return out_neighbors, out_count, out_eids return out_neighbors, out_count, out_eids
return out_neighbors, out_count return out_neighbors, out_count
...@@ -36,106 +36,232 @@ from paddle import _C_ops, _legacy_C_ops ...@@ -36,106 +36,232 @@ from paddle import _C_ops, _legacy_C_ops
__all__ = ['resnet_basic_block', 'ResNetBasicBlock'] __all__ = ['resnet_basic_block', 'ResNetBasicBlock']
def resnet_basic_block(x, def resnet_basic_block(
filter1, x,
scale1, filter1,
bias1, scale1,
mean1, bias1,
var1, mean1,
filter2, var1,
scale2, filter2,
bias2, scale2,
mean2, bias2,
var2, mean2,
filter3, var2,
scale3, filter3,
bias3, scale3,
mean3, bias3,
var3, mean3,
stride1, var3,
stride2, stride1,
stride3, stride2,
padding1, stride3,
padding2, padding1,
padding3, padding2,
dilation1, padding3,
dilation2, dilation1,
dilation3, dilation2,
groups, dilation3,
momentum, groups,
eps, momentum,
data_format, eps,
has_shortcut, data_format,
use_global_stats=None, has_shortcut,
training=False, use_global_stats=None,
trainable_statistics=False, training=False,
find_conv_max=True): trainable_statistics=False,
find_conv_max=True,
):
if fluid.framework.in_dygraph_mode(): if fluid.framework.in_dygraph_mode():
attrs = ('stride1', stride1, 'stride2', stride2, 'stride3', stride3, attrs = (
'padding1', padding1, 'padding2', padding2, 'padding3', 'stride1',
padding3, 'dilation1', dilation1, 'dilation2', dilation2, stride1,
'dilation3', dilation3, 'group', groups, 'momentum', momentum, 'stride2',
'epsilon', eps, 'data_format', data_format, 'has_shortcut', stride2,
has_shortcut, 'use_global_stats', use_global_stats, 'stride3',
"trainable_statistics", trainable_statistics, 'is_test', stride3,
not training, 'act_type', "relu", 'find_conv_input_max', 'padding1',
find_conv_max) padding1,
'padding2',
out, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _ = \ padding2,
getattr(_C_ops, "resnet_basic_block")(x, filter1, scale1, bias1, mean1, var1, filter2, scale2, bias2, mean2, var2, \ 'padding3',
filter3, scale3, bias3, mean3, var3, mean1, var1, mean2, var2, mean3, var3, *attrs) padding3,
'dilation1',
dilation1,
'dilation2',
dilation2,
'dilation3',
dilation3,
'group',
groups,
'momentum',
momentum,
'epsilon',
eps,
'data_format',
data_format,
'has_shortcut',
has_shortcut,
'use_global_stats',
use_global_stats,
"trainable_statistics",
trainable_statistics,
'is_test',
not training,
'act_type',
"relu",
'find_conv_input_max',
find_conv_max,
)
(
out,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
) = getattr(_C_ops, "resnet_basic_block")(
x,
filter1,
scale1,
bias1,
mean1,
var1,
filter2,
scale2,
bias2,
mean2,
var2,
filter3,
scale3,
bias3,
mean3,
var3,
mean1,
var1,
mean2,
var2,
mean3,
var3,
*attrs
)
return out return out
helper = LayerHelper('resnet_basic_block', **locals()) helper = LayerHelper('resnet_basic_block', **locals())
bn_param_dtype = fluid.core.VarDesc.VarType.FP32 bn_param_dtype = fluid.core.VarDesc.VarType.FP32
max_dtype = fluid.core.VarDesc.VarType.FP32 max_dtype = fluid.core.VarDesc.VarType.FP32
out = helper.create_variable_for_type_inference(dtype=x.dtype, out = helper.create_variable_for_type_inference(
stop_gradient=True) dtype=x.dtype, stop_gradient=True
conv1 = helper.create_variable_for_type_inference(dtype=x.dtype, )
stop_gradient=True) conv1 = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=True
)
saved_mean1 = helper.create_variable_for_type_inference( saved_mean1 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) dtype=bn_param_dtype, stop_gradient=True
)
saved_invstd1 = helper.create_variable_for_type_inference( saved_invstd1 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) dtype=bn_param_dtype, stop_gradient=True
running_mean1 = helper.create_variable_for_type_inference( )
dtype=bn_param_dtype, stop_gradient=True) if mean1 is None else mean1 running_mean1 = (
running_var1 = helper.create_variable_for_type_inference( helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) if var1 is None else var1 dtype=bn_param_dtype, stop_gradient=True
conv2 = helper.create_variable_for_type_inference(dtype=x.dtype, )
stop_gradient=True) if mean1 is None
conv2_input = helper.create_variable_for_type_inference(dtype=x.dtype, else mean1
stop_gradient=True) )
running_var1 = (
helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True
)
if var1 is None
else var1
)
conv2 = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=True
)
conv2_input = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=True
)
saved_mean2 = helper.create_variable_for_type_inference( saved_mean2 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) dtype=bn_param_dtype, stop_gradient=True
)
saved_invstd2 = helper.create_variable_for_type_inference( saved_invstd2 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) dtype=bn_param_dtype, stop_gradient=True
running_mean2 = helper.create_variable_for_type_inference( )
dtype=bn_param_dtype, stop_gradient=True) if mean2 is None else mean2 running_mean2 = (
running_var2 = helper.create_variable_for_type_inference( helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) if var2 is None else var2 dtype=bn_param_dtype, stop_gradient=True
conv3 = helper.create_variable_for_type_inference(dtype=x.dtype, )
stop_gradient=True) if mean2 is None
else mean2
)
running_var2 = (
helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True
)
if var2 is None
else var2
)
conv3 = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=True
)
saved_mean3 = helper.create_variable_for_type_inference( saved_mean3 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) dtype=bn_param_dtype, stop_gradient=True
)
saved_invstd3 = helper.create_variable_for_type_inference( saved_invstd3 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) dtype=bn_param_dtype, stop_gradient=True
running_mean3 = helper.create_variable_for_type_inference( )
dtype=bn_param_dtype, stop_gradient=True) if mean3 is None else mean3 running_mean3 = (
running_var3 = helper.create_variable_for_type_inference( helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) if var3 is None else var3 dtype=bn_param_dtype, stop_gradient=True
)
if mean3 is None
else mean3
)
running_var3 = (
helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True
)
if var3 is None
else var3
)
conv1_input_max = helper.create_variable_for_type_inference( conv1_input_max = helper.create_variable_for_type_inference(
dtype=max_dtype, stop_gradient=True) dtype=max_dtype, stop_gradient=True
)
conv1_filter_max = helper.create_variable_for_type_inference( conv1_filter_max = helper.create_variable_for_type_inference(
dtype=max_dtype, stop_gradient=True) dtype=max_dtype, stop_gradient=True
)
conv2_input_max = helper.create_variable_for_type_inference( conv2_input_max = helper.create_variable_for_type_inference(
dtype=max_dtype, stop_gradient=True) dtype=max_dtype, stop_gradient=True
)
conv2_filter_max = helper.create_variable_for_type_inference( conv2_filter_max = helper.create_variable_for_type_inference(
dtype=max_dtype, stop_gradient=True) dtype=max_dtype, stop_gradient=True
)
conv3_input_max = helper.create_variable_for_type_inference( conv3_input_max = helper.create_variable_for_type_inference(
dtype=max_dtype, stop_gradient=True) dtype=max_dtype, stop_gradient=True
)
conv3_filter_max = helper.create_variable_for_type_inference( conv3_filter_max = helper.create_variable_for_type_inference(
dtype=max_dtype, stop_gradient=True) dtype=max_dtype, stop_gradient=True
)
inputs = { inputs = {
'X': x, 'X': x,
...@@ -175,7 +301,7 @@ def resnet_basic_block(x, ...@@ -175,7 +301,7 @@ def resnet_basic_block(x,
"trainable_statistics": trainable_statistics, "trainable_statistics": trainable_statistics,
'is_test': not training, 'is_test': not training,
'act_type': "relu", 'act_type': "relu",
'find_conv_input_max': find_conv_max 'find_conv_input_max': find_conv_max,
} }
outputs = { outputs = {
...@@ -203,88 +329,172 @@ def resnet_basic_block(x, ...@@ -203,88 +329,172 @@ def resnet_basic_block(x,
'MaxInput3': conv3_input_max, 'MaxInput3': conv3_input_max,
'MaxFilter3': conv3_filter_max, 'MaxFilter3': conv3_filter_max,
} }
helper.append_op(type='resnet_basic_block', helper.append_op(
inputs=inputs, type='resnet_basic_block', inputs=inputs, outputs=outputs, attrs=attrs
outputs=outputs, )
attrs=attrs)
return out return out
class ResNetBasicBlock(Layer): class ResNetBasicBlock(Layer):
""" r"""
ResNetBasicBlock is designed for optimize the performence of the basic unit of ssd resnet block. ResNetBasicBlock is designed for optimize the performence of the basic unit of ssd resnet block.
The fusion op architecture like this: If has_shortcut = True, it can calculate 3 Conv2D, 3 BatchNorm and 2 ReLU in one time.
has_shortcut = True: else: If has_shortcut = False, it can calculate 2 Conv2D, 2 BatchNorm and 2 ReLU in one time. In this
X X case the shape of output is same with input.
/ /
| | | |
CONV1 | CONV1 | Args:
| | | | num_channels (int): The number of input image channel.
BN1 | BN1 | num_filter (int): The number of filter. It is as same as the output image channel.
| | | | filter_size (int|list|tuple): The filter size. If filter_size
RELU1 | RELU1 | is a tuple, it must contain two integers, (filter_size_height,
| | | | filter_size_width). Otherwise, filter_size_height = filter_size_width =\
CONV2 CONV3 CONV2 | filter_size.
| | | | stride (int, optional): The stride size. It means the stride in convolution.
BN2 BN3 BN2 | If stride is a tuple, it must contain two integers, (stride_height, stride_width).
\ / \ / Otherwise, stride_height = stride_width = stride. Default: stride = 1.
ADD ADD act (str, optional): Activation type, if it is set to None, activation is not appended.
| | Default: None
RELU RELU momentum (float, optional): The value used for the moving_mean and
| | moving_var computation. This should be a float number or a Tensor with
Y Y shape [1] and data type as float32. The updated formula is:
:math:`moving\_mean = moving\_mean * momentum + new\_mean * (1. - momentum)`
:math:`moving\_var = moving\_var * momentum + new\_var * (1. - momentum)`
Default is 0.9.
eps (float, optional): A value added to the denominator for
numerical stability. Default is 1e-5.
data_format (str, optional): Specify the data format of the input, and the data format of the output
will be consistent with that of the input. Now is only support `"NCHW"`, the data is stored in
the order of: `[batch_size, input_channels, input_height, input_width]`.
has_shortcut (bool, optional): Whether to calculate CONV3 and BN3. Default: False.
use_global_stats (bool, optional): Whether to use global mean and
variance. In inference or test mode, set use_global_stats to true
or is_test to true, and the behavior is equivalent.
In train mode, when setting use_global_stats True, the global mean
and variance are also used during train period. Default: False.
is_test (bool, optional): A flag indicating whether it is in
test phrase or not. Default: False.
filter_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights
of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
will create ParamAttr as param_attr. Default: None.
scale_attr (ParamAttr, optional): The parameter attribute for Parameter `scale`
of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm will create ParamAttr
as param_attr, the name of scale can be set in ParamAttr. If the Initializer of the param_attr is not set,
the parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr, optional): The parameter attribute for the bias of batch_norm.
If it is set to None or one attribute of ParamAttr, batch_norm
will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr.
If the Initializer of the bias_attr is not set, the bias is initialized zero.
Default: None.
moving_mean_name (str, optional): The name of moving_mean which store the global Mean. If it
is set to None, batch_norm will save global mean with a random name, otherwise, batch_norm
will save global mean with the string. Default: None.
moving_var_name (str, optional): The name of the moving_variance which store the global Variance.
If it is set to None, batch_norm will save global variance with a random name, otherwise, batch_norm
will save global variance with the string. Default: None.
padding (int, optional): The padding size. It is only spupport padding_height = padding_width = padding.
Default: padding = 0.
dilation (int, optional): The dilation size. It means the spacing between the kernel
points. It is only spupport dilation_height = dilation_width = dilation.
Default: dilation = 1.
trainable_statistics (bool, optional): Whether to calculate mean and var in eval mode. In eval mode, when
setting trainable_statistics True, mean and variance will be calculated by current batch statistics.
Default: False.
find_conv_max (bool, optional): Whether to calculate max value of each conv2d. Default: True.
Returns:
A Tensor representing the ResNetBasicBlock, whose data type is the same with input.
Examples:
.. code-block:: python
# required: xpu
import paddle
from paddle.incubate.xpu.resnet_block import ResNetBasicBlock
ch_in = 4
ch_out = 8
x = paddle.uniform((2, ch_in, 16, 16), dtype='float32', min=-1., max=1.)
resnet_basic_block = ResNetBasicBlock(num_channels1=ch_in,
num_filter1=ch_out,
filter1_size=3,
num_channels2=ch_out,
num_filter2=ch_out,
filter2_size=3,
num_channels3=ch_in,
num_filter3=ch_out,
filter3_size=1,
stride1=1,
stride2=1,
stride3=1,
act='relu',
padding1=1,
padding2=1,
padding3=0,
has_shortcut=True)
out = resnet_basic_block.forward(x)
print(out.shape) # [2, 8, 16, 16]
""" """
def __init__(self, def __init__(
num_channels1, self,
num_filter1, num_channels1,
filter1_size, num_filter1,
num_channels2, filter1_size,
num_filter2, num_channels2,
filter2_size, num_filter2,
num_channels3, filter2_size,
num_filter3, num_channels3,
filter3_size, num_filter3,
stride1=1, filter3_size,
stride2=1, stride1=1,
stride3=1, stride2=1,
act='relu', stride3=1,
momentum=0.9, act='relu',
eps=1e-5, momentum=0.9,
data_format='NCHW', eps=1e-5,
has_shortcut=False, data_format='NCHW',
use_global_stats=False, has_shortcut=False,
is_test=False, use_global_stats=False,
filter1_attr=None, is_test=False,
scale1_attr=None, filter1_attr=None,
bias1_attr=None, scale1_attr=None,
moving_mean1_name=None, bias1_attr=None,
moving_var1_name=None, moving_mean1_name=None,
filter2_attr=None, moving_var1_name=None,
scale2_attr=None, filter2_attr=None,
bias2_attr=None, scale2_attr=None,
moving_mean2_name=None, bias2_attr=None,
moving_var2_name=None, moving_mean2_name=None,
filter3_attr=None, moving_var2_name=None,
scale3_attr=None, filter3_attr=None,
bias3_attr=None, scale3_attr=None,
moving_mean3_name=None, bias3_attr=None,
moving_var3_name=None, moving_mean3_name=None,
padding1=0, moving_var3_name=None,
padding2=0, padding1=0,
padding3=0, padding2=0,
dilation1=1, padding3=0,
dilation2=1, dilation1=1,
dilation3=1, dilation2=1,
trainable_statistics=False, dilation3=1,
find_conv_max=True): trainable_statistics=False,
find_conv_max=True,
):
super(ResNetBasicBlock, self).__init__() super(ResNetBasicBlock, self).__init__()
self._stride1 = stride1 self._stride1 = stride1
self._stride2 = stride2 self._stride2 = stride2
self._kernel1_size = utils.convert_to_list(filter1_size, 2, self._kernel1_size = utils.convert_to_list(
'filter1_size') filter1_size, 2, 'filter1_size'
self._kernel2_size = utils.convert_to_list(filter2_size, 2, )
'filter2_size') self._kernel2_size = utils.convert_to_list(
filter2_size, 2, 'filter2_size'
)
self._dilation1 = dilation1 self._dilation1 = dilation1
self._dilation2 = dilation2 self._dilation2 = dilation2
self._padding1 = padding1 self._padding1 = padding1
...@@ -301,8 +511,9 @@ class ResNetBasicBlock(Layer): ...@@ -301,8 +511,9 @@ class ResNetBasicBlock(Layer):
self._find_conv_max = find_conv_max self._find_conv_max = find_conv_max
if has_shortcut: if has_shortcut:
self._kernel3_size = utils.convert_to_list(filter3_size, 2, self._kernel3_size = utils.convert_to_list(
'filter3_size') filter3_size, 2, 'filter3_size'
)
self._padding3 = padding3 self._padding3 = padding3
self._stride3 = stride3 self._stride3 = stride3
self._dilation3 = dilation3 self._dilation3 = dilation3
...@@ -317,11 +528,13 @@ class ResNetBasicBlock(Layer): ...@@ -317,11 +528,13 @@ class ResNetBasicBlock(Layer):
if data_format not in valid_format: if data_format not in valid_format:
raise ValueError( raise ValueError(
"conv_format must be one of {}, but got conv_format={}".format( "conv_format must be one of {}, but got conv_format={}".format(
valid_format, data_format)) valid_format, data_format
)
)
def _get_default_param_initializer(channels, kernel_size): def _get_default_param_initializer(channels, kernel_size):
filter_elem_num = np.prod(kernel_size) * channels filter_elem_num = np.prod(kernel_size) * channels
std = (2.0 / filter_elem_num)**0.5 std = (2.0 / filter_elem_num) ** 0.5
return I.Normal(0.0, std) return I.Normal(0.0, std)
# init filter # init filter
...@@ -335,92 +548,128 @@ class ResNetBasicBlock(Layer): ...@@ -335,92 +548,128 @@ class ResNetBasicBlock(Layer):
shape=filter1_shape, shape=filter1_shape,
attr=filter1_attr, attr=filter1_attr,
default_initializer=_get_default_param_initializer( default_initializer=_get_default_param_initializer(
num_channels1, self._kernel1_size)) num_channels1, self._kernel1_size
),
)
self.scale_1 = self.create_parameter( self.scale_1 = self.create_parameter(
shape=bn1_param_shape, shape=bn1_param_shape,
attr=scale1_attr, attr=scale1_attr,
dtype=bn_param_dtype, dtype=bn_param_dtype,
default_initializer=I.Constant(1.0)) default_initializer=I.Constant(1.0),
self.bias_1 = self.create_parameter(shape=bn1_param_shape, )
attr=bias1_attr, self.bias_1 = self.create_parameter(
dtype=bn_param_dtype, shape=bn1_param_shape,
is_bias=True) attr=bias1_attr,
self.mean_1 = self.create_parameter(attr=ParamAttr( dtype=bn_param_dtype,
name=moving_mean1_name, is_bias=True,
initializer=I.Constant(0.0), )
trainable=False), self.mean_1 = self.create_parameter(
shape=bn1_param_shape, attr=ParamAttr(
dtype=bn_param_dtype) name=moving_mean1_name,
initializer=I.Constant(0.0),
trainable=False,
),
shape=bn1_param_shape,
dtype=bn_param_dtype,
)
self.mean_1.stop_gradient = True self.mean_1.stop_gradient = True
self.var_1 = self.create_parameter( self.var_1 = self.create_parameter(
attr=ParamAttr(name=moving_var1_name, attr=ParamAttr(
initializer=I.Constant(1.0), name=moving_var1_name,
trainable=False), initializer=I.Constant(1.0),
trainable=False,
),
shape=bn1_param_shape, shape=bn1_param_shape,
dtype=bn_param_dtype) dtype=bn_param_dtype,
)
self.var_1.stop_gradient = True self.var_1.stop_gradient = True
self.filter_2 = self.create_parameter( self.filter_2 = self.create_parameter(
shape=filter2_shape, shape=filter2_shape,
attr=filter2_attr, attr=filter2_attr,
default_initializer=_get_default_param_initializer( default_initializer=_get_default_param_initializer(
num_channels2, self._kernel2_size)) num_channels2, self._kernel2_size
),
)
self.scale_2 = self.create_parameter( self.scale_2 = self.create_parameter(
shape=bn2_param_shape, shape=bn2_param_shape,
attr=scale2_attr, attr=scale2_attr,
dtype=bn_param_dtype, dtype=bn_param_dtype,
default_initializer=I.Constant(1.0)) default_initializer=I.Constant(1.0),
self.bias_2 = self.create_parameter(shape=bn2_param_shape, )
attr=bias2_attr, self.bias_2 = self.create_parameter(
dtype=bn_param_dtype, shape=bn2_param_shape,
is_bias=True) attr=bias2_attr,
self.mean_2 = self.create_parameter(attr=ParamAttr( dtype=bn_param_dtype,
name=moving_mean2_name, is_bias=True,
initializer=I.Constant(0.0), )
trainable=False), self.mean_2 = self.create_parameter(
shape=bn2_param_shape, attr=ParamAttr(
dtype=bn_param_dtype) name=moving_mean2_name,
initializer=I.Constant(0.0),
trainable=False,
),
shape=bn2_param_shape,
dtype=bn_param_dtype,
)
self.mean_2.stop_gradient = True self.mean_2.stop_gradient = True
self.var_2 = self.create_parameter( self.var_2 = self.create_parameter(
attr=ParamAttr(name=moving_var2_name, attr=ParamAttr(
initializer=I.Constant(1.0), name=moving_var2_name,
trainable=False), initializer=I.Constant(1.0),
trainable=False,
),
shape=bn2_param_shape, shape=bn2_param_shape,
dtype=bn_param_dtype) dtype=bn_param_dtype,
)
self.var_2.stop_gradient = True self.var_2.stop_gradient = True
if has_shortcut: if has_shortcut:
bn3_param_shape = [1, 1, num_filter3] bn3_param_shape = [1, 1, num_filter3]
filter3_shape = [ filter3_shape = [
num_filter3, num_channels3, filter3_size, filter3_size num_filter3,
num_channels3,
filter3_size,
filter3_size,
] ]
self.filter_3 = self.create_parameter( self.filter_3 = self.create_parameter(
shape=filter3_shape, shape=filter3_shape,
attr=filter3_attr, attr=filter3_attr,
default_initializer=_get_default_param_initializer( default_initializer=_get_default_param_initializer(
num_channels3, self._kernel3_size)) num_channels3, self._kernel3_size
),
)
self.scale_3 = self.create_parameter( self.scale_3 = self.create_parameter(
shape=bn3_param_shape, shape=bn3_param_shape,
attr=scale3_attr, attr=scale3_attr,
dtype=bn_param_dtype, dtype=bn_param_dtype,
default_initializer=I.Constant(1.0)) default_initializer=I.Constant(1.0),
self.bias_3 = self.create_parameter(shape=bn3_param_shape, )
attr=bias3_attr, self.bias_3 = self.create_parameter(
dtype=bn_param_dtype, shape=bn3_param_shape,
is_bias=True) attr=bias3_attr,
self.mean_3 = self.create_parameter(attr=ParamAttr( dtype=bn_param_dtype,
name=moving_mean3_name, is_bias=True,
initializer=I.Constant(0.0), )
trainable=False), self.mean_3 = self.create_parameter(
shape=bn3_param_shape, attr=ParamAttr(
dtype=bn_param_dtype) name=moving_mean3_name,
initializer=I.Constant(0.0),
trainable=False,
),
shape=bn3_param_shape,
dtype=bn_param_dtype,
)
self.mean_3.stop_gradient = True self.mean_3.stop_gradient = True
self.var_3 = self.create_parameter(attr=ParamAttr( self.var_3 = self.create_parameter(
name=moving_var3_name, attr=ParamAttr(
initializer=I.Constant(1.0), name=moving_var3_name,
trainable=False), initializer=I.Constant(1.0),
shape=bn3_param_shape, trainable=False,
dtype=bn_param_dtype) ),
shape=bn3_param_shape,
dtype=bn_param_dtype,
)
self.var_3.stop_gradient = True self.var_3.stop_gradient = True
else: else:
self.filter_3 = None self.filter_3 = None
...@@ -464,5 +713,6 @@ class ResNetBasicBlock(Layer): ...@@ -464,5 +713,6 @@ class ResNetBasicBlock(Layer):
use_global_stats=self._use_global_stats, use_global_stats=self._use_global_stats,
training=self.training, training=self.training,
trainable_statistics=self._trainable_statistics, trainable_statistics=self._trainable_statistics,
find_conv_max=self._find_conv_max) find_conv_max=self._find_conv_max,
)
return out return out
...@@ -715,6 +715,7 @@ def upsample( ...@@ -715,6 +715,7 @@ def upsample(
name=None, name=None,
): ):
""" """
This API resizes a batch of images. This API resizes a batch of images.
The input must be a 3-D Tensor of the shape (num_batches, channels, in_w) The input must be a 3-D Tensor of the shape (num_batches, channels, in_w)
...@@ -725,11 +726,12 @@ def upsample( ...@@ -725,11 +726,12 @@ def upsample(
and the resizing only applies on the three dimensions(depth, height and width). and the resizing only applies on the three dimensions(depth, height and width).
Supporting resample methods: Supporting resample methods:
'linear' : Linear interpolation - 'linear' : Linear interpolation
'bilinear' : Bilinear interpolation - 'bilinear' : Bilinear interpolation
'trilinear' : Trilinear interpolation - 'trilinear' : Trilinear interpolation
'nearest' : Nearest neighbor interpolation - 'nearest' : Nearest neighbor interpolation
'bicubic' : Bicubic interpolation - 'bicubic' : Bicubic interpolation
Linear interpolation is the method of using a line connecting two known quantities Linear interpolation is the method of using a line connecting two known quantities
to determine the value of an unknown quantity between the two known quantities. to determine the value of an unknown quantity between the two known quantities.
...@@ -762,77 +764,78 @@ def upsample( ...@@ -762,77 +764,78 @@ def upsample(
`paddle.nn.functional.adaptive_avg_pool2d` or `paddle.nn.functional.adaptive_avg_pool3d`. `paddle.nn.functional.adaptive_avg_pool2d` or `paddle.nn.functional.adaptive_avg_pool3d`.
Example: Example:
.. code-block:: text .. code-block:: text
For scale_factor: For scale_factor:
if align_corners = True && out_size > 1 : if align_corners = True && out_size > 1 :
scale_factor = (in_size-1.0)/(out_size-1.0) scale_factor = (in_size-1.0)/(out_size-1.0)
else:
scale_factor = float(in_size/out_size)
Linear interpolation:
if:
align_corners = False , align_mode = 0
input : (N,C,W_in)
output: (N,C,W_out) where:
W_out = (W_{in}+0.5) * scale_{factor} - 0.5
else:
input : (N,C,W_in)
output: (N,C,W_out) where:
W_out = W_{in} * scale_{factor}
Nearest neighbor interpolation:
if:
align_corners = False
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = floor (H_{in} * scale_{factor})
W_out = floor (W_{in} * scale_{factor})
else: else:
scale_factor = float(in_size/out_size) align_corners = True
Linear interpolation: input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = round(H_{in} * scale_{factor})
W_out = round(W_{in} * scale_{factor})
Bilinear interpolation:
if: if:
align_corners = False , align_mode = 0 align_corners = False , align_mode = 0
input : (N,C,W_in) input : (N,C,H_in,W_in)
output: (N,C,W_out) where: output: (N,C,H_out,W_out) where:
H_out = (H_{in}+0.5) * scale_{factor} - 0.5
W_out = (W_{in}+0.5) * scale_{factor} - 0.5 W_out = (W_{in}+0.5) * scale_{factor} - 0.5
else: else:
input : (N,C,W_in) input : (N,C,H_in,W_in)
output: (N,C,W_out) where: output: (N,C,H_out,W_out) where:
H_out = H_{in} * scale_{factor}
W_out = W_{in} * scale_{factor}
Bicubic interpolation:
if:
align_corners = False
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = (H_{in}+0.5) * scale_{factor} - 0.5
W_out = (W_{in}+0.5) * scale_{factor} - 0.5
else:
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = H_{in} * scale_{factor}
W_out = W_{in} * scale_{factor}
Trilinear interpolation:
if:
align_corners = False , align_mode = 0
input : (N,C,D_in,H_in,W_in)
output: (N,C,D_out,H_out,W_out) where:
D_out = (D_{in}+0.5) * scale_{factor} - 0.5
H_out = (H_{in}+0.5) * scale_{factor} - 0.5
W_out = (W_{in}+0.5) * scale_{factor} - 0.5
else:
input : (N,C,D_in,H_in,W_in)
output: (N,C,D_out,H_out,W_out) where:
D_out = D_{in} * scale_{factor}
H_out = H_{in} * scale_{factor}
W_out = W_{in} * scale_{factor} W_out = W_{in} * scale_{factor}
Nearest neighbor interpolation:
if:
align_corners = False
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = floor (H_{in} * scale_{factor})
W_out = floor (W_{in} * scale_{factor})
else:
align_corners = True
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = round(H_{in} * scale_{factor})
W_out = round(W_{in} * scale_{factor})
Bilinear interpolation:
if:
align_corners = False , align_mode = 0
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = (H_{in}+0.5) * scale_{factor} - 0.5
W_out = (W_{in}+0.5) * scale_{factor} - 0.5
else:
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = H_{in} * scale_{factor}
W_out = W_{in} * scale_{factor}
Bicubic interpolation:
if:
align_corners = False
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = (H_{in}+0.5) * scale_{factor} - 0.5
W_out = (W_{in}+0.5) * scale_{factor} - 0.5
else:
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = H_{in} * scale_{factor}
W_out = W_{in} * scale_{factor}
Trilinear interpolation:
if:
align_corners = False , align_mode = 0
input : (N,C,D_in,H_in,W_in)
output: (N,C,D_out,H_out,W_out) where:
D_out = (D_{in}+0.5) * scale_{factor} - 0.5
H_out = (H_{in}+0.5) * scale_{factor} - 0.5
W_out = (W_{in}+0.5) * scale_{factor} - 0.5
else:
input : (N,C,D_in,H_in,W_in)
output: (N,C,D_out,H_out,W_out) where:
D_out = D_{in} * scale_{factor}
H_out = H_{in} * scale_{factor}
W_out = W_{in} * scale_{factor}
https://en.wikipedia.org/wiki/Linear_interpolation.
For details of linear interpolation, please refer to Wikipedia: For details of linear interpolation, please refer to Wikipedia:
https://en.wikipedia.org/wiki/Linear_interpolation.
For details of nearest neighbor interpolation, please refer to Wikipedia: For details of nearest neighbor interpolation, please refer to Wikipedia:
https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation. https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation.
...@@ -876,23 +879,24 @@ def upsample( ...@@ -876,23 +879,24 @@ def upsample(
name(str, optional): The default value is None. name(str, optional): The default value is None.
Normally there is no need for user to set this property. Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name` For more information, please refer to :ref:`api_guide_Name`
Returns: Returns:
A 3-D Tensor of the shape (num_batches, channels, out_w) or (num_batches, out_w, channels), A 3-D Tensor of the shape (num_batches, channels, out_w) or (num_batches, out_w, channels),
A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels), A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels),
or 5-D Tensor of the shape (num_batches, channels, out_d, out_h, out_w) or (num_batches, out_d, out_h, out_w, channels). or 5-D Tensor of the shape (num_batches, channels, out_d, out_h, out_w) or (num_batches, out_d, out_h, out_w, channels).
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
import paddle.nn as nn import paddle.nn as nn
input_data = paddle.randn(shape=(2,3,6,10)).astype(paddle.float32) input_data = paddle.randn(shape=(2,3,6,10)).astype(paddle.float32)
upsample_out = paddle.nn.Upsample(size=[12,12]) upsample_out = paddle.nn.Upsample(size=[12,12])
output = upsample_out(x=input_data) output = upsample_out(x=input_data)
print(output.shape) print(output.shape)
# [2L, 3L, 12L, 12L] # [2L, 3L, 12L, 12L]
""" """
return interpolate( return interpolate(
......
...@@ -23,6 +23,7 @@ __all__ = [] ...@@ -23,6 +23,7 @@ __all__ = []
def pairwise_distance(x, y, p=2., epsilon=1e-6, keepdim=False, name=None): def pairwise_distance(x, y, p=2., epsilon=1e-6, keepdim=False, name=None):
r""" r"""
It computes the pairwise distance between two vectors. The It computes the pairwise distance between two vectors. The
distance is calculated by p-oreder norm: distance is calculated by p-oreder norm:
...@@ -48,10 +49,11 @@ def pairwise_distance(x, y, p=2., epsilon=1e-6, keepdim=False, name=None): ...@@ -48,10 +49,11 @@ def pairwise_distance(x, y, p=2., epsilon=1e-6, keepdim=False, name=None):
Returns: Returns:
Tensor, the dtype is same as input tensor. Tensor, the dtype is same as input tensor.
- If :attr:`keepdim` is True, the output shape is :math:`[N, 1]` or :math:`[1]`, - If :attr:`keepdim` is True, the output shape is :math:`[N, 1]` or :math:`[1]`,
depending on whether the input has data shaped as :math:`[N, D]`. depending on whether the input has data shaped as :math:`[N, D]`.
- If :attr:`keepdim` is False, the output shape is :math:`[N]` or :math:`[]`, - If :attr:`keepdim` is False, the output shape is :math:`[N]` or :math:`[]`,
depending on whether the input has data shaped as :math:`[N, D]`. depending on whether the input has data shaped as :math:`[N, D]`.
Examples: Examples:
.. code-block:: python .. code-block:: python
......
...@@ -1310,6 +1310,7 @@ def margin_ranking_loss( ...@@ -1310,6 +1310,7 @@ def margin_ranking_loss(
def l1_loss(input, label, reduction='mean', name=None): def l1_loss(input, label, reduction='mean', name=None):
r""" r"""
Computes the L1 Loss of Tensor ``input`` and ``label`` as follows. Computes the L1 Loss of Tensor ``input`` and ``label`` as follows.
If `reduction` set to ``'none'``, the loss is: If `reduction` set to ``'none'``, the loss is:
...@@ -1341,7 +1342,7 @@ def l1_loss(input, label, reduction='mean', name=None): ...@@ -1341,7 +1342,7 @@ def l1_loss(input, label, reduction='mean', name=None):
Returns: Returns:
Tensor, the L1 Loss of Tensor ``input`` and ``label``. Tensor, the L1 Loss of Tensor ``input`` and ``label``.
If `reduction` is ``'none'``, the shape of output loss is [N, *], the same as ``input`` . If `reduction` is ``'none'``, the shape of output loss is :math:`[N, *]`, the same as ``input`` .
If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1]. If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1].
Examples: Examples:
...@@ -1364,6 +1365,7 @@ def l1_loss(input, label, reduction='mean', name=None): ...@@ -1364,6 +1365,7 @@ def l1_loss(input, label, reduction='mean', name=None):
l1_loss = paddle.nn.functional.l1_loss(input, label, reduction='sum') l1_loss = paddle.nn.functional.l1_loss(input, label, reduction='sum')
print(l1_loss.numpy()) print(l1_loss.numpy())
# [1.4] # [1.4]
""" """
if reduction not in ['sum', 'mean', 'none']: if reduction not in ['sum', 'mean', 'none']:
raise ValueError( raise ValueError(
...@@ -2286,6 +2288,7 @@ def cross_entropy( ...@@ -2286,6 +2288,7 @@ def cross_entropy(
name=None, name=None,
): ):
r""" r"""
By default, this operator implements the cross entropy loss function with softmax. This function By default, this operator implements the cross entropy loss function with softmax. This function
combines the calculation of the softmax operation and the cross entropy loss function combines the calculation of the softmax operation and the cross entropy loss function
to provide a more numerically stable computing. to provide a more numerically stable computing.
...@@ -2399,21 +2402,13 @@ def cross_entropy( ...@@ -2399,21 +2402,13 @@ def cross_entropy(
Parameters: Parameters:
input (Tensor): the data type is float32, float64. Shape is :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes, ``k >= 1`` .
- **input** (Tensor)
Input tensor, the data type is float32, float64. Shape is
:math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes , ``k >= 1`` .
Note: Note:
1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the output of softmax operator, which will produce incorrect results.
1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the
output of softmax operator, which will produce incorrect results.
2. when use_softmax=False, it expects the output of softmax operator. 2. when use_softmax=False, it expects the output of softmax operator.
- **label** (Tensor) label (Tensor):
1. If soft_label=False, the shape is 1. If soft_label=False, the shape is
:math:`[N_1, N_2, ..., N_k]` or :math:`[N_1, N_2, ..., N_k, 1]`, k >= 1. :math:`[N_1, N_2, ..., N_k]` or :math:`[N_1, N_2, ..., N_k, 1]`, k >= 1.
the data type is int32, int64, float32, float64, where each value is [0, C-1]. the data type is int32, int64, float32, float64, where each value is [0, C-1].
...@@ -2421,48 +2416,27 @@ def cross_entropy( ...@@ -2421,48 +2416,27 @@ def cross_entropy(
2. If soft_label=True, the shape and data type should be same with ``input`` , 2. If soft_label=True, the shape and data type should be same with ``input`` ,
and the sum of the labels for each sample should be 1. and the sum of the labels for each sample should be 1.
- **weight** (Tensor, optional) weight (Tensor, optional): a manual rescaling weight given to each class.
a manual rescaling weight given to each class.
If given, has to be a Tensor of size C and the data type is float32, float64. If given, has to be a Tensor of size C and the data type is float32, float64.
Default is ``'None'`` . Default is ``'None'`` .
ignore_index (int64, optional): Specifies a target value that is ignored
- **ignore_index** (int64, optional)
Specifies a target value that is ignored
and does not contribute to the loss. A negative value means that no label and does not contribute to the loss. A negative value means that no label
value needs to be ignored. Only valid when soft_label = False. value needs to be ignored. Only valid when soft_label = False.
Default is ``-100`` . Default is ``-100`` .
reduction (str, optional): Indicate how to average the loss by batch_size,
- **reduction** (str, optional)
Indicate how to average the loss by batch_size,
the candidates are ``'none'`` | ``'mean'`` | ``'sum'``. the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned. If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned.
If :attr:`reduction` is ``'none'``, the unreduced loss is returned. If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
Default is ``'mean'``. Default is ``'mean'``.
soft_label (bool, optional): Indicate whether label is soft. Default is ``False``.
- **soft_label** (bool, optional) axis (int, optional): The index of dimension to perform softmax calculations.
Indicate whether label is soft.
Default is ``False``.
- **axis** (int, optional)
The index of dimension to perform softmax calculations.
It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the
number of dimensions of input :attr:`input`. number of dimensions of input :attr:`input`.
Default is ``-1`` . Default is ``-1`` .
use_softmax (bool, optional): Indicate whether compute softmax before cross_entropy.
- **use_softmax** (bool, optional)
Indicate whether compute softmax before cross_entropy.
Default is ``True``. Default is ``True``.
name (str, optional): The name of the operator. Default is ``None`` .
- **name** (str, optional)
The name of the operator. Default is ``None`` .
For more information, please refer to :ref:`api_guide_Name` . For more information, please refer to :ref:`api_guide_Name` .
Returns: Returns:
...@@ -2478,9 +2452,7 @@ def cross_entropy( ...@@ -2478,9 +2452,7 @@ def cross_entropy(
2. if soft_label = True, the dimension of return value is :math:`[N_1, N_2, ..., N_k, 1]` . 2. if soft_label = True, the dimension of return value is :math:`[N_1, N_2, ..., N_k, 1]` .
Examples: Examples:
.. code-block:: python .. code-block:: python
# hard labels # hard labels
...@@ -3834,6 +3806,7 @@ def triplet_margin_loss( ...@@ -3834,6 +3806,7 @@ def triplet_margin_loss(
def soft_margin_loss(input, label, reduction='mean', name=None): def soft_margin_loss(input, label, reduction='mean', name=None):
""" """
The API measures the soft margin loss between input predictions ``input`` The API measures the soft margin loss between input predictions ``input``
and target labels ``label`` . It can be described as: and target labels ``label`` . It can be described as:
...@@ -3842,9 +3815,9 @@ def soft_margin_loss(input, label, reduction='mean', name=None): ...@@ -3842,9 +3815,9 @@ def soft_margin_loss(input, label, reduction='mean', name=None):
Parameters: Parameters:
input (Tensor): The input predictions tensor with shape: [N, *], input (Tensor): The input predictions tensor with shape: ``[N, *]``,
N is batch_size, `*` means any number of additional dimensions. The ``input`` ranges from -inf to inf. N is batch_size, `*` means any number of additional dimensions. The ``input`` ranges from -inf to inf.
Available dtype is float32, float64. Available dtype is float32, float64.
label (Tensor): The target labels tensor with the same shape as label (Tensor): The target labels tensor with the same shape as
``input``. The target labels which values should be numbers -1 or 1. ``input``. The target labels which values should be numbers -1 or 1.
...@@ -3862,8 +3835,7 @@ def soft_margin_loss(input, label, reduction='mean', name=None): ...@@ -3862,8 +3835,7 @@ def soft_margin_loss(input, label, reduction='mean', name=None):
Returns: Returns:
Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is same as ``input`` , else the shape of output is [1].
same as ``input`` , else the shape of output is [1].
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -3889,6 +3861,7 @@ def soft_margin_loss(input, label, reduction='mean', name=None): ...@@ -3889,6 +3861,7 @@ def soft_margin_loss(input, label, reduction='mean', name=None):
# [0.84367639, 0.74795729, 0.44629076, 0.55123353, 0.77659678], # [0.84367639, 0.74795729, 0.44629076, 0.55123353, 0.77659678],
# [0.39465919, 0.76651484, 0.54485321, 0.76609844, 0.77166790], # [0.39465919, 0.76651484, 0.54485321, 0.76609844, 0.77166790],
# [0.51283568, 0.84757161, 0.78913331, 1.05268764, 0.45318675]]) # [0.51283568, 0.84757161, 0.78913331, 1.05268764, 0.45318675]])
""" """
if reduction not in ['sum', 'mean', 'none']: if reduction not in ['sum', 'mean', 'none']:
raise ValueError( raise ValueError(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment