Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Paddle
Commits
dbe08e9b
Commit
dbe08e9b
authored
Jun 12, 2023
by
yuguo960516yuguo
Browse files
2.4.2
parent
b5499578
Changes
302
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
5317 additions
and
3962 deletions
+5317
-3962
python/paddle/fluid/.gitignore
python/paddle/fluid/.gitignore
+2
-0
python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py
...slim/tests/test_post_training_quantization_mobilenetv1.py
+232
-155
python/paddle/fluid/contrib/sparsity/supported_layer_list.py
python/paddle/fluid/contrib/sparsity/supported_layer_list.py
+33
-22
python/paddle/fluid/contrib/sparsity/utils.py
python/paddle/fluid/contrib/sparsity/utils.py
+92
-61
python/paddle/fluid/core.py
python/paddle/fluid/core.py
+81
-43
python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py
...paddle/fluid/dygraph/dygraph_to_static/partial_program.py
+288
-161
python/paddle/fluid/dygraph/layers.py
python/paddle/fluid/dygraph/layers.py
+329
-176
python/paddle/fluid/framework.py
python/paddle/fluid/framework.py
+34
-11
python/paddle/fluid/layers/metric_op.py
python/paddle/fluid/layers/metric_op.py
+116
-90
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+3790
-3027
python/paddle/fluid/tests/.gitignore
python/paddle/fluid/tests/.gitignore
+4
-0
python/paddle/fluid/tests/book/.gitignore
python/paddle/fluid/tests/book/.gitignore
+1
-0
python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_c_setup.py
...le/fluid/tests/custom_kernel/custom_kernel_dot_c_setup.py
+16
-11
python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py
...ddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py
+21
-15
python/paddle/fluid/tests/custom_op/custom_relu_op.cu
python/paddle/fluid/tests/custom_op/custom_relu_op.cu
+1
-1
python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py
...paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py
+138
-92
python/paddle/fluid/tests/unittests/dygraph_to_static/test_backward_without_params.py
...ittests/dygraph_to_static/test_backward_without_params.py
+45
-0
python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py
...d/tests/unittests/dygraph_to_static/test_for_enumerate.py
+14
-38
python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_activation.py
...sts/unittests/ir/inference/test_trt_convert_activation.py
+36
-25
python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_anchor_generator.py
...ittests/ir/inference/test_trt_convert_anchor_generator.py
+44
-34
No files found.
python/paddle/fluid/.gitignore
0 → 100644
View file @
dbe08e9b
proto
core.so
python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py
View file @
dbe08e9b
...
...
@@ -77,13 +77,14 @@ def process_image(sample, mode, color_jitter, rotate):
return
img
,
sample
[
1
]
def
_reader_creator
(
file_list
,
def
_reader_creator
(
file_list
,
mode
,
shuffle
=
False
,
color_jitter
=
False
,
rotate
=
False
,
data_dir
=
DATA_DIR
):
data_dir
=
DATA_DIR
,
):
def
reader
():
with
open
(
file_list
)
as
flist
:
full_lines
=
[
line
.
strip
()
for
line
in
flist
]
...
...
@@ -98,10 +99,9 @@ def _reader_creator(file_list,
continue
yield
img_path
,
int
(
label
)
mapper
=
functools
.
partial
(
process_image
,
mode
=
mode
,
color_jitter
=
color_jitter
,
rotate
=
rotate
)
mapper
=
functools
.
partial
(
process_image
,
mode
=
mode
,
color_jitter
=
color_jitter
,
rotate
=
rotate
)
return
paddle
.
reader
.
xmap_readers
(
mapper
,
reader
,
THREAD
,
BUF_SIZE
)
...
...
@@ -112,11 +112,11 @@ def val(data_dir=DATA_DIR):
class
TestPostTrainingQuantization
(
unittest
.
TestCase
):
def
setUp
(
self
):
self
.
int8_download
=
'int8/download'
self
.
cache_folder
=
os
.
path
.
expanduser
(
'~/.cache/paddle/dataset/'
+
self
.
int8_download
)
self
.
cache_folder
=
os
.
path
.
expanduser
(
'~/.cache/paddle/dataset/'
+
self
.
int8_download
)
self
.
data_cache_folder
=
''
data_urls
=
[]
data_md5s
=
[]
...
...
@@ -129,31 +129,34 @@ class TestPostTrainingQuantization(unittest.TestCase):
'https://paddle-inference-dist.bj.bcebos.com/int8/ILSVRC2012_img_val.tar.gz.partab'
)
data_md5s
.
append
(
'1e9f15f64e015e58d6f9ec3210ed18b5'
)
self
.
data_cache_folder
=
self
.
download_data
(
data_urls
,
data_md5s
,
"full_data"
,
False
)
self
.
data_cache_folder
=
self
.
download_data
(
data_urls
,
data_md5s
,
"full_data"
,
False
)
else
:
data_urls
.
append
(
'http://paddle-inference-dist.bj.bcebos.com/int8/calibration_test_data.tar.gz'
)
data_md5s
.
append
(
'1b6c1c434172cca1bf9ba1e4d7a3157d'
)
self
.
data_cache_folder
=
self
.
download_data
(
data_urls
,
data_md5s
,
"small_data"
,
False
)
self
.
data_cache_folder
=
self
.
download_data
(
data_urls
,
data_md5s
,
"small_data"
,
False
)
# reader/decorator.py requires the relative path to the data folder
if
not
os
.
path
.
exists
(
"./data/ILSVRC2012"
):
cmd
=
'rm -rf {0} && ln -s {1} {0}'
.
format
(
"data"
,
self
.
data_cache_folder
)
cmd
=
'rm -rf {0} && ln -s {1} {0}'
.
format
(
"data"
,
self
.
data_cache_folder
)
os
.
system
(
cmd
)
self
.
batch_size
=
1
if
os
.
environ
.
get
(
'DATASET'
)
==
'full'
else
50
self
.
sample_iterations
=
50
if
os
.
environ
.
get
(
'DATASET'
)
==
'full'
else
2
self
.
infer_iterations
=
50000
if
os
.
environ
.
get
(
'DATASET'
)
==
'full'
else
2
self
.
infer_iterations
=
(
50000
if
os
.
environ
.
get
(
'DATASET'
)
==
'full'
else
2
)
self
.
root_path
=
tempfile
.
TemporaryDirectory
()
self
.
int8_model
=
os
.
path
.
join
(
self
.
root_path
.
name
,
"post_training_quantization"
)
self
.
int8_model
=
os
.
path
.
join
(
self
.
root_path
.
name
,
"post_training_quantization"
)
def
tearDown
(
self
):
self
.
root_path
.
cleanup
()
...
...
@@ -161,7 +164,8 @@ class TestPostTrainingQuantization(unittest.TestCase):
def
cache_unzipping
(
self
,
target_folder
,
zip_path
):
if
not
os
.
path
.
exists
(
target_folder
):
cmd
=
'mkdir {0} && tar xf {1} -C {0}'
.
format
(
target_folder
,
zip_path
)
target_folder
,
zip_path
)
os
.
system
(
cmd
)
def
download_data
(
self
,
data_urls
,
data_md5s
,
folder_name
,
is_model
=
True
):
...
...
@@ -173,13 +177,15 @@ class TestPostTrainingQuantization(unittest.TestCase):
download
(
data_urls
[
i
],
self
.
int8_download
,
data_md5s
[
i
])
file_names
.
append
(
data_urls
[
i
].
split
(
'/'
)[
-
1
])
zip_path
=
os
.
path
.
join
(
self
.
cache_folder
,
'full_imagenet_val.tar.gz'
)
zip_path
=
os
.
path
.
join
(
self
.
cache_folder
,
'full_imagenet_val.tar.gz'
)
if
not
os
.
path
.
exists
(
zip_path
):
cat_command
=
'cat'
for
file_name
in
file_names
:
cat_command
+=
' '
+
os
.
path
.
join
(
self
.
cache_folder
,
file_name
)
cat_command
+=
' '
+
os
.
path
.
join
(
self
.
cache_folder
,
file_name
)
cat_command
+=
' > '
+
zip_path
os
.
system
(
cat_command
)
...
...
@@ -199,8 +205,16 @@ class TestPostTrainingQuantization(unittest.TestCase):
image_shape
=
[
3
,
224
,
224
]
place
=
fluid
.
CPUPlace
()
exe
=
fluid
.
Executor
(
place
)
[
infer_program
,
feed_dict
,
fetch_targets
]
=
\
fluid
.
io
.
load_inference_model
(
model_path
,
exe
)
[
infer_program
,
feed_dict
,
fetch_targets
,
]
=
fluid
.
io
.
load_inference_model
(
model_path
,
exe
,
model_filename
=
"inference.pdmodel"
,
params_filename
=
"inference.pdiparams"
,
)
val_reader
=
paddle
.
batch
(
val
(),
batch_size
)
iterations
=
infer_iterations
...
...
@@ -208,23 +222,28 @@ class TestPostTrainingQuantization(unittest.TestCase):
cnt
=
0
periods
=
[]
for
batch_id
,
data
in
enumerate
(
val_reader
()):
image
=
np
.
array
([
x
[
0
].
reshape
(
image_shape
)
for
x
in
data
]).
astype
(
"float32"
)
image
=
np
.
array
([
x
[
0
].
reshape
(
image_shape
)
for
x
in
data
]).
astype
(
"float32"
)
label
=
np
.
array
([
x
[
1
]
for
x
in
data
]).
astype
(
"int64"
)
label
=
label
.
reshape
([
-
1
,
1
])
t1
=
time
.
time
()
_
,
acc1
,
_
=
exe
.
run
(
infer_program
,
feed
=
{
feed_dict
[
0
]:
image
,
feed_dict
[
1
]:
label
},
fetch_list
=
fetch_targets
)
pred
=
exe
.
run
(
infer_program
,
feed
=
{
feed_dict
[
0
]:
image
},
fetch_list
=
fetch_targets
,
)
t2
=
time
.
time
()
period
=
t2
-
t1
periods
.
append
(
period
)
test_info
.
append
(
np
.
mean
(
acc1
)
*
len
(
data
))
pred
=
np
.
array
(
pred
[
0
])
sort_array
=
pred
.
argsort
(
axis
=
1
)
top_1_pred
=
sort_array
[:,
-
1
:][:,
::
-
1
]
top_1
=
np
.
mean
(
label
==
top_1_pred
)
test_info
.
append
(
np
.
mean
(
top_1
)
*
len
(
data
))
cnt
+=
len
(
data
)
if
(
batch_id
+
1
)
%
100
==
0
:
...
...
@@ -238,7 +257,8 @@ class TestPostTrainingQuantization(unittest.TestCase):
acc1
=
np
.
sum
(
test_info
)
/
cnt
return
(
throughput
,
latency
,
acc1
)
def
generate_quantized_model
(
self
,
def
generate_quantized_model
(
self
,
model_path
,
quantizable_op_type
,
batch_size
,
...
...
@@ -248,12 +268,14 @@ class TestPostTrainingQuantization(unittest.TestCase):
is_use_cache_file
=
False
,
is_optimize_model
=
False
,
batch_nums
=
10
,
onnx_format
=
False
):
onnx_format
=
False
,
):
try
:
os
.
system
(
"mkdir "
+
self
.
int8_model
)
except
Exception
as
e
:
print
(
"Failed to create {} due to {}"
.
format
(
self
.
int8_model
,
str
(
e
)))
print
(
"Failed to create {} due to {}"
.
format
(
self
.
int8_model
,
str
(
e
))
)
sys
.
exit
(
-
1
)
place
=
fluid
.
CPUPlace
()
...
...
@@ -261,9 +283,12 @@ class TestPostTrainingQuantization(unittest.TestCase):
scope
=
fluid
.
global_scope
()
val_reader
=
val
()
ptq
=
PostTrainingQuantization
(
executor
=
exe
,
ptq
=
PostTrainingQuantization
(
executor
=
exe
,
sample_generator
=
val_reader
,
model_dir
=
model_path
,
model_filename
=
"inference.pdmodel"
,
params_filename
=
"inference.pdiparams"
,
batch_size
=
batch_size
,
batch_nums
=
batch_nums
,
algo
=
algo
,
...
...
@@ -272,11 +297,17 @@ class TestPostTrainingQuantization(unittest.TestCase):
is_full_quantize
=
is_full_quantize
,
optimize_model
=
is_optimize_model
,
onnx_format
=
onnx_format
,
is_use_cache_file
=
is_use_cache_file
)
is_use_cache_file
=
is_use_cache_file
,
)
ptq
.
quantize
()
ptq
.
save_quantized_model
(
self
.
int8_model
)
ptq
.
save_quantized_model
(
self
.
int8_model
,
model_filename
=
"inference.pdmodel"
,
params_filename
=
"inference.pdiparams"
,
)
def
run_test
(
self
,
def
run_test
(
self
,
model
,
algo
,
round_type
,
...
...
@@ -288,43 +319,62 @@ class TestPostTrainingQuantization(unittest.TestCase):
is_optimize_model
,
diff_threshold
,
onnx_format
=
False
,
batch_nums
=
10
):
batch_nums
=
10
,
):
infer_iterations
=
self
.
infer_iterations
batch_size
=
self
.
batch_size
sample_iterations
=
self
.
sample_iterations
model_cache_folder
=
self
.
download_data
(
data_urls
,
data_md5s
,
model
)
print
(
"Start FP32 inference for {0} on {1} images ..."
.
format
(
model
,
infer_iterations
*
batch_size
))
print
(
"Start FP32 inference for {0} on {1} images ..."
.
format
(
model
,
infer_iterations
*
batch_size
)
)
(
fp32_throughput
,
fp32_latency
,
fp32_acc1
)
=
self
.
run_program
(
os
.
path
.
join
(
model_cache_folder
,
"model"
),
batch_size
,
infer_iterations
)
print
(
"Start INT8 post training quantization for {0} on {1} images ..."
.
format
(
model
,
sample_iterations
*
batch_size
))
self
.
generate_quantized_model
(
os
.
path
.
join
(
model_cache_folder
,
"model"
),
quantizable_op_type
,
batch_size
,
sample_iterations
,
algo
,
round_type
,
is_full_quantize
,
is_use_cache_file
,
is_optimize_model
,
batch_nums
,
onnx_format
)
print
(
"Start INT8 inference for {0} on {1} images ..."
.
format
(
model
,
infer_iterations
*
batch_size
))
(
int8_throughput
,
int8_latency
,
int8_acc1
)
=
self
.
run_program
(
self
.
int8_model
,
batch_size
,
infer_iterations
)
os
.
path
.
join
(
model_cache_folder
,
"MobileNetV1_infer"
),
batch_size
,
infer_iterations
,
)
print
(
"Start INT8 post training quantization for {0} on {1} images ..."
.
format
(
model
,
batch_nums
*
batch_size
)
)
self
.
generate_quantized_model
(
os
.
path
.
join
(
model_cache_folder
,
"MobileNetV1_infer"
),
quantizable_op_type
,
batch_size
,
algo
,
round_type
,
is_full_quantize
,
is_use_cache_file
,
is_optimize_model
,
batch_nums
,
onnx_format
,
)
print
(
"Start INT8 inference for {0} on {1} images ..."
.
format
(
model
,
infer_iterations
*
batch_size
)
)
(
int8_throughput
,
int8_latency
,
int8_acc1
)
=
self
.
run_program
(
self
.
int8_model
,
batch_size
,
infer_iterations
)
print
(
"---Post training quantization of {} method---"
.
format
(
algo
))
print
(
"FP32 {0}: batch_size {1}, throughput {2} images/second, latency {3} second, accuracy {4}."
.
format
(
model
,
batch_size
,
fp32_throughput
,
fp32_latency
,
fp32_acc1
))
"FP32 {0}: batch_size {1}, throughput {2} images/second, latency {3} second, accuracy {4}."
.
format
(
model
,
batch_size
,
fp32_throughput
,
fp32_latency
,
fp32_acc1
)
)
print
(
"INT8 {0}: batch_size {1}, throughput {2} images/second, latency {3} second, accuracy {4}.
\n
"
.
format
(
model
,
batch_size
,
int8_throughput
,
int8_latency
,
int8_acc1
))
"INT8 {0}: batch_size {1}, throughput {2} images/second, latency {3} second, accuracy {4}.
\n
"
.
format
(
model
,
batch_size
,
int8_throughput
,
int8_latency
,
int8_acc1
)
)
sys
.
stdout
.
flush
()
delta_value
=
fp32_acc1
-
int8_acc1
...
...
@@ -332,15 +382,14 @@ class TestPostTrainingQuantization(unittest.TestCase):
class
TestPostTrainingKLForMobilenetv1
(
TestPostTrainingQuantization
):
def
test_post_training_kl_mobilenetv1
(
self
):
model
=
"MobileNet-V1"
algo
=
"KL"
round_type
=
"round"
data_urls
=
[
'http://paddle-i
nference-dist.bj.bcebos.com/int8/m
obile
n
et
v
1_in
t8_model
.tar
.gz
'
'http
s
://paddle-i
magenet-models-name.bj.bcebos.com/dygraph/inference/M
obile
N
et
V
1_in
fer
.tar'
]
data_md5s
=
[
'
13892b0716d26443a8cdea15b3c6438b
'
]
data_md5s
=
[
'
5ee2b1775b11dc233079236cdc216c2e
'
]
quantizable_op_type
=
[
"conv2d"
,
"depthwise_conv2d"
,
...
...
@@ -351,21 +400,30 @@ class TestPostTrainingKLForMobilenetv1(TestPostTrainingQuantization):
is_use_cache_file
=
False
is_optimize_model
=
True
diff_threshold
=
0.025
self
.
run_test
(
model
,
algo
,
round_type
,
data_urls
,
data_md5s
,
quantizable_op_type
,
is_full_quantize
,
is_use_cache_file
,
is_optimize_model
,
diff_threshold
)
batch_nums
=
3
self
.
run_test
(
model
,
algo
,
round_type
,
data_urls
,
data_md5s
,
quantizable_op_type
,
is_full_quantize
,
is_use_cache_file
,
is_optimize_model
,
diff_threshold
,
)
class
TestPostTrainingavgForMobilenetv1
(
TestPostTrainingQuantization
):
def
test_post_training_avg_mobilenetv1
(
self
):
model
=
"MobileNet-V1"
algo
=
"avg"
round_type
=
"round"
data_urls
=
[
'http://paddle-i
nference-dist.bj.bcebos.com/int8/m
obile
n
et
v
1_in
t8_model
.tar
.gz
'
'http
s
://paddle-i
magenet-models-name.bj.bcebos.com/dygraph/inference/M
obile
N
et
V
1_in
fer
.tar'
]
data_md5s
=
[
'
13892b0716d26443a8cdea15b3c6438b
'
]
data_md5s
=
[
'
5ee2b1775b11dc233079236cdc216c2e
'
]
quantizable_op_type
=
[
"conv2d"
,
"depthwise_conv2d"
,
...
...
@@ -375,21 +433,29 @@ class TestPostTrainingavgForMobilenetv1(TestPostTrainingQuantization):
is_use_cache_file
=
False
is_optimize_model
=
True
diff_threshold
=
0.025
self
.
run_test
(
model
,
algo
,
round_type
,
data_urls
,
data_md5s
,
quantizable_op_type
,
is_full_quantize
,
is_use_cache_file
,
is_optimize_model
,
diff_threshold
)
self
.
run_test
(
model
,
algo
,
round_type
,
data_urls
,
data_md5s
,
quantizable_op_type
,
is_full_quantize
,
is_use_cache_file
,
is_optimize_model
,
diff_threshold
,
)
class
TestPostTraininghistForMobilenetv1
(
TestPostTrainingQuantization
):
def
test_post_training_hist_mobilenetv1
(
self
):
model
=
"MobileNet-V1"
algo
=
"hist"
round_type
=
"round"
data_urls
=
[
'http://paddle-i
nference-dist.bj.bcebos.com/int8/m
obile
n
et
v
1_in
t8_model
.tar
.gz
'
'http
s
://paddle-i
magenet-models-name.bj.bcebos.com/dygraph/inference/M
obile
N
et
V
1_in
fer
.tar'
]
data_md5s
=
[
'
13892b0716d26443a8cdea15b3c6438b
'
]
data_md5s
=
[
'
5ee2b1775b11dc233079236cdc216c2e
'
]
quantizable_op_type
=
[
"conv2d"
,
"depthwise_conv2d"
,
...
...
@@ -400,7 +466,8 @@ class TestPostTraininghistForMobilenetv1(TestPostTrainingQuantization):
is_optimize_model
=
True
diff_threshold
=
0.03
batch_nums
=
3
self
.
run_test
(
model
,
self
.
run_test
(
model
,
algo
,
round_type
,
data_urls
,
...
...
@@ -410,19 +477,19 @@ class TestPostTraininghistForMobilenetv1(TestPostTrainingQuantization):
is_use_cache_file
,
is_optimize_model
,
diff_threshold
,
batch_nums
=
batch_nums
)
batch_nums
=
batch_nums
,
)
class
TestPostTrainingAbsMaxForMobilenetv1
(
TestPostTrainingQuantization
):
def
test_post_training_abs_max_mobilenetv1
(
self
):
model
=
"MobileNet-V1"
algo
=
"abs_max"
round_type
=
"round"
data_urls
=
[
'http://paddle-i
nference-dist.bj.bcebos.com/int8/m
obile
n
et
v
1_in
t8_model
.tar
.gz
'
'http
s
://paddle-i
magenet-models-name.bj.bcebos.com/dygraph/inference/M
obile
N
et
V
1_in
fer
.tar'
]
data_md5s
=
[
'
13892b0716d26443a8cdea15b3c6438b
'
]
data_md5s
=
[
'
5ee2b1775b11dc233079236cdc216c2e
'
]
quantizable_op_type
=
[
"conv2d"
,
"mul"
,
...
...
@@ -432,21 +499,29 @@ class TestPostTrainingAbsMaxForMobilenetv1(TestPostTrainingQuantization):
is_optimize_model
=
False
# The accuracy diff of post-training quantization (abs_max) maybe bigger
diff_threshold
=
0.05
self
.
run_test
(
model
,
algo
,
round_type
,
data_urls
,
data_md5s
,
quantizable_op_type
,
is_full_quantize
,
is_use_cache_file
,
is_optimize_model
,
diff_threshold
)
self
.
run_test
(
model
,
algo
,
round_type
,
data_urls
,
data_md5s
,
quantizable_op_type
,
is_full_quantize
,
is_use_cache_file
,
is_optimize_model
,
diff_threshold
,
)
class
TestPostTrainingAvgONNXFormatForMobilenetv1
(
TestPostTrainingQuantization
):
def
test_post_training_onnx_format_mobilenetv1
(
self
):
model
=
"MobileNet-V1"
algo
=
"emd"
round_type
=
"round"
data_urls
=
[
'http://paddle-i
nference-dist.bj.bcebos.com/int8/m
obile
n
et
v
1_in
t8_model
.tar
.gz
'
'http
s
://paddle-i
magenet-models-name.bj.bcebos.com/dygraph/inference/M
obile
N
et
V
1_in
fer
.tar'
]
data_md5s
=
[
'
13892b0716d26443a8cdea15b3c6438b
'
]
data_md5s
=
[
'
5ee2b1775b11dc233079236cdc216c2e
'
]
quantizable_op_type
=
[
"conv2d"
,
"depthwise_conv2d"
,
...
...
@@ -458,7 +533,8 @@ class TestPostTrainingAvgONNXFormatForMobilenetv1(TestPostTrainingQuantization):
onnx_format
=
True
diff_threshold
=
0.05
batch_nums
=
3
self
.
run_test
(
model
,
self
.
run_test
(
model
,
algo
,
round_type
,
data_urls
,
...
...
@@ -469,7 +545,8 @@ class TestPostTrainingAvgONNXFormatForMobilenetv1(TestPostTrainingQuantization):
is_optimize_model
,
diff_threshold
,
onnx_format
=
onnx_format
,
batch_nums
=
batch_nums
)
batch_nums
=
batch_nums
,
)
if
__name__
==
'__main__'
:
...
...
python/paddle/fluid/contrib/sparsity/supported_layer_list.py
View file @
dbe08e9b
...
...
@@ -23,9 +23,9 @@ from ...log_helper import get_logger
__all__
=
[
'add_supported_layer'
]
_logger
=
get_logger
(
__name__
,
logging
.
INFO
,
fmt
=
'%(asctime)s-%(levelname)s: %(message)s'
)
_logger
=
get_logger
(
__name__
,
logging
.
INFO
,
fmt
=
'%(asctime)s-%(levelname)s: %(message)s'
)
def
_default_pruning
(
weight_nparray
,
m
,
n
,
func_name
,
param_name
):
...
...
@@ -38,13 +38,17 @@ def _default_pruning(weight_nparray, m, n, func_name, param_name):
exlude_cond_shape4
=
len
(
shape
)
==
4
and
shape
[
1
]
<
m
if
exlude_cond_shape2
:
_logger
.
warning
(
'{} is not pruned because the first dimension of {} is smaller than {}'
.
format
(
param_name
,
shape
,
m
))
'{} is not pruned because the first dimension of {} is smaller than {}'
.
format
(
param_name
,
shape
,
m
)
)
return
weight_pruned_nparray
,
weight_sparse_mask
if
exlude_cond_shape4
:
_logger
.
warning
(
'{} is not pruned because the second dimension of {} is smaller than {}'
.
format
(
param_name
,
shape
,
m
))
'{} is not pruned because the second dimension of {} is smaller than {}'
.
format
(
param_name
,
shape
,
m
)
)
return
weight_pruned_nparray
,
weight_sparse_mask
checked_func_name
=
sparsity
.
CheckMethod
.
get_checking_method
(
func_name
)
...
...
@@ -60,13 +64,13 @@ def _default_pruning(weight_nparray, m, n, func_name, param_name):
# sparsity/utils is row-major pruning. That is the reason we have to transpose weight
# matrices beforce invoking create_mask. Then we transpose the result mask to make
# sure its shape to be the same as the input weight.
weight_sparse_mask
=
sparsity
.
create_mask
(
weight_nparray
.
T
,
func_name
=
func_name
,
n
=
n
,
m
=
m
).
T
weight_sparse_mask
=
sparsity
.
create_mask
(
weight_nparray
.
T
,
func_name
=
func_name
,
n
=
n
,
m
=
m
).
T
weight_pruned_nparray
=
np
.
multiply
(
weight_nparray
,
weight_sparse_mask
)
assert
sparsity
.
check_sparsity
(
weight_pruned_nparray
.
T
,
n
=
n
,
m
=
m
,
func_name
=
checked_func_name
),
\
'Pruning {} weight matrix failure!!!'
.
format
(
param_name
)
assert
sparsity
.
check_sparsity
(
weight_pruned_nparray
.
T
,
n
=
n
,
m
=
m
,
func_name
=
checked_func_name
),
'Pruning {} weight matrix failure!!!'
.
format
(
param_name
)
return
weight_pruned_nparray
,
weight_sparse_mask
...
...
@@ -78,6 +82,7 @@ supported_layers_and_prune_func_map = {}
def
add_supported_layer
(
layer
,
pruning_func
=
None
):
r
"""
Add supported layers and its corresponding pruning function.
Args:
...
...
@@ -87,19 +92,25 @@ def add_supported_layer(layer, pruning_func=None):
pruning_func (function, optional): a function type which receives five argument (weight_nparray,
m, n, func_name, param_name), weight_nparray is a nparray of weight, param_name is the name of weight,
m, n, and func_name, please see `prune_model` for details.
"""
name
=
None
if
isinstance
(
layer
,
str
):
name
=
layer
elif
isinstance
(
layer
,
paddle
.
fluid
.
dygraph
.
layers
.
Layer
):
name
=
paddle
.
fluid
.
dygraph
.
layers
.
_convert_camel_to_snake
(
type
(
layer
).
__name__
)
type
(
layer
).
__name__
)
elif
issubclass
(
layer
,
paddle
.
fluid
.
dygraph
.
layers
.
Layer
):
name
=
paddle
.
fluid
.
dygraph
.
layers
.
_convert_camel_to_snake
(
layer
.
__name__
)
layer
.
__name__
)
else
:
assert
"The type of layer should be string of Layer, but got {}!"
.
format
(
type
(
layer
))
assert
(
"The type of layer should be string of Layer, but got {}!"
.
format
(
type
(
layer
)
)
)
if
pruning_func
is
None
:
pruning_func
=
_default_pruning
_supported_layers_and_prune_func_map_lock
.
acquire
()
...
...
python/paddle/fluid/contrib/sparsity/utils.py
View file @
dbe08e9b
...
...
@@ -27,9 +27,16 @@ from itertools import permutations
import
threading
__all__
=
[
'calculate_density'
,
'check_mask_1d'
,
'get_mask_1d'
,
'check_mask_2d'
,
'get_mask_2d_greedy'
,
'get_mask_2d_best'
,
'create_mask'
,
'check_sparsity'
,
'MaskAlgo'
,
'CheckMethod'
'calculate_density'
,
'check_mask_1d'
,
'get_mask_1d'
,
'check_mask_2d'
,
'get_mask_2d_greedy'
,
'get_mask_2d_best'
,
'create_mask'
,
'check_sparsity'
,
'MaskAlgo'
,
'CheckMethod'
,
]
...
...
@@ -76,8 +83,9 @@ class CheckMethod(Enum):
CheckMethod.get_checking_method(MaskAlgo.MASK_2D_BEST)
# CheckMethod.CHECK_2D
"""
assert
isinstance
(
mask_algo
,
MaskAlgo
),
\
"mask_algo should be MaskAlgo type"
assert
isinstance
(
mask_algo
,
MaskAlgo
),
"mask_algo should be MaskAlgo type"
if
mask_algo
==
MaskAlgo
.
MASK_1D
:
return
CheckMethod
.
CHECK_1D
else
:
...
...
@@ -86,20 +94,25 @@ class CheckMethod(Enum):
def
calculate_density
(
x
):
r
"""
Return the density of the input tensor.
Args:
x (nparray): The input tensor.
Returns:
float: The density of :attr:`x`.
float, The density of :attr:`x`.
Examples:
.. code-block:: python
import paddle
import numpy as np
x = np.array([[0, 1, 3, 0],
[1, 1, 0, 1]])
paddle.incubate.asp.calculate_density(x) # 0.625
"""
x_flattened
=
x
.
flatten
()
return
float
(
np
.
nonzero
(
x_flattened
)[
0
].
size
)
/
x_flattened
.
size
...
...
@@ -126,7 +139,7 @@ def _reshape_1d(mat, m):
remainder
=
mat
.
shape
[
1
]
%
m
if
mat
.
shape
[
1
]
%
m
>
0
:
mat_padded
=
np
.
zeros
((
mat
.
shape
[
0
],
mat
.
shape
[
1
]
+
(
m
-
remainder
)))
mat_padded
[:,
:
mat
.
shape
[
1
]]
=
mat
mat_padded
[:,
:
mat
.
shape
[
1
]]
=
mat
shape
=
mat_padded
.
shape
return
mat_padded
.
reshape
(
-
1
,
m
),
shape
else
:
...
...
@@ -213,7 +226,7 @@ def get_mask_1d(mat, n, m):
min_order_indices
=
np
.
argsort
(
np
.
absolute
(
sub_mat
))
mask_flattern
[
i
,
min_order_indices
[:
n
].
tolist
()]
=
0
mask_flattern
=
mask_flattern
.
reshape
(
shape
)
mask
[:,
:]
=
mask_flattern
[:,
:
mat
.
shape
[
1
]]
mask
[:,
:]
=
mask_flattern
[:,
:
mat
.
shape
[
1
]]
return
mask
...
...
@@ -239,12 +252,12 @@ def _reshape_2d(mat, m):
remainder_0
=
mat
.
shape
[
0
]
%
m
remainder_1
=
mat
.
shape
[
1
]
%
m
new_shape
=
(
mat
.
shape
[
0
]
if
remainder_0
==
0
\
else
mat
.
shape
[
0
]
+
(
m
-
remainder_0
),
mat
.
shape
[
1
]
if
remainder_1
==
0
\
else
mat
.
shape
[
1
]
+
(
m
-
remainder_1
)
)
new_shape
=
(
mat
.
shape
[
0
]
if
remainder_0
==
0
else
mat
.
shape
[
0
]
+
(
m
-
remainder_0
),
mat
.
shape
[
1
]
if
remainder_1
==
0
else
mat
.
shape
[
1
]
+
(
m
-
remainder_1
),
)
mat_padded
=
np
.
zeros
(
new_shape
)
mat_padded
[:
mat
.
shape
[
0
],
:
mat
.
shape
[
1
]]
=
mat
mat_padded
[:
mat
.
shape
[
0
],
:
mat
.
shape
[
1
]]
=
mat
mat_flattern
=
np
.
empty
(
new_shape
).
reshape
(
-
1
,
m
*
m
)
curr_idx
=
0
...
...
@@ -252,9 +265,9 @@ def _reshape_2d(mat, m):
row_end
=
row_start
+
m
for
col_start
in
range
(
0
,
mat_padded
.
shape
[
1
],
m
):
col_end
=
col_start
+
m
sub_mat
=
np
.
squeeze
(
mat_padded
[
row_start
:
row_end
,
\
col_start
:
col_end
]
\
.
reshape
(
-
1
)
)
sub_mat
=
np
.
squeeze
(
mat_padded
[
row_start
:
row_end
,
col_start
:
col_end
]
.
reshape
(
-
1
)
)
mat_flattern
[
curr_idx
]
=
sub_mat
curr_idx
+=
1
return
mat_flattern
,
mat_padded
.
shape
...
...
@@ -304,8 +317,9 @@ def check_mask_2d(mat, n, m):
mat_padded
,
shape
=
_reshape_2d
(
mat
,
m
)
for
sub_mat
in
mat_padded
:
sub_mask
=
np
.
absolute
(
np
.
squeeze
(
sub_mat
.
reshape
(
m
,
m
)))
>
0
if
(
np
.
sum
(
np
.
sum
(
sub_mask
,
axis
=
1
)
>
(
m
-
n
))
!=
0
)
and
\
(
np
.
sum
(
np
.
sum
(
sub_mask
,
axis
=
0
)
>
(
m
-
n
))
!=
0
):
if
(
np
.
sum
(
np
.
sum
(
sub_mask
,
axis
=
1
)
>
(
m
-
n
))
!=
0
)
and
(
np
.
sum
(
np
.
sum
(
sub_mask
,
axis
=
0
)
>
(
m
-
n
))
!=
0
):
return
False
return
True
...
...
@@ -350,15 +364,17 @@ def get_mask_2d_greedy(mat, n, m):
sub_mask
=
np
.
squeeze
(
mask_padded
[
idx
])
min_order_1d_indices
=
np
.
argsort
(
sub_mat
)
min_order_2d_indices
=
[(
int
(
x
/
m
),
x
%
m
)
for
x
in
min_order_1d_indices
]
min_order_2d_indices
=
[
(
int
(
x
/
m
),
x
%
m
)
for
x
in
min_order_1d_indices
]
row_counter
=
collections
.
Counter
()
col_counter
=
collections
.
Counter
()
for
i
in
range
(
len
(
min_order_1d_indices
)
-
1
,
-
1
,
-
1
):
matrix_entry
=
min_order_2d_indices
[
i
]
if
(
row_counter
[
matrix_entry
[
0
]]
==
n
)
or
\
(
col_counter
[
matrix_entry
[
1
]]
==
n
):
if
(
row_counter
[
matrix_entry
[
0
]]
==
n
)
or
(
col_counter
[
matrix_entry
[
1
]]
==
n
):
continue
sub_mask
[
matrix_entry
[
0
],
matrix_entry
[
1
]]
=
1.0
...
...
@@ -373,7 +389,7 @@ def get_mask_2d_greedy(mat, n, m):
col_end
=
col_start
+
m
mask
[
row_start
:
row_end
,
col_start
:
col_end
]
=
mask_padded
[
curr_idx
]
curr_idx
+=
1
return
mask
[:
mat
.
shape
[
0
],
:
mat
.
shape
[
1
]]
return
mask
[:
mat
.
shape
[
0
],
:
mat
.
shape
[
1
]]
_valid_2d_patterns_lock
=
threading
.
Lock
()
...
...
@@ -406,8 +422,11 @@ def _compute_valid_2d_patterns(n, m):
patterns
=
patterns
+
patterns
patterns
=
np
.
asarray
(
list
(
set
(
permutations
(
patterns
,
m
))))
valid
=
((
patterns
.
sum
(
axis
=
1
)
<=
n
).
sum
(
axis
=
1
)
==
m
).
nonzero
()[
0
].
reshape
(
-
1
)
valid
=
(
((
patterns
.
sum
(
axis
=
1
)
<=
n
).
sum
(
axis
=
1
)
==
m
)
.
nonzero
()[
0
]
.
reshape
(
-
1
)
)
valid_patterns
=
np
.
empty
((
valid
.
shape
[
0
],
m
,
m
))
valid_patterns
[:]
=
patterns
[
valid
[:]]
...
...
@@ -454,9 +473,10 @@ def get_mask_2d_best(mat, n, m):
mat_flattern
,
shape
=
_reshape_2d
(
mat
,
m
)
mask_flattern
=
np
.
ones_like
(
mat_flattern
).
reshape
(
-
1
,
m
,
m
)
pmax
=
np
.
argmax
(
np
.
matmul
(
mat_flattern
,
patterns
.
reshape
(
patterns
.
shape
[
0
],
m
*
m
).
T
),
axis
=
1
)
pmax
=
np
.
argmax
(
np
.
matmul
(
mat_flattern
,
patterns
.
reshape
(
patterns
.
shape
[
0
],
m
*
m
).
T
),
axis
=
1
,
)
mask_flattern
[:]
=
patterns
[
pmax
[:]]
mask
=
np
.
empty
(
shape
)
...
...
@@ -468,7 +488,7 @@ def get_mask_2d_best(mat, n, m):
col_end
=
col_start
+
m
mask
[
row_start
:
row_end
,
col_start
:
col_end
]
=
mask_flattern
[
curr_idx
]
curr_idx
+=
1
return
mask
[:
mat
.
shape
[
0
],
:
mat
.
shape
[
1
]]
return
mask
[:
mat
.
shape
[
0
],
:
mat
.
shape
[
1
]]
def
create_mask
(
tensor
,
func_name
=
MaskAlgo
.
MASK_1D
,
n
=
2
,
m
=
4
):
...
...
@@ -508,9 +528,10 @@ def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4):
dtype
=
tensor
.
dtype
t
=
tensor
.
astype
(
float
)
assert
isinstance
(
func_name
,
MaskAlgo
),
\
"func_name argumet of create_mask is only accepted as type MaskAlgo. "
\
assert
isinstance
(
func_name
,
MaskAlgo
),
(
"func_name argumet of create_mask is only accepted as type MaskAlgo. "
"But got {}"
.
format
(
type
(
func_name
))
)
func
=
getattr
(
sys
.
modules
[
__name__
],
func_name
.
value
,
None
)
if
len
(
shape
)
==
1
:
t
=
t
.
reshape
(
1
,
shape
[
0
])
...
...
@@ -520,14 +541,20 @@ def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4):
t
=
t
.
reshape
(
shape
[
0
]
*
shape
[
1
],
shape
[
2
])
# 4d-tensor conv (h, w, in, out) -> (h*w*out, in) in GemmConvKernel Op
elif
len
(
shape
)
==
4
:
t
=
t
.
transpose
([
0
,
1
,
3
,
2
]).
reshape
(
shape
[
0
]
*
shape
[
1
]
*
shape
[
3
],
shape
[
2
])
t
=
t
.
transpose
([
0
,
1
,
3
,
2
]).
reshape
(
shape
[
0
]
*
shape
[
1
]
*
shape
[
3
],
shape
[
2
]
)
mask
=
func
(
t
,
n
=
n
,
m
=
m
)
return
mask
.
reshape
([
shape
[
0
],
shape
[
1
],
shape
[
3
],
shape
[
2
]]).
transpose
([
0
,
1
,
3
,
2
]).
astype
(
dtype
)
return
(
mask
.
reshape
([
shape
[
0
],
shape
[
1
],
shape
[
3
],
shape
[
2
]])
.
transpose
([
0
,
1
,
3
,
2
])
.
astype
(
dtype
)
)
else
:
raise
ValueError
(
"The dimension of input tensor is not supported in create_mask, "
\
"Only dimension < 4 is supported but got {}"
.
format
(
len
(
shape
)))
raise
ValueError
(
"The dimension of input tensor is not supported in create_mask, "
"Only dimension < 4 is supported but got {}"
.
format
(
len
(
shape
))
)
mask
=
func
(
t
,
n
=
n
,
m
=
m
)
return
mask
.
reshape
(
shape
).
astype
(
dtype
)
...
...
@@ -566,9 +593,10 @@ def check_sparsity(tensor, func_name=CheckMethod.CHECK_1D, n=2, m=4):
shape
=
tensor
.
shape
t
=
tensor
.
astype
(
float
)
assert
type
(
func_name
)
==
CheckMethod
,
\
"func_name argumet of check_sparsity is only accepted as type CheckMethod. "
\
assert
type
(
func_name
)
==
CheckMethod
,
(
"func_name argumet of check_sparsity is only accepted as type CheckMethod. "
"But got {}"
.
format
(
type
(
func_name
))
)
func
=
getattr
(
sys
.
modules
[
__name__
],
func_name
.
value
,
None
)
if
len
(
shape
)
==
1
:
t
=
t
.
reshape
(
1
,
shape
[
0
])
...
...
@@ -578,10 +606,13 @@ def check_sparsity(tensor, func_name=CheckMethod.CHECK_1D, n=2, m=4):
t
=
t
.
reshape
(
shape
[
0
]
*
shape
[
1
],
shape
[
2
])
# 4d-tensor conv (h, w, in, out) -> (h*w*out, in) in GemmConvKernel Op
elif
len
(
shape
)
==
4
:
t
=
t
.
transpose
([
0
,
1
,
3
,
2
]).
reshape
([
shape
[
0
]
*
shape
[
1
]
*
shape
[
3
],
shape
[
2
]])
t
=
t
.
transpose
([
0
,
1
,
3
,
2
]).
reshape
(
[
shape
[
0
]
*
shape
[
1
]
*
shape
[
3
],
shape
[
2
]]
)
else
:
raise
ValueError
(
"The dimension of input tensor is not supported in create_mask, "
\
"Only dimension < 4 is supported but got {}"
.
format
(
len
(
shape
)))
raise
ValueError
(
"The dimension of input tensor is not supported in create_mask, "
"Only dimension < 4 is supported but got {}"
.
format
(
len
(
shape
))
)
return
func
(
t
,
n
=
n
,
m
=
m
)
python/paddle/fluid/core.py
View file @
dbe08e9b
...
...
@@ -35,9 +35,9 @@ try:
if
os
.
name
==
'nt'
:
third_lib_path
=
current_path
+
os
.
sep
+
'..'
+
os
.
sep
+
'libs'
# Will load shared library from 'path' on windows
os
.
environ
[
'path'
]
=
current_path
+
';'
+
third_lib_path
+
';'
+
os
.
environ
[
'path'
]
os
.
environ
[
'path'
]
=
(
current_path
+
';'
+
third_lib_path
+
';'
+
os
.
environ
[
'path'
]
)
sys
.
path
.
insert
(
0
,
third_lib_path
)
# Note: from python3.8, PATH will not take effect
# https://github.com/python/cpython/pull/12302
...
...
@@ -47,20 +47,24 @@ try:
except
ImportError
as
e
:
from
..
import
compat
as
cpt
if
os
.
name
==
'nt'
:
executable_path
=
os
.
path
.
abspath
(
os
.
path
.
dirname
(
sys
.
executable
))
raise
ImportError
(
"""NOTE: You may need to run
\"
set PATH=%s;%%PATH%%
\"
if you encounters
\"
DLL load failed
\"
errors. If you have python
installed in other directory, replace
\"
%s
\"
with your own
directory. The original error is:
\n
%s"""
%
(
executable_path
,
executable_path
,
cpt
.
get_exception_message
(
e
)))
directory. The original error is:
\n
%s"""
%
(
executable_path
,
executable_path
,
cpt
.
get_exception_message
(
e
))
)
else
:
raise
ImportError
(
"""NOTE: You may need to run
\"
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
\"
if you encounters
\"
libmkldnn.so not found
\"
errors. If you have python
installed in other directory, replace
\"
/usr/local/lib
\"
with your own
directory. The original error is:
\n
"""
+
cpt
.
get_exception_message
(
e
))
directory. The original error is:
\n
"""
+
cpt
.
get_exception_message
(
e
)
)
except
Exception
as
e
:
raise
e
...
...
@@ -70,36 +74,45 @@ def avx_supported():
Whether current system(Linux, MacOS, Windows) is supported with AVX.
"""
from
..
import
compat
as
cpt
sysstr
=
platform
.
system
().
lower
()
has_avx
=
False
if
sysstr
==
'linux'
:
try
:
has_avx
=
os
.
popen
(
'cat /proc/cpuinfo | grep -i avx'
).
read
()
!=
''
pipe
=
os
.
popen
(
'cat /proc/cpuinfo | grep -i avx'
)
has_avx
=
pipe
.
read
()
!=
''
pipe
.
close
()
except
Exception
as
e
:
sys
.
stderr
.
write
(
'Can not get the AVX flag from /proc/cpuinfo.
\n
'
'The original error is: %s
\n
'
%
cpt
.
get_exception_message
(
e
))
sys
.
stderr
.
write
(
'Can not get the AVX flag from /proc/cpuinfo.
\n
'
'The original error is: %s
\n
'
%
cpt
.
get_exception_message
(
e
)
)
return
has_avx
elif
sysstr
==
'darwin'
:
try
:
has_avx
=
os
.
popen
(
'sysctl machdep.cpu.features | grep -i avx'
).
read
()
!=
''
pipe
=
os
.
popen
(
'sysctl machdep.cpu.features | grep -i avx'
)
has_avx
=
pipe
.
read
()
!=
''
pipe
.
close
()
except
Exception
as
e
:
sys
.
stderr
.
write
(
'Can not get the AVX flag from machdep.cpu.features.
\n
'
'The original error is: %s
\n
'
%
cpt
.
get_exception_message
(
e
))
'The original error is: %s
\n
'
%
cpt
.
get_exception_message
(
e
)
)
if
not
has_avx
:
import
subprocess
pipe
=
subprocess
.
Popen
(
'sysctl machdep.cpu.leaf7_features | grep -i avx'
,
shell
=
True
,
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
)
stderr
=
subprocess
.
PIPE
,
)
_
=
pipe
.
communicate
()
has_avx
=
True
if
pipe
.
returncode
==
0
else
False
return
has_avx
elif
sysstr
==
'windows'
:
import
ctypes
ONE_PAGE
=
ctypes
.
c_size_t
(
0x1000
)
def
asm_func
(
code_str
,
restype
=
ctypes
.
c_uint32
,
argtypes
=
()):
...
...
@@ -109,24 +122,31 @@ def avx_supported():
pfnVirtualAlloc
.
restype
=
ctypes
.
c_void_p
MEM_COMMIT
=
ctypes
.
c_ulong
(
0x1000
)
PAGE_READWRITE
=
ctypes
.
c_ulong
(
0x4
)
address
=
pfnVirtualAlloc
(
None
,
ONE_PAGE
,
MEM_COMMIT
,
PAGE_READWRITE
)
address
=
pfnVirtualAlloc
(
None
,
ONE_PAGE
,
MEM_COMMIT
,
PAGE_READWRITE
)
if
not
address
:
raise
Exception
(
"Failed to VirtualAlloc"
)
# Copy the code into the memory segment
memmove
=
ctypes
.
CFUNCTYPE
(
ctypes
.
c_void_p
,
ctypes
.
c_void_p
,
memmove
=
ctypes
.
CFUNCTYPE
(
ctypes
.
c_void_p
,
ctypes
.
c_void_p
,
ctypes
.
c_void_p
,
ctypes
.
c_size_t
)(
ctypes
.
_memmove_addr
)
ctypes
.
c_size_t
,
)(
ctypes
.
_memmove_addr
)
if
memmove
(
address
,
code_str
,
len
(
code_str
))
<
0
:
raise
Exception
(
"Failed to memmove"
)
# Enable execute permissions
PAGE_EXECUTE
=
ctypes
.
c_ulong
(
0x10
)
pfnVirtualProtect
=
ctypes
.
windll
.
kernel32
.
VirtualProtect
res
=
pfnVirtualProtect
(
ctypes
.
c_void_p
(
address
),
ONE_PAGE
,
PAGE_EXECUTE
,
ctypes
.
byref
(
ctypes
.
c_ulong
(
0
)))
res
=
pfnVirtualProtect
(
ctypes
.
c_void_p
(
address
),
ONE_PAGE
,
PAGE_EXECUTE
,
ctypes
.
byref
(
ctypes
.
c_ulong
(
0
)),
)
if
not
res
:
raise
Exception
(
"Failed VirtualProtect"
)
...
...
@@ -135,7 +155,8 @@ def avx_supported():
pfnGetCurrentProcess
.
restype
=
ctypes
.
c_void_p
prochandle
=
ctypes
.
c_void_p
(
pfnGetCurrentProcess
())
res
=
ctypes
.
windll
.
kernel32
.
FlushInstructionCache
(
prochandle
,
ctypes
.
c_void_p
(
address
),
ONE_PAGE
)
prochandle
,
ctypes
.
c_void_p
(
address
),
ONE_PAGE
)
if
not
res
:
raise
Exception
(
"Failed FlushInstructionCache"
)
...
...
@@ -153,12 +174,14 @@ def avx_supported():
# Convert the code_str into a function that returns uint
func
,
address
=
asm_func
(
code_str
)
retval
=
func
()
ctypes
.
windll
.
kernel32
.
VirtualFree
(
ctypes
.
c_void_p
(
address
),
ctypes
.
c_size_t
(
0
),
ONE_PAGE
)
ctypes
.
windll
.
kernel32
.
VirtualFree
(
ctypes
.
c_void_p
(
address
),
ctypes
.
c_size_t
(
0
),
ONE_PAGE
)
except
Exception
as
e
:
sys
.
stderr
.
write
(
'Failed getting the AVX flag on Windows.
\n
'
'The original error is: %s
\n
'
%
cpt
.
get_exception_message
(
e
))
sys
.
stderr
.
write
(
'Failed getting the AVX flag on Windows.
\n
'
'The original error is: %s
\n
'
%
cpt
.
get_exception_message
(
e
)
)
return
(
retval
&
(
1
<<
avx_bit
))
>
0
else
:
sys
.
stderr
.
write
(
'Do not get AVX flag on %s
\n
'
%
sysstr
)
...
...
@@ -167,10 +190,10 @@ def avx_supported():
def
run_shell_command
(
cmd
):
import
subprocess
out
,
err
=
subprocess
.
Popen
(
cmd
,
stdout
=
subprocess
.
P
IPE
,
stderr
=
subprocess
.
PIPE
,
shell
=
True
).
communicate
()
out
,
err
=
subprocess
.
P
open
(
cmd
,
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
,
shell
=
True
).
communicate
()
if
err
:
return
None
else
:
...
...
@@ -179,8 +202,9 @@ def run_shell_command(cmd):
def
get_dso_path
(
core_so
,
dso_name
):
if
core_so
and
dso_name
:
return
run_shell_command
(
"ldd %s|grep %s|awk '{print $3}'"
%
(
core_so
,
dso_name
))
return
run_shell_command
(
"ldd %s|grep %s|awk '{print $3}'"
%
(
core_so
,
dso_name
)
)
else
:
return
None
...
...
@@ -189,6 +213,7 @@ def load_dso(dso_absolute_path):
if
dso_absolute_path
:
try
:
from
ctypes
import
cdll
cdll
.
LoadLibrary
(
dso_absolute_path
)
except
:
warnings
.
warn
(
"Load {} failed"
.
format
(
dso_absolute_path
))
...
...
@@ -247,12 +272,14 @@ if platform.system().lower() == 'linux':
try
:
from
.
import
libpaddle
if
avx_supported
()
and
not
libpaddle
.
is_compiled_with_avx
():
sys
.
stderr
.
write
(
"Hint: Your machine support AVX, but the installed paddlepaddle doesn't have avx core. "
"Hence, no-avx core with worse preformance will be imported.
\n
If you like, you could "
"reinstall paddlepaddle by 'python -m pip install --force-reinstall paddlepaddle-gpu[==version]' "
"to get better performance.
\n
"
)
"to get better performance.
\n
"
)
# assign tensor alias
libpaddle
.
LoDTensor
=
libpaddle
.
Tensor
...
...
@@ -283,6 +310,7 @@ try:
from
.libpaddle
import
_Profiler
,
_ProfilerResult
,
_RecordEvent
from
.libpaddle
import
_set_current_stream
from
.libpaddle
import
_get_phi_kernel_name
if
sys
.
platform
!=
'win32'
:
from
.libpaddle
import
_set_process_pids
from
.libpaddle
import
_erase_process_pids
...
...
@@ -295,12 +323,18 @@ try:
except
Exception
as
e
:
if
has_paddle_dy_lib
:
sys
.
stderr
.
write
(
'Error: Can not import paddle core while this file exists: '
+
current_path
+
os
.
sep
+
'libpaddle.'
+
dy_lib_suffix
+
'
\n
'
)
'Error: Can not import paddle core while this file exists: '
+
current_path
+
os
.
sep
+
'libpaddle.'
+
dy_lib_suffix
+
'
\n
'
)
if
not
avx_supported
()
and
libpaddle
.
is_compiled_with_avx
():
sys
.
stderr
.
write
(
"Error: Your machine doesn't support AVX, but the installed PaddlePaddle is avx core, "
"you should reinstall paddlepaddle with no-avx core.
\n
"
)
"you should reinstall paddlepaddle with no-avx core.
\n
"
)
raise
e
...
...
@@ -317,22 +351,26 @@ def set_paddle_custom_device_lib_path(lib_path):
# set paddle lib path
def
set_paddle_lib_path
():
site_dirs
=
site
.
getsitepackages
()
if
hasattr
(
site
,
'getsitepackages'
)
else
[
x
for
x
in
sys
.
path
if
'site-packages'
in
x
]
site_dirs
=
(
site
.
getsitepackages
()
if
hasattr
(
site
,
'getsitepackages'
)
else
[
x
for
x
in
sys
.
path
if
'site-packages'
in
x
]
)
for
site_dir
in
site_dirs
:
lib_dir
=
os
.
path
.
sep
.
join
([
site_dir
,
'paddle'
,
'libs'
])
if
os
.
path
.
exists
(
lib_dir
):
_set_paddle_lib_path
(
lib_dir
)
set_paddle_custom_device_lib_path
(
os
.
path
.
sep
.
join
([
lib_dir
,
'..'
,
'..'
,
'paddle-plugins'
]))
os
.
path
.
sep
.
join
([
lib_dir
,
'..'
,
'..'
,
'paddle-plugins'
])
)
return
if
hasattr
(
site
,
'USER_SITE'
):
lib_dir
=
os
.
path
.
sep
.
join
([
site
.
USER_SITE
,
'paddle'
,
'libs'
])
if
os
.
path
.
exists
(
lib_dir
):
_set_paddle_lib_path
(
lib_dir
)
set_paddle_custom_device_lib_path
(
os
.
path
.
sep
.
join
([
lib_dir
,
'..'
,
'..'
,
'paddle-plugins'
]))
os
.
path
.
sep
.
join
([
lib_dir
,
'..'
,
'..'
,
'paddle-plugins'
])
)
set_paddle_lib_path
()
python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py
View file @
dbe08e9b
...
...
@@ -18,19 +18,32 @@ import six
import
paddle
from
paddle.fluid
import
framework
,
backward
,
core
,
program_guard
from
paddle.fluid.executor
import
_is_enable_standalone_executor
,
_is_dy2st_enable_standalone_executor
from
paddle.fluid.executor
import
(
_is_enable_standalone_executor
,
_is_dy2st_enable_standalone_executor
,
)
from
paddle.fluid.dygraph
import
layers
from
paddle.fluid.dygraph.base
import
switch_to_static_graph
from
paddle.fluid.dygraph.dygraph_to_static
import
logging_utils
from
paddle.fluid.dygraph.dygraph_to_static.return_transformer
import
RETURN_NO_VALUE_MAGIC_NUM
from
paddle.fluid.dygraph.dygraph_to_static.return_transformer
import
(
RETURN_NO_VALUE_MAGIC_NUM
,
)
from
paddle.fluid.layers.utils
import
flatten
from
paddle.fluid.layers.utils
import
pack_sequence_as
from
paddle.fluid.layers.utils
import
_hash_with_id
from
paddle.fluid.compiler
import
BuildStrategy
from
paddle.fluid.framework
import
_apply_pass
from
paddle.fluid.contrib.mixed_precision.decorator
import
AutoMixedPrecisionLists
from
paddle.fluid.contrib.mixed_precision.fp16_utils
import
rewrite_program
,
cast_model_to_fp16
from
paddle.fluid.dygraph.amp.auto_cast
import
_in_amp_guard
,
_in_pure_fp16_guard
from
paddle.fluid.contrib.mixed_precision.decorator
import
(
AutoMixedPrecisionLists
,
)
from
paddle.fluid.contrib.mixed_precision.fp16_utils
import
(
rewrite_program
,
cast_model_to_fp16
,
)
from
paddle.fluid.dygraph.amp.auto_cast
import
(
_in_amp_guard
,
_in_pure_fp16_guard
,
)
import
paddle.compat
as
cpt
from
paddle
import
_C_ops
,
_legacy_C_ops
...
...
@@ -64,7 +77,8 @@ class NestSequence(object):
var_ids
=
[]
for
idx
,
var
in
enumerate
(
self
.
__input_list
):
if
isinstance
(
var
,
(
framework
.
Variable
,
core
.
VarBase
,
core
.
eager
.
Tensor
)):
var
,
(
framework
.
Variable
,
core
.
VarBase
,
core
.
eager
.
Tensor
)
):
var_ids
.
append
(
idx
)
return
var_ids
...
...
@@ -77,15 +91,17 @@ class NestSequence(object):
warning_types
=
set
()
for
var
in
self
.
__input_list
:
if
not
isinstance
(
var
,
(
framework
.
Variable
,
core
.
VarBase
,
core
.
eager
.
Tensor
)
):
var
,
(
framework
.
Variable
,
core
.
VarBase
,
core
.
eager
.
Tensor
)
):
warning_types
.
add
(
type
(
var
))
if
warning_types
:
logging_utils
.
warn
(
"Output of traced function contains non-tensor type values: {}. "
"Currently, We don't support to update them while training and will return "
"what we first saw. Please try to return them as tensor."
.
format
(
list
(
warning_types
)))
"what we first saw. Please try to return them as tensor."
.
format
(
list
(
warning_types
)
)
)
@
property
def
var_ids
(
self
):
...
...
@@ -139,12 +155,9 @@ class PartialProgramLayer:
Layer: A Layer object that run all ops internally in static mode.
"""
def
__init__
(
self
,
main_program
,
inputs
,
outputs
,
parameters
=
None
,
**
kwargs
):
def
__init__
(
self
,
main_program
,
inputs
,
outputs
,
parameters
=
None
,
**
kwargs
):
super
(
PartialProgramLayer
,
self
).
__init__
()
self
.
_inputs
=
NestSequence
(
inputs
)
self
.
_outputs
=
NestSequence
(
outputs
,
need_check
=
True
)
...
...
@@ -167,7 +180,8 @@ class PartialProgramLayer:
# For AMP training
self
.
_amp_list
=
AutoMixedPrecisionLists
(
custom_white_list
=
custom_white_list
,
custom_black_list
=
custom_black_list
)
custom_black_list
=
custom_black_list
,
)
# program_id -> list(scope)
self
.
_scope_cache
=
{}
...
...
@@ -188,10 +202,6 @@ class PartialProgramLayer:
else
:
return
core
.
Scope
()
@
LazyInitialized
def
__fake_vars
(
self
):
return
_create_fake_var
()
@
LazyInitialized
def
_double_grads
(
self
):
return
self
.
_get_double_grads
(
self
.
_origin_main_program
)
...
...
@@ -203,7 +213,8 @@ class PartialProgramLayer:
return
self
.
_origin_main_program
.
clone
(
for_test
=
is_infer_mode
)
else
:
train_program
=
self
.
_append_backward_desc
(
self
.
_origin_main_program
)
self
.
_origin_main_program
)
# Note: Only set grad type once after initializing train program. So we put it here.
self
.
_set_grad_type
(
self
.
_params
,
train_program
)
return
train_program
...
...
@@ -223,16 +234,18 @@ class PartialProgramLayer:
@
switch_to_static_graph
def
_create_pure_fp16_program
(
self
,
is_infer_mode
=
False
):
pure_fp16_program
=
self
.
_origin_main_program
.
clone
(
for_test
=
is_infer_mode
)
for_test
=
is_infer_mode
)
with
program_guard
(
pure_fp16_program
):
cast_model_to_fp16
(
pure_fp16_program
,
self
.
_amp_list
,
use_fp16_guard
=
False
)
cast_model_to_fp16
(
pure_fp16_program
,
self
.
_amp_list
,
use_fp16_guard
=
False
)
if
is_infer_mode
:
return
pure_fp16_program
else
:
train_pure_fp16_program
=
self
.
_append_backward_desc
(
pure_fp16_program
)
pure_fp16_program
)
self
.
_set_grad_type
(
self
.
_params
,
train_pure_fp16_program
)
return
train_pure_fp16_program
...
...
@@ -240,23 +253,27 @@ class PartialProgramLayer:
def
_create_forward_backward_train_program
(
self
):
whole_program
=
self
.
_create_program
()
forward_end_op_index
=
self
.
_infer_program
.
desc
.
block
(
0
).
op_size
()
return
self
.
_get_forward_backward_program_form
(
whole_program
,
forward_end_op_index
)
return
self
.
_get_forward_backward_program_form
(
whole_program
,
forward_end_op_index
)
@
switch_to_static_graph
def
_create_forward_backward_train_amp_program
(
self
):
whole_program
=
self
.
_create_amp_program
()
forward_end_op_index
=
self
.
_infer_amp_program
.
desc
.
block
(
0
).
op_size
()
return
self
.
_get_forward_backward_program_form
(
whole_program
,
forward_end_op_index
)
return
self
.
_get_forward_backward_program_form
(
whole_program
,
forward_end_op_index
)
@
switch_to_static_graph
def
_create_forward_backward_train_pure_fp16_program
(
self
):
whole_program
=
self
.
_create_pure_fp16_program
()
forward_end_op_index
=
self
.
_infer_pure_fp16_program
.
desc
.
block
(
0
).
op_size
()
return
self
.
_get_forward_backward_program_form
(
whole_program
,
forward_end_op_index
)
0
).
op_size
()
return
self
.
_get_forward_backward_program_form
(
whole_program
,
forward_end_op_index
)
@
LazyInitialized
def
_train_program
(
self
):
...
...
@@ -352,8 +369,9 @@ class PartialProgramLayer:
@
LazyInitialized
def
_train_program_id
(
self
):
program_id
=
_hash_with_id
(
self
.
_train_program
,
self
)
core
.
_set_cached_executor_build_strategy
(
program_id
,
self
.
_build_strategy
)
core
.
_set_cached_executor_build_strategy
(
program_id
,
self
.
_build_strategy
)
return
program_id
@
LazyInitialized
...
...
@@ -363,8 +381,9 @@ class PartialProgramLayer:
@
LazyInitialized
def
_train_amp_program_id
(
self
):
program_id
=
_hash_with_id
(
self
.
_train_amp_program
,
self
)
core
.
_set_cached_executor_build_strategy
(
program_id
,
self
.
_build_strategy
)
core
.
_set_cached_executor_build_strategy
(
program_id
,
self
.
_build_strategy
)
return
program_id
@
LazyInitialized
...
...
@@ -374,8 +393,9 @@ class PartialProgramLayer:
@
LazyInitialized
def
_train_pure_fp16_program_id
(
self
):
program_id
=
_hash_with_id
(
self
.
_train_pure_fp16_program
,
self
)
core
.
_set_cached_executor_build_strategy
(
program_id
,
self
.
_build_strategy
)
core
.
_set_cached_executor_build_strategy
(
program_id
,
self
.
_build_strategy
)
return
program_id
@
LazyInitialized
...
...
@@ -411,8 +431,9 @@ class PartialProgramLayer:
return
main_program
def
prepare_gradient_aggregation
(
self
,
start_idx
,
main_program
,
target_program
):
def
prepare_gradient_aggregation
(
self
,
start_idx
,
main_program
,
target_program
):
"""
Why we need add gradient aggregation operation ?
In some cases, if non leaf nodes are used as output, gradient overwriting will occur, such as
...
...
@@ -431,7 +452,7 @@ class PartialProgramLayer:
"""
if
not
isinstance
(
var
,
framework
.
Variable
)
or
var
.
type
not
in
[
core
.
VarDesc
.
VarType
.
LOD_TENSOR
,
core
.
VarDesc
.
VarType
.
SELECTED_ROWS
core
.
VarDesc
.
VarType
.
SELECTED_ROWS
,
]:
return
False
if
var
.
dtype
not
in
[
paddle
.
float32
,
paddle
.
float64
]:
...
...
@@ -448,20 +469,28 @@ class PartialProgramLayer:
new_grad_name
=
var
.
name
+
suffix
+
"@GRAD"
finded_ops
=
list
(
filter
(
lambda
x
:
x
[
0
]
>=
start_idx
and
any
([
lambda
x
:
x
[
0
]
>=
start_idx
and
any
(
[
out_arg
==
var_grad_name
for
out_arg
in
x
[
1
].
output_arg_names
]),
enumerate
(
target_program
.
block
(
0
).
ops
)))
]
),
enumerate
(
target_program
.
block
(
0
).
ops
),
)
)
# len(finded_ops) may equals zero when stop_gradient works.
# len(finded_ops) may > 1, because we may have fill_constant op.
if
len
(
finded_ops
)
==
0
:
return
None
# step1: create a new var named var.name@GRAD
target_program
.
block
(
0
).
create_var
(
name
=
new_grad_name
,
target_program
.
block
(
0
).
create_var
(
name
=
new_grad_name
,
type
=
var
.
type
,
dtype
=
var
.
dtype
,
shape
=
var
.
shape
)
shape
=
var
.
shape
,
)
# step2: rename the var.name@GRAD to var.name@GRAD@dy2static
for
idx
,
op
in
finded_ops
:
op
.
_rename_input
(
var_grad_name
,
new_grad_name
)
...
...
@@ -472,11 +501,13 @@ class PartialProgramLayer:
finded_ops
[
-
1
][
0
]
+
1
,
type
=
'sum'
,
inputs
=
{
'X'
:
[
var_grad_name
,
new_grad_name
]},
outputs
=
{
"Out"
:
var_grad_name
})
outputs
=
{
"Out"
:
var_grad_name
},
)
return
None
to_processed_vars
=
list
(
filter
(
_need_aggregation
,
self
.
_outputs
.
tolist
()))
filter
(
_need_aggregation
,
self
.
_outputs
.
tolist
())
)
for
_var
in
to_processed_vars
:
_insert_aggregation_ops_for_var
(
target_program
,
_var
)
...
...
@@ -489,11 +520,12 @@ class PartialProgramLayer:
if
isinstance
(
out
,
framework
.
Variable
):
targets
.
append
(
program
.
global_block
().
var
(
out
.
name
))
if
targets
and
self
.
_params
:
if
targets
:
backward
.
gradients
(
targets
=
targets
,
inputs
=
[])
start_idx
=
len
(
main_program
.
block
(
0
).
ops
)
+
2
*
len
(
self
.
_outputs
.
tolist
())
start_idx
=
len
(
main_program
.
block
(
0
).
ops
)
+
2
*
len
(
self
.
_outputs
.
tolist
()
)
self
.
prepare_gradient_aggregation
(
start_idx
,
main_program
,
program
)
...
...
@@ -512,7 +544,10 @@ class PartialProgramLayer:
found_param
=
False
for
block
in
program
.
blocks
:
for
op
in
block
.
ops
:
if
param
.
name
in
op
.
input_arg_names
or
param
.
name
in
op
.
output_arg_names
:
if
(
param
.
name
in
op
.
input_arg_names
or
param
.
name
in
op
.
output_arg_names
):
required_params
.
append
(
param
)
found_param
=
True
break
...
...
@@ -529,15 +564,21 @@ class PartialProgramLayer:
var_desc
=
block
.
vars
[
name
].
desc
var_base
=
None
if
not
framework
.
_in_eager_mode_
:
var_base
=
core
.
VarBase
(
var_desc
.
dtype
(),
var_base
=
core
.
VarBase
(
var_desc
.
dtype
(),
var_desc
.
shape
(),
var_desc
.
name
(),
var_desc
.
type
(),
False
)
var_desc
.
type
(),
False
,
)
else
:
var_base
=
core
.
eager
.
Tensor
(
var_desc
.
dtype
(),
var_base
=
core
.
eager
.
Tensor
(
var_desc
.
dtype
(),
var_desc
.
shape
(),
var_desc
.
name
(),
var_desc
.
type
(),
False
)
var_desc
.
type
(),
False
,
)
double_grads
.
append
(
var_base
)
return
self
.
_valid_vars
(
double_grads
)
...
...
@@ -557,36 +598,62 @@ class PartialProgramLayer:
attrs
=
[
'global_block'
,
self
.
program
.
desc
.
block
(
0
),
'start_op_index'
,
0
,
'end_op_index'
,
self
.
_get_end_op_index
(),
'is_test'
,
not
self
.
training
,
'program_id'
,
self
.
program_id
self
.
program
.
desc
.
block
(
0
),
'start_op_index'
,
0
,
'end_op_index'
,
self
.
_get_end_op_index
(),
'is_test'
,
not
self
.
training
,
'program_id'
,
self
.
program_id
,
]
if
self
.
_cuda_graph_capture_mode
:
attrs
.
extend
(
(
'cuda_graph_capture_mode'
,
self
.
_cuda_graph_capture_mode
,
'cuda_graph_pool_id'
,
self
.
_cuda_graph_pool_id
))
use_interpretorcore
=
_is_enable_standalone_executor
(
)
and
_is_dy2st_enable_standalone_executor
()
(
'cuda_graph_capture_mode'
,
self
.
_cuda_graph_capture_mode
,
'cuda_graph_pool_id'
,
self
.
_cuda_graph_pool_id
,
)
)
use_interpretorcore
=
(
_is_enable_standalone_executor
()
and
_is_dy2st_enable_standalone_executor
()
)
attrs
.
extend
((
'use_interpretorcore'
,
use_interpretorcore
))
if
use_interpretorcore
:
attrs
.
extend
(
(
'forward_global_block'
,
self
.
forward_program
.
desc
.
block
(
0
),
'backward_global_block'
,
self
.
backward_program
.
desc
.
block
(
0
)))
(
'forward_global_block'
,
self
.
forward_program
.
desc
.
block
(
0
),
'backward_global_block'
,
self
.
backward_program
.
desc
.
block
(
0
),
)
)
_legacy_C_ops
.
run_program
(
self
.
_valid_vars
(
in_vars
),
self
.
_valid_vars
(
self
.
_params
),
self
.
_valid_vars
(
in_vars
),
self
.
_valid_vars
(
self
.
_params
),
self
.
_valid_vars
(
out_vars
),
self
.
_create_scope_vec
(
program_id
=
self
.
program_id
,
use_scope_cache
=
True
),
self
.
_double_grads
,
self
.
_cuda_graph_vec
,
*
attrs
)
self
.
_create_scope_vec
(
program_id
=
self
.
program_id
,
use_scope_cache
=
True
),
self
.
_double_grads
,
self
.
_cuda_graph_vec
,
*
attrs
)
else
:
_legacy_C_ops
.
run_program
(
self
.
_valid_vars
(
in_vars
),
_legacy_C_ops
.
run_program
(
self
.
_valid_vars
(
in_vars
),
self
.
_valid_vars
(
self
.
_params
),
self
.
_valid_vars
(
out_vars
),
self
.
_create_scope_vec
(),
self
.
_double_grads
,
self
.
_cuda_graph_vec
,
*
attrs
)
self
.
_double_grads
,
self
.
_cuda_graph_vec
,
*
attrs
)
restored_nest_out
=
self
.
_restore_out
(
out_vars
)
return
self
.
_remove_no_value
(
restored_nest_out
)
...
...
@@ -594,9 +661,11 @@ class PartialProgramLayer:
if
_in_pure_fp16_guard
():
for
i
,
var
in
enumerate
(
in_vars
):
name
=
var
.
name
if
(
self
.
program
.
global_block
().
has_var
(
name
)
if
(
self
.
program
.
global_block
().
has_var
(
name
)
and
self
.
program
.
global_block
().
var
(
name
).
dtype
==
paddle
.
float16
):
==
paddle
.
float16
):
in_vars
[
i
]
=
var
.
astype
(
'float16'
)
in_vars
[
i
].
name
=
name
...
...
@@ -627,25 +696,32 @@ class PartialProgramLayer:
return
self
.
_infer_program
@
switch_to_static_graph
def
_get_forward_backward_program_form
(
self
,
whole_program
,
forward_end_op_index
):
def
_get_forward_backward_program_form
(
self
,
whole_program
,
forward_end_op_index
):
forward_builded_program
=
add_build_strategy_for
(
whole_program
,
0
,
forward_end_op_index
,
self
.
_build_strategy
)
whole_program
,
0
,
forward_end_op_index
,
self
.
_build_strategy
)
backward_start_op_index
=
forward_end_op_index
+
2
*
len
(
self
.
_outputs
.
var_ids
)
self
.
_outputs
.
var_ids
)
backward_end_op_index
=
whole_program
.
desc
.
block
(
0
).
op_size
()
backward_builded_program
=
add_build_strategy_for
(
whole_program
,
backward_start_op_index
,
backward_end_op_index
,
self
.
_build_strategy
)
self
.
_apply_inplace_pass
(
forward_builded_program
,
backward_builded_program
)
whole_program
,
backward_start_op_index
,
backward_end_op_index
,
self
.
_build_strategy
,
)
self
.
_apply_inplace_pass
(
forward_builded_program
,
backward_builded_program
)
return
[
forward_builded_program
,
backward_builded_program
]
def
_apply_inplace_pass
(
self
,
forward_program
,
backward_program
):
attr_types
=
{
"use_cuda"
:
"bool"
,
"mem_opt_skip_vars"
:
"list[str]"
,
"for_partial_block"
:
"bool"
"for_partial_block"
:
"bool"
,
}
empty_startup_program
=
paddle
.
static
.
Program
()
use_cuda
=
True
if
core
.
is_compiled_with_cuda
()
else
False
...
...
@@ -667,22 +743,33 @@ class PartialProgramLayer:
forward_mem_opt_skip_vars
.
append
(
var
.
desc
.
name
())
backward_mem_opt_skip_vars
.
append
(
var
.
desc
.
name
())
for
var_name
in
core
.
parse_safe_eager_deletion_skip_vars
(
backward_program
.
desc
):
backward_program
.
desc
):
forward_mem_opt_skip_vars
.
append
(
var_name
)
attrs
=
{
"use_cuda"
:
use_cuda
,
"mem_opt_skip_vars"
:
forward_mem_opt_skip_vars
,
"for_partial_block"
:
True
"for_partial_block"
:
True
,
}
_apply_pass
(
forward_program
,
empty_startup_program
,
"buffer_shared_inplace_pass"
,
attrs
,
attr_types
)
_apply_pass
(
forward_program
,
empty_startup_program
,
"buffer_shared_inplace_pass"
,
attrs
,
attr_types
,
)
attrs
=
{
"use_cuda"
:
use_cuda
,
"mem_opt_skip_vars"
:
backward_mem_opt_skip_vars
,
"for_partial_block"
:
True
"for_partial_block"
:
True
,
}
_apply_pass
(
backward_program
,
empty_startup_program
,
"buffer_shared_inplace_pass"
,
attrs
,
attr_types
)
_apply_pass
(
backward_program
,
empty_startup_program
,
"buffer_shared_inplace_pass"
,
attrs
,
attr_types
,
)
def
_prepare
(
self
,
inputs
):
"""
...
...
@@ -698,23 +785,28 @@ class PartialProgramLayer:
if
isinstance
(
value
,
np
.
ndarray
):
var
=
None
if
not
framework
.
_in_eager_mode_
:
var
=
core
.
VarBase
(
value
=
value
,
var
=
core
.
VarBase
(
value
=
value
,
name
=
self
.
_inputs
[
i
].
desc
.
name
(),
persistable
=
False
,
place
=
expected_place
,
zero_copy
=
True
)
zero_copy
=
True
,
)
else
:
var
=
core
.
eager
.
Tensor
(
value
=
value
,
var
=
core
.
eager
.
Tensor
(
value
=
value
,
name
=
self
.
_inputs
[
i
].
desc
.
name
(),
persistable
=
False
,
place
=
expected_place
,
zero_copy
=
True
)
zero_copy
=
True
,
)
elif
isinstance
(
value
,
(
core
.
VarBase
,
core
.
eager
.
Tensor
)):
# NOTE(Aurelius84): If var is on CPUPlace, it will be transformed multi times
# into CUDAPlace when it's as input of multi Ops. so we move it in advance
# to avoid this problem.
if
value
.
stop_gradient
and
not
value
.
place
.
_equals
(
expected_place
):
expected_place
):
var
=
value
.
_copy_to
(
expected_place
,
False
)
var
.
stop_gradient
=
True
else
:
...
...
@@ -737,12 +829,21 @@ class PartialProgramLayer:
return
out_varbase_map
[
var_desc
.
name
()]
if
not
framework
.
_in_eager_mode_
:
var_base
=
core
.
VarBase
(
var_desc
.
dtype
(),
var_desc
.
shape
(),
var_desc
.
name
(),
var_desc
.
type
(),
False
)
var_base
=
core
.
VarBase
(
var_desc
.
dtype
(),
var_desc
.
shape
(),
var_desc
.
name
(),
var_desc
.
type
(),
False
,
)
else
:
var_base
=
core
.
eager
.
Tensor
(
var_desc
.
dtype
(),
var_desc
.
shape
(),
var_desc
.
name
(),
var_desc
.
type
(),
False
)
var_base
=
core
.
eager
.
Tensor
(
var_desc
.
dtype
(),
var_desc
.
shape
(),
var_desc
.
name
(),
var_desc
.
type
(),
False
,
)
var_base
.
stop_gradient
=
var
.
stop_gradient
out_varbase_map
[
var_desc
.
name
()]
=
var_base
return
var_base
...
...
@@ -755,20 +856,30 @@ class PartialProgramLayer:
def
_create_scope_vec
(
self
,
program_id
=
None
,
use_scope_cache
=
False
):
# Hold forward variables
tmp_scope_vec
=
None
inner_scope
=
self
.
_get_scope
(
program_id
=
program_id
,
use_scope_cache
=
use_scope_cache
)
inner_scope
=
self
.
_get_scope
(
program_id
=
program_id
,
use_scope_cache
=
use_scope_cache
)
if
not
framework
.
_in_eager_mode_
:
tmp_scope_vec
=
core
.
VarBase
(
core
.
VarDesc
.
VarType
.
FP32
,
[],
tmp_scope_vec
=
core
.
VarBase
(
core
.
VarDesc
.
VarType
.
FP32
,
[],
"program_out_scope"
,
core
.
VarDesc
.
VarType
.
STEP_SCOPES
,
True
)
core
.
VarDesc
.
VarType
.
STEP_SCOPES
,
True
,
)
tmp_scope_vec
.
value
().
set_scope
(
inner_scope
)
else
:
tmp_scope_vec
=
[
inner_scope
]
return
tmp_scope_vec
def
_create_cuda_graph_vec
(
self
):
var
=
core
.
VarBase
(
core
.
VarDesc
.
VarType
.
FP32
,
[],
"cuda_graph"
,
core
.
VarDesc
.
VarType
.
RAW
,
True
)
var
=
core
.
VarBase
(
core
.
VarDesc
.
VarType
.
FP32
,
[],
"cuda_graph"
,
core
.
VarDesc
.
VarType
.
RAW
,
True
,
)
var
.
stop_gradient
=
True
return
var
...
...
@@ -791,8 +902,9 @@ class PartialProgramLayer:
return
main_program
.
clone
(
for_test
=
True
)
def
_is_no_value
(
self
,
var
):
if
isinstance
(
var
,
(
core
.
VarBase
,
core
.
eager
.
Tensor
))
and
var
.
shape
==
[
1
]:
if
isinstance
(
var
,
(
core
.
VarBase
,
core
.
eager
.
Tensor
))
and
var
.
shape
==
[
1
]:
# NOTE: .numpy() will insert MemcpySync operation, it hits performance.
if
var
.
numpy
()[
0
]
==
RETURN_NO_VALUE_MAGIC_NUM
:
return
True
...
...
@@ -808,13 +920,14 @@ class PartialProgramLayer:
return
out_vars
elif
isinstance
(
out_vars
,
(
tuple
,
list
)):
if
isinstance
(
out_vars
,
tuple
):
res
=
tuple
(
var
for
var
in
out_vars
if
not
self
.
_is_no_value
(
var
))
res
=
tuple
(
var
for
var
in
out_vars
if
not
self
.
_is_no_value
(
var
)
)
else
:
# isinstance(out_vars, list)
res
=
[
var
for
var
in
out_vars
if
not
self
.
_is_no_value
(
var
)]
has_removed
=
(
len
(
out_vars
)
>
len
(
res
)
)
has_removed
=
len
(
out_vars
)
>
len
(
res
)
# len(out_vars) > len(res) means we have removed var. This is
# preventing out_vars is empty or just one element at the beginning
if
len
(
res
)
==
0
and
has_removed
:
...
...
@@ -835,7 +948,8 @@ class PartialProgramLayer:
for
param
in
params
:
grad_name
=
param
.
name
+
core
.
grad_var_suffix
()
grad_var
=
train_program
.
desc
.
block
(
0
).
find_var
(
cpt
.
to_bytes
(
grad_name
))
cpt
.
to_bytes
(
grad_name
)
)
# NOTE: cannot find var desc maybe no problem, such as in batch_norm
if
grad_var
is
None
:
continue
...
...
@@ -864,15 +978,18 @@ class PartialProgramLayer:
if
not
isinstance
(
self
.
_params
,
(
list
,
tuple
)):
raise
TypeError
(
"Type of self._params in PartialProgramLayer should be list or tuple, but received %s."
%
type
(
self
.
_params
))
%
type
(
self
.
_params
)
)
param_and_buffer_names_set
=
set
()
for
i
,
var
in
enumerate
(
self
.
_params
):
# self._params constains parameters and buffers with persistable=True.
if
not
isinstance
(
var
,
(
core
.
VarBase
,
core
.
eager
.
Tensor
)):
raise
TypeError
(
'Type of self._params[{}] in PartialProgramLayer should be Parameter or Variable, but received {}.'
.
format
(
i
,
type
(
var
)))
'Type of self._params[{}] in PartialProgramLayer should be Parameter or Variable, but received {}.'
.
format
(
i
,
type
(
var
)
)
)
param_and_buffer_names_set
.
add
(
var
.
name
)
for
block
in
main_program
.
blocks
:
...
...
@@ -886,15 +1003,11 @@ class PartialProgramLayer:
"
\n\t
Revise suggestion: "
"
\n\t\t
1. Please ensure all your sublayers are inheritted from nn.Layer."
"
\n\t\t
2. Please use nn.ParameterList and nn.LayerList as container instead of using a native Python container such as List"
%
name
)
%
name
)
def
_valid_vars
(
self
,
vars
):
"""
Note: run_program_op.InferShape requires `X`/'Out' not be null.
But it's common in dy2static, fake varBase is created to handle the
problem.
"""
return
vars
if
vars
else
self
.
__fake_vars
return
vars
if
vars
else
None
def
_create_fake_var
():
...
...
@@ -903,13 +1016,23 @@ def _create_fake_var():
"""
if
not
framework
.
_in_eager_mode_
:
return
[
core
.
VarBase
(
core
.
VarDesc
.
VarType
.
FP32
,
[],
"Fake_var"
,
core
.
VarDesc
.
VarType
.
RAW
,
False
)
core
.
VarBase
(
core
.
VarDesc
.
VarType
.
FP32
,
[],
"Fake_var"
,
core
.
VarDesc
.
VarType
.
RAW
,
False
,
)
]
else
:
return
[
core
.
eager
.
Tensor
(
core
.
VarDesc
.
VarType
.
FP32
,
[],
"Fake_var"
,
core
.
VarDesc
.
VarType
.
RAW
,
False
)
core
.
eager
.
Tensor
(
core
.
VarDesc
.
VarType
.
FP32
,
[],
"Fake_var"
,
core
.
VarDesc
.
VarType
.
RAW
,
False
,
)
]
...
...
@@ -918,23 +1041,27 @@ def partial_program_from(concrete_program):
if
inputs
and
isinstance
(
inputs
[
0
],
layers
.
Layer
):
inputs
=
inputs
[
1
:]
return
PartialProgramLayer
(
concrete_program
.
main_program
,
inputs
,
return
PartialProgramLayer
(
concrete_program
.
main_program
,
inputs
,
concrete_program
.
outputs
,
concrete_program
.
parameters
,
**
concrete_program
.
kwargs
)
**
concrete_program
.
kwargs
)
@
switch_to_static_graph
def
add_build_strategy_for
(
program
,
start_op_index
,
end_op_index
,
build_strategy
=
None
):
if
(
start_op_index
<
end_op_index
):
def
add_build_strategy_for
(
program
,
start_op_index
,
end_op_index
,
build_strategy
=
None
):
if
start_op_index
<
end_op_index
:
compiled_program
=
paddle
.
static
.
CompiledProgram
(
core
.
Graph
(
program
.
desc
,
start_op_index
,
end_op_index
),
build_strategy
=
build_strategy
)
compiled_program
.
_compile
(
core
.
Scope
(),
framework
.
_current_expected_place
())
build_strategy
=
build_strategy
,
)
compiled_program
.
_compile
(
core
.
Scope
(),
framework
.
_current_expected_place
()
)
ir_graph
=
framework
.
IrGraph
(
compiled_program
.
_graph
)
builded_program
=
ir_graph
.
to_program
()
if
hasattr
(
compiled_program
.
_program
,
'lr_sheduler'
):
...
...
python/paddle/fluid/dygraph/layers.py
View file @
dbe08e9b
...
...
@@ -32,12 +32,25 @@ from . import parallel_helper
from
..
import
unique_name
from
paddle.fluid
import
core
from
.layer_object_helper
import
LayerObjectHelper
from
.layer_hooks
import
record_program_ops_pre_hook
,
set_op_customized_attrs_post_hook
,
LayerOpsRecoder
from
.base
import
program_desc_tracing_guard
,
param_guard
,
in_declarative_mode
,
_convert_into_variable
from
.layer_hooks
import
(
record_program_ops_pre_hook
,
set_op_customized_attrs_post_hook
,
LayerOpsRecoder
,
)
from
.base
import
(
program_desc_tracing_guard
,
param_guard
,
in_declarative_mode
,
_convert_into_variable
,
)
from
paddle.fluid
import
framework
from
..param_attr
import
ParamAttr
from
paddle.fluid.executor
import
Executor
,
global_scope
from
paddle.fluid.framework
import
_non_static_mode
,
convert_np_dtype_to_dtype_
,
in_dygraph_mode
from
paddle.fluid.framework
import
(
_non_static_mode
,
convert_np_dtype_to_dtype_
,
in_dygraph_mode
,
)
from
paddle.fluid.framework
import
Program
,
program_guard
from
paddle.fluid.framework
import
_current_expected_place
as
_get_device
from
paddle.fluid.core
import
VarDesc
...
...
@@ -67,7 +80,7 @@ def _addindent(string, indent):
class
HookRemoveHelper
(
object
):
"""
A HookRemoveHelper that can be used to remove hook.
"""
"""A HookRemoveHelper that can be used to remove hook."""
next_hook_id
=
0
...
...
@@ -153,13 +166,14 @@ class Layer(object):
def
train
(
self
):
"""
Sets this Layer and all its sublayers to training mode.
This only effects certain modules like `Dropout` and `BatchNorm`.
Returns:
None
Example
:
:
Example
s
:
.. code-block:: python
import paddle
...
...
@@ -236,6 +250,7 @@ class Layer(object):
def
apply
(
self
,
fn
):
"""
Applies ``fn`` recursively to every sublayer (as returned by ``.sublayers()``)
as well as self. Typical use includes initializing the parameters of a model.
...
...
@@ -243,7 +258,7 @@ class Layer(object):
fn (function): a function to be applied to each sublayer
Returns:
Layer
:
self
Layer
,
self
Example::
.. code-block:: python
...
...
@@ -263,6 +278,7 @@ class Layer(object):
net.apply(init_weights)
print(net.state_dict())
"""
for
layer
in
self
.
children
():
layer
.
apply
(
fn
)
...
...
@@ -272,10 +288,12 @@ class Layer(object):
return
self
def
full_name
(
self
):
"""Full name for this layer, composed by name_scope + "/" + MyLayer.__class__.__name__
"""
Full name for this layer, composed by name_scope + "/" + MyLayer.__class__.__name__
Returns:
str
:
full name of this layer.
str
,
full name of this layer.
Example::
.. code-block:: python
...
...
@@ -297,7 +315,9 @@ class Layer(object):
return
self
.
_full_name
def
register_forward_post_hook
(
self
,
hook
):
"""Register a forward post-hook for Layer. The hook will be called after `forward` function has been computed.
"""
Register a forward post-hook for Layer. The hook will be called after `forward` function has been computed.
It should have the following form, `input` and `output` of the `hook` is `input` and `output` of the `Layer` respectively.
User can use forward post-hook to change the output of the Layer or perform information statistics tasks on the Layer.
...
...
@@ -308,7 +328,7 @@ class Layer(object):
hook(function): a function registered as a forward post-hook
Returns:
HookRemoveHelper
:
a HookRemoveHelper object that can be used to remove the added hook by calling `hook_remove_helper.remove()` .
HookRemoveHelper
,
a HookRemoveHelper object that can be used to remove the added hook by calling `hook_remove_helper.remove()` .
Examples:
.. code-block:: python
...
...
@@ -340,13 +360,16 @@ class Layer(object):
# hook change the linear's output to output * 2, so out0 is equal to out1 * 2.
assert (out0.numpy() == (out1.numpy()) * 2).any()
"""
hook_remove_helper
=
HookRemoveHelper
(
self
.
_forward_post_hooks
)
self
.
_forward_post_hooks
[
hook_remove_helper
.
_hook_id
]
=
hook
return
hook_remove_helper
def
register_forward_pre_hook
(
self
,
hook
):
"""Register a forward pre-hook for Layer. The hook will be called before `forward` function has been computed.
"""
Register a forward pre-hook for Layer. The hook will be called before `forward` function has been computed.
It should have the following form, `input` of the `hook` is `input` of the `Layer`,
hook can either return a tuple or a single modified value in the hook. We will wrap the value into a tuple if
...
...
@@ -359,7 +382,7 @@ class Layer(object):
hook(function): a function registered as a forward pre-hook
Returns:
HookRemoveHelper
:
a HookRemoveHelper object that can be used to remove the added hook by calling `hook_remove_helper.remove()` .
HookRemoveHelper
,
a HookRemoveHelper object that can be used to remove the added hook by calling `hook_remove_helper.remove()` .
Examples:
.. code-block:: python
...
...
@@ -398,12 +421,14 @@ class Layer(object):
self
.
_forward_pre_hooks
[
hook_remove_helper
.
_hook_id
]
=
hook
return
hook_remove_helper
def
create_parameter
(
self
,
def
create_parameter
(
self
,
shape
,
attr
=
None
,
dtype
=
None
,
is_bias
=
False
,
default_initializer
=
None
):
default_initializer
=
None
,
):
"""Create parameters for this layer.
Parameters:
...
...
@@ -443,12 +468,15 @@ class Layer(object):
temp_attr
=
copy
.
deepcopy
(
attr
)
if
isinstance
(
temp_attr
,
six
.
string_types
)
and
temp_attr
==
""
:
temp_attr
=
None
return
self
.
_helper
.
create_parameter
(
temp_attr
,
shape
,
dtype
,
is_bias
,
default_initializer
)
return
self
.
_helper
.
create_parameter
(
temp_attr
,
shape
,
dtype
,
is_bias
,
default_initializer
)
@
deprecated
(
since
=
"2.0.0"
,
@
deprecated
(
since
=
"2.0.0"
,
update_to
=
"paddle.nn.Layer.create_tensor"
,
reason
=
"New api in create_tensor, easier to use."
)
reason
=
"New api in create_tensor, easier to use."
,
)
def
create_variable
(
self
,
name
=
None
,
persistable
=
None
,
dtype
=
None
):
"""
...
...
@@ -488,14 +516,16 @@ class Layer(object):
if
name
is
not
None
:
var_name
=
"."
.
join
([
self
.
_full_name
,
name
])
else
:
var_name
=
unique_name
.
generate
(
"."
.
join
(
[
self
.
_full_name
,
"_generated_var"
]))
var_name
=
unique_name
.
generate
(
"."
.
join
([
self
.
_full_name
,
"_generated_var"
])
)
return
self
.
_helper
.
main_program
.
current_block
().
create_var
(
name
=
var_name
,
persistable
=
persistable
,
dtype
=
dtype
,
type
=
core
.
VarDesc
.
VarType
.
LOD_TENSOR
)
type
=
core
.
VarDesc
.
VarType
.
LOD_TENSOR
,
)
# TODO: Add more parameter list when we need them
def
create_tensor
(
self
,
name
=
None
,
persistable
=
None
,
dtype
=
None
):
...
...
@@ -538,20 +568,24 @@ class Layer(object):
if
name
is
not
None
:
var_name
=
"."
.
join
([
self
.
_full_name
,
name
])
else
:
var_name
=
unique_name
.
generate
(
"."
.
join
(
[
self
.
_full_name
,
"_generated_var"
]))
var_name
=
unique_name
.
generate
(
"."
.
join
([
self
.
_full_name
,
"_generated_var"
])
)
return
self
.
_helper
.
main_program
.
current_block
().
create_var
(
name
=
var_name
,
persistable
=
persistable
,
dtype
=
dtype
,
type
=
core
.
VarDesc
.
VarType
.
LOD_TENSOR
)
type
=
core
.
VarDesc
.
VarType
.
LOD_TENSOR
,
)
def
parameters
(
self
,
include_sublayers
=
True
):
"""Returns a list of all Parameters from current layer and its sub-layers.
"""
Returns a list of all Parameters from current layer and its sub-layers.
Returns:
list of Tensor
:
a list of Parameters.
list of Tensor
,
a list of Parameters.
Examples:
.. code-block:: python
...
...
@@ -563,13 +597,17 @@ class Layer(object):
"""
ret
=
[
param
for
_
,
param
in
self
.
named_parameters
(
include_sublayers
=
include_sublayers
)
param
for
_
,
param
in
self
.
named_parameters
(
include_sublayers
=
include_sublayers
)
]
return
ret
def
children
(
self
):
"""Returns an iterator over immediate children layers.
"""
Returns an iterator over immediate children layers.
Yields:
Layer: a child layer
...
...
@@ -619,13 +657,15 @@ class Layer(object):
yield
name
,
layer
def
sublayers
(
self
,
include_self
=
False
):
"""Returns a list of sub layers.
"""
Returns a list of sub layers.
Parameters:
include_self(bool, optional): Whether return self as sublayers. Default: False
Returns:
list of Layer
:
a list of sub layers.
list of Layer
,
a list of sub layers.
Examples:
.. code-block:: python
...
...
@@ -678,9 +718,11 @@ class Layer(object):
"""
params_set
=
set
()
named_sublayers
=
self
.
named_sublayers
(
prefix
=
prefix
,
include_self
=
True
)
if
include_sublayers
else
zip
(
[
prefix
],
[
self
])
named_sublayers
=
(
self
.
named_sublayers
(
prefix
=
prefix
,
include_self
=
True
)
if
include_sublayers
else
zip
([
prefix
],
[
self
])
)
for
layer_prefix
,
sublayer
in
named_sublayers
:
params
=
sublayer
.
_parameters
.
items
()
for
key
,
param
in
params
:
...
...
@@ -724,9 +766,9 @@ class Layer(object):
if
layer
is
None
:
continue
layer_prefix
=
prefix
+
(
'.'
if
prefix
else
''
)
+
key
for
p
,
l
in
layer
.
named_sublayers
(
prefix
=
layer_prefix
,
include_self
=
True
,
layers_set
=
layers_set
):
for
p
,
l
in
layer
.
named_sublayers
(
prefix
=
layer_prefix
,
include_self
=
True
,
layers_set
=
layers_set
):
yield
p
,
l
def
register_buffer
(
self
,
name
,
tensor
,
persistable
=
True
):
...
...
@@ -769,25 +811,32 @@ class Layer(object):
if
'_buffers'
not
in
self
.
__dict__
:
raise
ValueError
(
"super(YourLayer, self).__init__() should be called first"
)
"super(YourLayer, self).__init__() should be called first"
)
elif
not
isinstance
(
name
,
six
.
string_types
):
raise
TypeError
(
"The name of buffer should be a string, but received {}."
.
format
(
type
(
name
).
__name__
))
"The name of buffer should be a string, but received {}."
.
format
(
type
(
name
).
__name__
)
)
elif
'.'
in
name
:
raise
KeyError
(
"The name of buffer can not contain `.`, "
"because when you access the newly added buffer in the "
"form of `self.**.**`, it will cause AttributeError."
)
"form of `self.**.**`, it will cause AttributeError."
)
elif
name
==
''
:
raise
KeyError
(
"The name of buffer can not be empty."
)
elif
hasattr
(
self
,
name
)
and
name
not
in
self
.
_buffers
:
raise
KeyError
(
"attribute '{}' already exists."
.
format
(
name
))
elif
tensor
is
not
None
and
not
(
type
(
tensor
)
==
core
.
VarBase
or
type
(
tensor
)
==
core
.
eager
.
Tensor
):
elif
tensor
is
not
None
and
not
(
type
(
tensor
)
==
core
.
VarBase
or
type
(
tensor
)
==
core
.
eager
.
Tensor
):
raise
TypeError
(
"The registered buffer should be a Paddle.Tensor, but received {}."
.
format
(
type
(
tensor
).
__name__
))
"The registered buffer should be a Paddle.Tensor, but received {}."
.
format
(
type
(
tensor
).
__name__
)
)
else
:
self
.
_buffers
[
name
]
=
tensor
if
persistable
:
...
...
@@ -797,13 +846,14 @@ class Layer(object):
def
buffers
(
self
,
include_sublayers
=
True
):
"""
Returns a list of all buffers from current layer and its sub-layers.
Parameters:
include_sublayers(bool, optional): Whether include the buffers of sublayers. If True, also include the buffers from sublayers. Default: True
Returns:
list of Tensor
:
a list of buffers.
list of Tensor
,
a list of buffers.
Examples:
.. code-block:: python
...
...
@@ -820,8 +870,10 @@ class Layer(object):
"""
ret
=
[
buffer
for
_
,
buffer
in
self
.
named_buffers
(
include_sublayers
=
include_sublayers
)
buffer
for
_
,
buffer
in
self
.
named_buffers
(
include_sublayers
=
include_sublayers
)
]
return
ret
...
...
@@ -862,9 +914,11 @@ class Layer(object):
"""
buffers_set
=
set
()
named_sublayers
=
self
.
named_sublayers
(
prefix
=
prefix
,
include_self
=
True
)
if
include_sublayers
else
zip
(
[
prefix
],
[
self
])
named_sublayers
=
(
self
.
named_sublayers
(
prefix
=
prefix
,
include_self
=
True
)
if
include_sublayers
else
zip
([
prefix
],
[
self
])
)
for
layer_prefix
,
sublayer
in
named_sublayers
:
buffers
=
sublayer
.
_buffers
.
items
()
for
key
,
buffer
in
buffers
:
...
...
@@ -910,7 +964,7 @@ class Layer(object):
hook_result
=
forward_pre_hook
(
self
,
inputs
)
if
hook_result
is
not
None
:
if
not
isinstance
(
hook_result
,
tuple
):
hook_result
=
(
hook_result
,
)
hook_result
=
(
hook_result
,)
inputs
=
hook_result
if
not
self
.
_built
:
...
...
@@ -920,16 +974,20 @@ class Layer(object):
# TODO(liuyuhui) Only xpu broadcast parameters here.
# The other device is to call _sync_params_buffers in DataParallel
# to realize the parameter synchronization among multiply cards.
if
parallel_helper
.
_is_data_parallel_mode
(
)
and
paddle
.
is_compiled_with_xpu
():
if
(
parallel_helper
.
_is_data_parallel_mode
()
and
paddle
.
is_compiled_with_xpu
()
):
parallel_helper
.
_broadcast_parameters
(
self
.
_parameters
.
values
())
self
.
_parameters
.
values
()
)
self
.
_built
=
True
if
in_profiler_mode
():
with
profiler
.
RecordEvent
(
self
.
__class__
.
__name__
,
profiler
.
TracerEventType
.
Forward
):
with
profiler
.
RecordEvent
(
self
.
__class__
.
__name__
,
profiler
.
TracerEventType
.
Forward
):
outputs
=
self
.
forward
(
*
inputs
,
**
kwargs
)
else
:
outputs
=
self
.
forward
(
*
inputs
,
**
kwargs
)
...
...
@@ -942,8 +1000,14 @@ class Layer(object):
return
outputs
def
__call__
(
self
,
*
inputs
,
**
kwargs
):
if
(
not
in_declarative_mode
())
and
(
not
self
.
_forward_pre_hooks
)
\
and
(
not
self
.
_forward_post_hooks
)
and
(
not
self
.
_built
)
and
in_dygraph_mode
()
and
(
not
in_profiler_mode
()):
if
(
(
not
in_declarative_mode
())
and
(
not
self
.
_forward_pre_hooks
)
and
(
not
self
.
_forward_post_hooks
)
and
(
not
self
.
_built
)
and
in_dygraph_mode
()
and
(
not
in_profiler_mode
())
):
self
.
_build_once
(
*
inputs
,
**
kwargs
)
return
self
.
forward
(
*
inputs
,
**
kwargs
)
else
:
...
...
@@ -964,7 +1028,9 @@ class Layer(object):
raise
ValueError
(
"Layer shouldn't implement backward"
)
def
add_sublayer
(
self
,
name
,
sublayer
):
"""Adds a sub Layer instance.
"""
Adds a sub Layer instance.
Added sublayer can be accessed by self.name
...
...
@@ -972,7 +1038,7 @@ class Layer(object):
name(str): name of this sublayer.
sublayer(Layer): an instance of Layer.
Returns:
Layer
:
the sublayer passed in.
Layer
,
the sublayer passed in.
Examples:
.. code-block:: python
...
...
@@ -999,8 +1065,9 @@ class Layer(object):
model = MySequential(fc1, fc2)
for prefix, layer in model.named_sublayers():
print(prefix, layer)
"""
assert
(
isinstance
(
sublayer
,
Layer
)
or
sublayer
==
None
)
assert
isinstance
(
sublayer
,
Layer
)
or
sublayer
==
None
self
.
_sub_layers
[
name
]
=
sublayer
return
sublayer
...
...
@@ -1014,7 +1081,7 @@ class Layer(object):
name(str): name of this sublayer.
parameter(Parameter): an instance of Parameter.
Returns:
Parameter
:
the parameter passed in.
Parameter
,
the parameter passed in.
Examples:
.. code-block:: python
...
...
@@ -1037,32 +1104,42 @@ class Layer(object):
"""
if
'_parameters'
not
in
self
.
__dict__
:
raise
RuntimeError
(
"super(YourLayer, self).__init__() should be called firstly."
)
"super(YourLayer, self).__init__() should be called firstly."
)
elif
not
isinstance
(
name
,
six
.
string_types
):
raise
TypeError
(
"The name of parameter should be a string, but received {}."
.
format
(
type
(
name
).
__name__
))
"The name of parameter should be a string, but received {}."
.
format
(
type
(
name
).
__name__
)
)
elif
'.'
in
name
:
raise
KeyError
(
"The name of parameter can not contain `.`, "
"because when you access the newly added parameter in the "
"form of `self.**.**`, it will cause AttributeError."
)
"form of `self.**.**`, it will cause AttributeError."
)
elif
name
==
''
:
raise
KeyError
(
"The name of parameter can not be empty."
)
elif
hasattr
(
self
,
name
)
and
name
not
in
self
.
_parameters
:
raise
KeyError
(
"The parameter '{}' already exists."
.
format
(
name
))
elif
parameter
is
not
None
and
not
isinstance
(
parameter
,
framework
.
Parameter
):
elif
parameter
is
not
None
and
not
isinstance
(
parameter
,
framework
.
Parameter
):
raise
TypeError
(
"The parameter to be added should be a Parameter, but received {}."
.
format
(
type
(
parameter
).
__name__
))
"The parameter to be added should be a Parameter, but received {}."
.
format
(
type
(
parameter
).
__name__
)
)
else
:
if
parameter
is
None
:
self
.
_parameters
[
name
]
=
None
if
len
(
self
.
_loaddict_holder
)
>
0
:
assert
parameter
.
name
in
self
.
_loaddict_holder
,
"Parameter not found, Can't not find [ {} ] in state_dict"
.
format
(
parameter
.
name
)
assert
(
parameter
.
name
in
self
.
_loaddict_holder
),
"Parameter not found, Can't not find [ {} ] in state_dict"
.
format
(
parameter
.
name
)
parameter
.
set_value
(
self
.
_loaddict_holder
[
parameter
.
name
])
...
...
@@ -1081,37 +1158,50 @@ class Layer(object):
"""
def
is_already_registered
(
is_pre_hook
):
layers_hooks
=
self
.
_forward_pre_hooks
if
is_pre_hook
else
self
.
_forward_post_hooks
candidate_hook
=
record_program_ops_pre_hook
if
is_pre_hook
else
set_op_customized_attrs_post_hook
layers_hooks
=
(
self
.
_forward_pre_hooks
if
is_pre_hook
else
self
.
_forward_post_hooks
)
candidate_hook
=
(
record_program_ops_pre_hook
if
is_pre_hook
else
set_op_customized_attrs_post_hook
)
already_registed
=
False
if
layers_hooks
:
last_key
=
next
(
reversed
(
layers_hooks
))
already_registed
=
(
layers_hooks
[
last_key
]
==
candidate_hook
)
already_registed
=
layers_hooks
[
last_key
]
==
candidate_hook
return
already_registed
if
not
isinstance
(
attrs
,
dict
):
raise
TypeError
(
"attrs should be type(dict), but received {}"
.
format
(
type
(
attrs
).
__name__
))
type
(
attrs
).
__name__
)
)
# NOTE: Overwrite behavior for same key.
self
.
_customized_attrs
.
update
(
attrs
)
if
not
is_already_registered
(
is_pre_hook
=
True
):
pre_hook_helper
=
self
.
register_forward_pre_hook
(
record_program_ops_pre_hook
)
record_program_ops_pre_hook
)
assert
len
(
self
.
_op_recorder
.
hooks
)
==
0
self
.
_op_recorder
.
hooks
=
[
pre_hook_helper
]
# manually register post_hook to ensure it is inserted into the head.
if
not
is_already_registered
(
is_pre_hook
=
False
):
post_hook_helper
=
self
.
register_forward_post_hook
(
set_op_customized_attrs_post_hook
)
set_op_customized_attrs_post_hook
)
if
len
(
self
.
_forward_post_hooks
)
>
1
:
self
.
_forward_post_hooks
.
move_to_end
(
post_hook_helper
.
_hook_id
,
last
=
False
)
self
.
_forward_post_hooks
.
move_to_end
(
post_hook_helper
.
_hook_id
,
last
=
False
)
assert
len
(
self
.
_op_recorder
.
hooks
)
==
1
...
...
@@ -1144,7 +1234,6 @@ class Layer(object):
return
object
.
__getattribute__
(
self
,
name
)
def
__setattr__
(
self
,
name
,
value
):
def
_remove_if_exist
(
*
dicts
):
for
d
in
dicts
:
if
name
in
d
:
...
...
@@ -1156,10 +1245,14 @@ class Layer(object):
if
isinstance
(
value
,
framework
.
Parameter
):
if
params
is
None
:
raise
ValueError
(
"super(YourLayer, self).__init__() should be called first"
)
"super(YourLayer, self).__init__() should be called first"
)
if
len
(
self
.
_loaddict_holder
)
>
0
:
assert
value
.
name
in
self
.
_loaddict_holder
,
"Parameter not found, Can't not find [ {} ] in state_dict"
.
format
(
value
.
name
)
assert
(
value
.
name
in
self
.
_loaddict_holder
),
"Parameter not found, Can't not find [ {} ] in state_dict"
.
format
(
value
.
name
)
value
.
set_value
(
self
.
_loaddict_holder
[
value
.
name
])
...
...
@@ -1168,9 +1261,10 @@ class Layer(object):
elif
params
is
not
None
and
name
in
params
:
if
value
is
not
None
:
raise
TypeError
(
"assignment to parameter '{}' should be of type Parameter or None, but got '{}'"
.
format
(
name
,
type
(
value
).
__name__
))
"assignment to parameter '{}' should be of type Parameter or None, but got '{}'"
.
format
(
name
,
type
(
value
).
__name__
)
)
params
[
name
]
=
None
else
:
layers
=
self
.
__dict__
.
get
(
'_sub_layers'
,
None
)
...
...
@@ -1185,9 +1279,10 @@ class Layer(object):
elif
layers
is
not
None
and
name
in
layers
:
if
value
is
not
None
:
raise
TypeError
(
"assignment to sublayer '{}' should be of type Layer or None, but got '{}'"
.
format
(
name
,
type
(
value
).
__name__
))
"assignment to sublayer '{}' should be of type Layer or None, but got '{}'"
.
format
(
name
,
type
(
value
).
__name__
)
)
layers
[
name
]
=
None
else
:
_buffers
=
self
.
__dict__
.
get
(
'_buffers'
,
None
)
...
...
@@ -1196,8 +1291,9 @@ class Layer(object):
raise
ValueError
(
"super(YourLayer, self).__init__() should be called first"
)
_remove_if_exist
(
self
.
__dict__
,
self
.
_parameters
,
self
.
_sub_layers
)
_remove_if_exist
(
self
.
__dict__
,
self
.
_parameters
,
self
.
_sub_layers
)
# Set persistable=False by default. Only `register_buffer` can
# add a persistable buffer.
if
name
not
in
self
.
_buffers
:
...
...
@@ -1211,6 +1307,7 @@ class Layer(object):
# value via `assign`.
if
type
(
value
)
==
framework
.
Variable
:
from
paddle
import
assign
# Note(zhhsplendid): the condition below happens in PaddleGan model,
# but should all non-Variable _buffers[name] be re-assign? We
# should consider it in the future. I current wrote this as
...
...
@@ -1218,18 +1315,23 @@ class Layer(object):
if
in_declarative_mode
()
and
_buffers
[
name
]
is
None
:
raise
RuntimeError
(
'In Dy2stat, self.{0} is a buffer and self.{0} is '
'not allowed to be set to Variable when self.{0} is None.'
.
format
(
name
))
elif
_buffers
[
name
]
is
None
or
type
(
getattr
(
self
,
name
))
==
core
.
VarBase
:
'not allowed to be set to Variable when self.{0} is None.'
.
format
(
name
)
)
elif
(
_buffers
[
name
]
is
None
or
type
(
getattr
(
self
,
name
))
==
core
.
VarBase
):
_buffers
[
name
]
=
assign
(
value
)
else
:
assign
(
value
,
getattr
(
self
,
name
))
elif
value
is
not
None
:
raise
TypeError
(
"assignment to buffers '{}' should be of type core.VarBase or None, but got '{}'"
.
format
(
name
,
type
(
value
).
__name__
))
"assignment to buffers '{}' should be of type core.VarBase or None, but got '{}'"
.
format
(
name
,
type
(
value
).
__name__
)
)
else
:
# Assigning None will remove the buffer, but if re-assign a new varBase to it,
# it will be remarked as a buffer with same `persistable` attribute.
...
...
@@ -1316,10 +1418,12 @@ class Layer(object):
self
.
_state_dict_hooks
[
hook_remove_helper
.
_hook_id
]
=
hook
return
hook_remove_helper
def
_obtain_parameters_buffers
(
self
,
def
_obtain_parameters_buffers
(
self
,
destination
=
None
,
include_sublayers
=
True
,
structured_name_prefix
=
""
):
structured_name_prefix
=
""
,
):
"""
The difference from state_dict() is that state_dict_hook will not be called,
but the original types of parameters and buffers will be maintained.
...
...
@@ -1330,7 +1434,10 @@ class Layer(object):
if
data
is
not
None
:
destination
[
structured_name_prefix
+
name
]
=
data
for
name
,
buffer
in
self
.
_buffers
.
items
():
if
buffer
is
not
None
and
name
not
in
self
.
_non_persistable_buffer_names_set
:
if
(
buffer
is
not
None
and
name
not
in
self
.
_non_persistable_buffer_names_set
):
destination
[
structured_name_prefix
+
name
]
=
buffer
if
include_sublayers
:
...
...
@@ -1339,17 +1446,22 @@ class Layer(object):
destination_temp
=
destination
.
copy
()
destination_temp
.
update
(
layer_item
.
_obtain_parameters_buffers
(
destination_temp
,
include_sublayers
,
structured_name_prefix
+
layer_name
+
"."
))
destination_temp
,
include_sublayers
,
structured_name_prefix
+
layer_name
+
"."
,
)
)
destination
=
destination_temp
return
destination
def
_state_dict_impl
(
self
,
def
_state_dict_impl
(
self
,
destination
=
None
,
include_sublayers
=
True
,
structured_name_prefix
=
""
,
include_non_persistable_buffer
=
False
,
use_hook
=
True
):
use_hook
=
True
,
):
"""
Get all parameters and persistable buffers of current layer and its sub-layers. And set them into a dict
...
...
@@ -1367,7 +1479,10 @@ class Layer(object):
destination
[
structured_name_prefix
+
name
]
=
data
for
name
,
buffer
in
self
.
_buffers
.
items
():
if
not
include_non_persistable_buffer
:
if
buffer
is
not
None
and
name
not
in
self
.
_non_persistable_buffer_names_set
:
if
(
buffer
is
not
None
and
name
not
in
self
.
_non_persistable_buffer_names_set
):
destination
[
structured_name_prefix
+
name
]
=
buffer
else
:
if
buffer
is
not
None
:
...
...
@@ -1379,9 +1494,13 @@ class Layer(object):
destination_temp
=
destination
.
copy
()
destination_temp
.
update
(
layer_item
.
_state_dict_impl
(
destination_temp
,
include_sublayers
,
destination_temp
,
include_sublayers
,
structured_name_prefix
+
layer_name
+
"."
,
include_non_persistable_buffer
,
use_hook
))
include_non_persistable_buffer
,
use_hook
,
)
)
destination
=
destination_temp
if
use_hook
:
for
state_dict_hook
in
self
.
_state_dict_hooks
.
values
():
...
...
@@ -1391,12 +1510,15 @@ class Layer(object):
return
destination
def
to_static_state_dict
(
self
,
def
to_static_state_dict
(
self
,
destination
=
None
,
include_sublayers
=
True
,
structured_name_prefix
=
""
,
use_hook
=
True
):
use_hook
=
True
,
):
'''
Get all parameters and buffers of current layer and its sub-layers. And set them into a dict
Parameters:
...
...
@@ -1405,7 +1527,7 @@ class Layer(object):
use_hook(bool, optional) : If true, the operations contained in _state_dict_hooks will be appended to the destination. Default: True
Retruns:
dict
:
a dict contains all the parameters and persistable buffers.
dict
,
a dict contains all the parameters and persistable buffers.
Examples:
.. code-block:: python
...
...
@@ -1423,13 +1545,16 @@ class Layer(object):
include_sublayers
=
include_sublayers
,
structured_name_prefix
=
structured_name_prefix
,
include_non_persistable_buffer
=
True
,
use_hook
=
use_hook
)
use_hook
=
use_hook
,
)
def
state_dict
(
self
,
def
state_dict
(
self
,
destination
=
None
,
include_sublayers
=
True
,
structured_name_prefix
=
""
,
use_hook
=
True
):
use_hook
=
True
,
):
'''
Get all parameters and persistable buffers of current layer and its sub-layers. And set them into a dict
...
...
@@ -1457,7 +1582,8 @@ class Layer(object):
include_sublayers
=
include_sublayers
,
structured_name_prefix
=
structured_name_prefix
,
include_non_persistable_buffer
=
False
,
use_hook
=
use_hook
)
use_hook
=
use_hook
,
)
@
framework
.
deprecate_stat_dict
def
set_state_dict
(
self
,
state_dict
,
use_structured_name
=
True
):
...
...
@@ -1489,22 +1615,31 @@ class Layer(object):
state
=
state_dict
.
get
(
key
,
None
)
if
state
is
None
:
raise
ValueError
(
"{} is not found in the provided dict."
.
format
(
key
))
if
(
isinstance
(
state
,
dict
)
or
isinstance
(
state
,
list
)):
if
(
len
(
state
)
!=
len
(
param
)):
raise
ValueError
(
"{} receieves the length of {}, "
"{} is not found in the provided dict."
.
format
(
key
)
)
if
isinstance
(
state
,
dict
)
or
isinstance
(
state
,
list
):
if
len
(
state
)
!=
len
(
param
):
raise
ValueError
(
"{} receieves the length of {}, "
"but the expected shape is {}"
.
format
(
key
,
len
(
state
),
len
(
param
)))
key
,
len
(
state
),
len
(
param
)
)
)
else
:
return
param
,
state
else
:
state_shape
=
state
.
shape
()
if
inspect
.
ismethod
(
state
.
shape
)
else
state
.
shape
state_shape
=
(
state
.
shape
()
if
inspect
.
ismethod
(
state
.
shape
)
else
state
.
shape
)
if
list
(
state_shape
)
!=
list
(
param
.
shape
):
raise
ValueError
(
"{} receives a shape {}, but the expected shape is {}."
.
format
(
key
,
list
(
state_shape
),
list
(
param
.
shape
)))
"{} receives a shape {}, but the expected shape is {}."
.
format
(
key
,
list
(
state_shape
),
list
(
param
.
shape
)
)
)
return
param
,
state
matched_param_state
=
[]
...
...
@@ -1541,8 +1676,10 @@ class Layer(object):
executor
=
Executor
(
_get_device
()).
_default_executor
# restore parameter states
core
.
_create_loaded_parameter
(
[
param
for
param
,
state
in
matched_param_state
],
global_scope
(),
executor
)
[
param
for
param
,
state
in
matched_param_state
],
global_scope
(),
executor
,
)
for
param
,
state
in
matched_param_state
:
_set_var
(
param
,
state
)
...
...
@@ -1594,11 +1731,13 @@ class Layer(object):
# [ 0.33960250, 0.96878713]])
'''
return
self
.
_to_impl
(
device
=
device
,
return
self
.
_to_impl
(
device
=
device
,
dtype
=
dtype
,
blocking
=
blocking
,
include_sublayers
=
True
,
floating_only
=
False
)
floating_only
=
False
,
)
def
_apply
(
self
,
func
,
device
,
dtype
,
blocking
,
include_sublayers
=
True
):
if
include_sublayers
:
...
...
@@ -1612,8 +1751,9 @@ class Layer(object):
if
param
.
grad
is
not
None
:
with
no_grad
():
grad_applied
=
func
(
param
.
_grad_ivar
(),
device
,
dtype
,
blocking
)
grad_applied
=
func
(
param
.
_grad_ivar
(),
device
,
dtype
,
blocking
)
for
key
,
buf
in
self
.
_buffers
.
items
():
if
buf
is
not
None
:
...
...
@@ -1637,12 +1777,14 @@ class Layer(object):
# Note(zhangbo): Paddle GPU minimum memory allocation unit is 256 bytes, waiting_alloc_memory will comput ‘t’ occupied memory space.
# Coefficient 1.2 is used to avoid OOM that may occur in this critical state when the memory is just enough.
waiting_alloc_memory
=
(
(
np
.
prod
(
t
.
shape
)
*
size_dtype
)
/
256
+
1
)
*
256
*
1.2
((
np
.
prod
(
t
.
shape
)
*
size_dtype
)
/
256
+
1
)
*
256
*
1.2
)
gpu_memory_available
=
core
.
gpu_memory_available
()
if
gpu_memory_available
<
waiting_alloc_memory
:
# Copy param / Tensor to cpu
t_used
=
t
.
_copy_to
(
paddle
.
CPUPlace
(),
blocking
)
# k-v type will error
t_used
=
t
.
_copy_to
(
paddle
.
CPUPlace
(),
blocking
)
# k-v type will error
# Release mem of t
t
.
value
().
get_tensor
().
_clear
()
else
:
...
...
@@ -1653,7 +1795,8 @@ class Layer(object):
# 2. cast param / Tensor to dtype
if
dtype
is
not
None
and
dtype
!=
t_used
.
dtype
:
with
paddle
.
fluid
.
framework
.
_dygraph_place_guard
(
place
=
t_used
.
place
):
place
=
t_used
.
place
):
t_casted
=
t_used
.
cast
(
dtype
=
dtype
)
else
:
t_casted
=
t_used
...
...
@@ -1671,12 +1814,14 @@ class Layer(object):
return
t
def
_to_impl
(
self
,
def
_to_impl
(
self
,
device
=
None
,
dtype
=
None
,
blocking
=
None
,
include_sublayers
=
True
,
floating_only
=
False
):
floating_only
=
False
,
):
'''
Cast the parameters and buffers of Layer by the give device, dtype and blocking.
...
...
@@ -1705,20 +1850,28 @@ class Layer(object):
if
device
is
not
None
:
if
isinstance
(
device
,
str
):
device
=
paddle
.
device
.
_convert_to_place
(
device
)
elif
isinstance
(
device
,
(
core
.
CPUPlace
,
core
.
CUDAPlace
,
core
.
CUDAPinnedPlace
,
core
.
XPUPlace
)):
elif
isinstance
(
device
,
(
core
.
CPUPlace
,
core
.
CUDAPlace
,
core
.
CUDAPinnedPlace
,
core
.
XPUPlace
,
),
):
pass
else
:
raise
ValueError
(
"device value error, must be str, paddle.CPUPlace(), paddle.CUDAPlace(), paddle.CUDAPinnedPlace() or paddle.XPUPlace(), but the type of device is "
+
type
(
device
).
__name__
)
+
type
(
device
).
__name__
)
if
blocking
is
None
:
blocking
=
True
else
:
assert
isinstance
(
blocking
,
bool
),
"blocking value error, must be the True, False or None"
blocking
,
bool
),
"blocking value error, must be the True, False or None"
def
transform
(
t
,
device
,
dtype
,
blocking
):
if
floating_only
and
(
not
paddle
.
is_floating_point
(
t
)):
...
...
python/paddle/fluid/framework.py
View file @
dbe08e9b
...
...
@@ -1352,12 +1352,13 @@ class ParameterMetaClass(VariableMetaClass):
@
six
.
add_metaclass
(
VariableMetaClass
)
class
Variable
(
object
):
"""
**Notes**:
**The constructor of Variable should not be invoked directly.**
**In Static Graph Mode: Please use** `Block.create_var` **to create a Static variable which has no data until being feed.**
Notes:
The constructor of Variable should not be invoked directly.
In Static Graph Mode: Please use ** `Block.create_var` ** to create a Static variable which has no data until being feed.
**
In Dygraph Mode: Please use** :ref:`api_fluid_dygraph_to_variable` **to create a dygraph variable with real data
**
In Dygraph Mode: Please use
** :ref:`api_fluid_dygraph_to_variable` **
to create a dygraph variable with real data
.
In Fluid, every input and output of an OP is a variable. In most
cases, variables are used for holding different kinds of data or training
...
...
@@ -1514,12 +1515,13 @@ class Variable(object):
def
detach
(
self
):
"""
Returns a new Variable, detached from the current graph.
It will share data with origin Variable and without tensor copy.
In addition, the detached Variable doesn't provide gradient propagation.
Returns:
( :ref:`api_guide_Variable_en` | dtype is same as current Variable)
:
The detached Variable.
( :ref:`api_guide_Variable_en` | dtype is same as current Variable)
,
The detached Variable.
Examples:
.. code-block:: python
...
...
@@ -1533,6 +1535,7 @@ class Variable(object):
# create a detached Variable
y = x.detach()
"""
assert
(
...
...
@@ -2085,6 +2088,7 @@ class Variable(object):
@
property
def
T
(
self
):
"""
Permute current Variable with its dimensions reversed.
If `n` is the dimensions of `x` , `x.T` is equivalent to `x.transpose([n-1, n-2, ..., 0])`.
...
...
@@ -2103,6 +2107,7 @@ class Variable(object):
x_T_np = exe.run(paddle.static.default_main_program(), fetch_list=[x_T])[0]
print(x_T_np.shape)
# (5, 3, 2)
"""
if
len
(
self
.
shape
)
==
1
:
return
self
...
...
@@ -2141,7 +2146,7 @@ class Variable(object):
as ``out = assign(tensor)`` .
Returns:
Variable
:
The cloned Variable.
Variable
,
The cloned Variable.
Examples:
.. code-block:: python
...
...
@@ -2171,6 +2176,7 @@ class Variable(object):
def
_set_error_clip
(
self
,
error_clip
):
"""
Set the error_clip.
Args:
...
...
@@ -2178,11 +2184,13 @@ class Variable(object):
Returns:
None
"""
self
.
error_clip
=
error_clip
def
_set_info
(
self
,
key
,
value
):
"""
Set key-value information for this variable.
Args:
...
...
@@ -2191,6 +2199,7 @@ class Variable(object):
Returns:
None
"""
if
not
hasattr
(
self
,
"_info"
):
self
.
_info
=
{}
...
...
@@ -2198,6 +2207,7 @@ class Variable(object):
def
_get_info
(
self
,
key
):
"""
Get the information of this variable corresponding to key.
Args:
...
...
@@ -2205,6 +2215,7 @@ class Variable(object):
Returns:
object
"""
if
hasattr
(
self
,
"_info"
)
and
key
in
self
.
_info
:
return
self
.
_info
[
key
]
...
...
@@ -2212,7 +2223,9 @@ class Variable(object):
def
_slice_indices
(
self
,
slice
,
length
):
"""
Reference implementation for the slice.indices method.
"""
# Compute step and length as integers.
step
=
1
if
slice
.
step
is
None
else
slice
.
step
...
...
@@ -2383,7 +2396,7 @@ class Variable(object):
Default: None
Returns:
Tensor
:
the value in given scope.
Tensor
,
the value in given scope.
Examples:
.. code-block:: python
...
...
@@ -2438,6 +2451,7 @@ class Variable(object):
def
set_value
(
self
,
value
,
scope
=
None
):
'''
Set the value to the tensor in given scope.
Args:
...
...
@@ -2477,6 +2491,7 @@ class Variable(object):
if var.persistable:
t_load = paddle.load(path+var.name+'.pdtensor')
var.set_value(t_load)
'''
# The 'framework' is a low-level module, and 'executor'
...
...
@@ -2547,10 +2562,11 @@ class Variable(object):
def
size
(
self
):
"""
Returns the number of elements for current Variable, which is a int64 Variable with shape [1]
Returns:
Variable
:
the number of elements for current Variable
Variable
,
the number of elements for current Variable
Examples:
.. code-block:: python
...
...
@@ -2564,6 +2580,7 @@ class Variable(object):
# get the number of elements of the Variable
y = x.size()
"""
output
=
self
.
block
.
create_var
(
...
...
@@ -2578,23 +2595,27 @@ class Variable(object):
def
_set_attr
(
self
,
name
,
val
):
"""
Set the value of attribute by attribute's name.
Args:
name(str): the attribute name.
val(int|str|list): the value of the attribute.
"""
self
.
_update_desc_attr
(
name
,
val
)
def
_has_attr
(
self
,
name
):
"""
Whether this Variable has the attribute with the name `name` or not.
Args:
name(str): the attribute name.
Returns:
bool: True if has this attribute.
bool, True if has this attribute.
"""
return
self
.
desc
.
has_attr
(
name
)
...
...
@@ -2624,7 +2645,7 @@ class Variable(object):
name(str): the attribute name.
Returns:
int|str|list
:
The attribute value. The return value
int|str|list
,
The attribute value. The return value
can be any valid attribute type.
"""
return
self
.
desc
.
attr
(
name
)
...
...
@@ -3196,14 +3217,16 @@ class Operator(object):
def
input
(
self
,
name
):
r
"""
Get the input arguments according to the input parameter name.
Args:
name(str): The input parameter name.
Returns:
list
:
return the list of argument names that associated with \
list
,
return the list of argument names that associated with \
the specific parameter name.
"""
return
self
.
desc
.
input
(
name
)
...
...
python/paddle/fluid/layers/metric_op.py
View file @
dbe08e9b
...
...
@@ -20,7 +20,13 @@ from __future__ import print_function
import
warnings
from
..layer_helper
import
LayerHelper
from
..initializer
import
Normal
,
Constant
from
..framework
import
Variable
,
_non_static_mode
,
_varbase_creator
,
_in_legacy_dygraph
,
in_dygraph_mode
from
..framework
import
(
Variable
,
_non_static_mode
,
_varbase_creator
,
_in_legacy_dygraph
,
in_dygraph_mode
,
)
from
..
import
core
from
..param_attr
import
ParamAttr
from
.
import
nn
...
...
@@ -33,22 +39,29 @@ __all__ = ['accuracy', 'auc']
def
accuracy
(
input
,
label
,
k
=
1
,
correct
=
None
,
total
=
None
):
"""
accuracy layer.
Refer to the https://en.wikipedia.org/wiki/Precision_and_recall
This function computes the accuracy using the input and label.
If the correct label occurs in top k predictions, then correct will increment by one.
Note: the dtype of accuracy is determined by input. the input and label dtype can be different.
Note:
the dtype of accuracy is determined by input. the input and label dtype can be different.
Args:
input(Tensor): The input of accuracy layer, which is the predictions of network. A Tensor with type float32,float64.
The shape is ``[sample_number, class_dim]`` .
label(Tensor): The label of dataset. Tensor with type int32,int64. The shape is ``[sample_number, 1]`` .
k(int): The top k predictions for each class will be checked. Data type is int64 or int32.
correct(Tensor): The correct predictions count. A Tensor with type int64 or int32.
total(Tensor): The total entries count. A tensor with type int64 or int32.
k(int, optional): The top k predictions for each class will be checked. Data type is int64 or int32. Default is 1.
correct(Tensor, optional): The correct predictions count. A Tensor with type int64 or int32. Default is None.
total(Tensor, optional): The total entries count. A tensor with type int64 or int32. Default is None.
Returns:
Tensor: The correct rate. A Tensor with type float32.
Tensor, The correct rate. A Tensor with type float32.
Examples:
.. code-block:: python
import numpy as np
import paddle
import paddle.static as static
...
...
@@ -68,6 +81,7 @@ def accuracy(input, label, k=1, correct=None, total=None):
fetch_list=[result[0]])
print(output)
#[array([0.], dtype=float32)]
"""
if
_non_static_mode
():
if
correct
is
None
:
...
...
@@ -76,15 +90,18 @@ def accuracy(input, label, k=1, correct=None, total=None):
total
=
_varbase_creator
(
dtype
=
"int32"
)
_k
=
k
.
numpy
().
item
(
0
)
if
isinstance
(
k
,
Variable
)
else
k
topk_out
,
topk_indices
=
_legacy_C_ops
.
top_k_v2
(
input
,
'k'
,
_k
,
'sorted'
,
False
)
_acc
,
_
,
_
=
_legacy_C_ops
.
accuracy
(
topk_out
,
topk_indices
,
label
,
correct
,
total
)
topk_out
,
topk_indices
=
_legacy_C_ops
.
top_k_v2
(
input
,
'k'
,
_k
,
'sorted'
,
False
)
_acc
,
_
,
_
=
_legacy_C_ops
.
accuracy
(
topk_out
,
topk_indices
,
label
,
correct
,
total
)
return
_acc
helper
=
LayerHelper
(
"accuracy"
,
**
locals
())
check_variable_and_dtype
(
input
,
'input'
,
[
'float16'
,
'float32'
,
'float64'
],
'accuracy'
)
check_variable_and_dtype
(
input
,
'input'
,
[
'float16'
,
'float32'
,
'float64'
],
'accuracy'
)
topk_out
=
helper
.
create_variable_for_type_inference
(
dtype
=
input
.
dtype
)
topk_indices
=
helper
.
create_variable_for_type_inference
(
dtype
=
"int64"
)
inputs
=
{
"X"
:
[
input
]}
...
...
@@ -93,39 +110,38 @@ def accuracy(input, label, k=1, correct=None, total=None):
else
:
attrs
=
{
'k'
:
k
}
attrs
[
'sorted'
]
=
False
helper
.
append_op
(
type
=
"top_k_v2"
,
helper
.
append_op
(
type
=
"top_k_v2"
,
inputs
=
inputs
,
attrs
=
attrs
,
outputs
=
{
"Out"
:
[
topk_out
],
"Indices"
:
[
topk_indices
]
})
outputs
=
{
"Out"
:
[
topk_out
],
"Indices"
:
[
topk_indices
]},
)
acc_out
=
helper
.
create_variable_for_type_inference
(
dtype
=
"float32"
)
if
correct
is
None
:
correct
=
helper
.
create_variable_for_type_inference
(
dtype
=
"int32"
)
if
total
is
None
:
total
=
helper
.
create_variable_for_type_inference
(
dtype
=
"int32"
)
helper
.
append_op
(
type
=
"accuracy"
,
inputs
=
{
"Out"
:
[
topk_out
],
"Indices"
:
[
topk_indices
],
"Label"
:
[
label
]
},
helper
.
append_op
(
type
=
"accuracy"
,
inputs
=
{
"Out"
:
[
topk_out
],
"Indices"
:
[
topk_indices
],
"Label"
:
[
label
]},
outputs
=
{
"Accuracy"
:
[
acc_out
],
"Correct"
:
[
correct
],
"Total"
:
[
total
],
})
},
)
return
acc_out
def
auc
(
input
,
def
auc
(
input
,
label
,
curve
=
'ROC'
,
num_thresholds
=
2
**
12
-
1
,
topk
=
1
,
slide_steps
=
1
,
ins_tag_weight
=
None
):
ins_tag_weight
=
None
,
):
"""
**Area Under the Curve (AUC) Layer**
...
...
@@ -216,13 +232,14 @@ def auc(input,
helper
=
LayerHelper
(
"auc"
,
**
locals
())
if
ins_tag_weight
is
None
:
ins_tag_weight
=
tensor
.
fill_constant
(
shape
=
[
1
,
1
],
dtype
=
"float32"
,
value
=
1.0
)
ins_tag_weight
=
tensor
.
fill_constant
(
shape
=
[
1
,
1
],
dtype
=
"float32"
,
value
=
1.0
)
check_variable_and_dtype
(
input
,
'input'
,
[
'float32'
,
'float64'
],
'auc'
)
check_variable_and_dtype
(
label
,
'label'
,
[
'int32'
,
'int64'
],
'auc'
)
check_variable_and_dtype
(
ins_tag_weight
,
'ins_tag_weight'
,
[
'float32'
,
'float64'
],
'auc'
)
check_variable_and_dtype
(
ins_tag_weight
,
'ins_tag_weight'
,
[
'float32'
,
'float64'
],
'auc'
)
auc_out
=
helper
.
create_variable_for_type_inference
(
dtype
=
"float64"
)
batch_auc_out
=
helper
.
create_variable_for_type_inference
(
dtype
=
"float64"
)
# make tp, tn, fp, fn persistable, so that can accumulate all batches.
...
...
@@ -236,62 +253,71 @@ def auc(input,
batch_stat_pos
=
helper
.
create_global_variable
(
persistable
=
True
,
dtype
=
'int64'
,
shape
=
[(
1
+
slide_steps
)
*
(
num_thresholds
+
1
)
+
1
])
shape
=
[(
1
+
slide_steps
)
*
(
num_thresholds
+
1
)
+
1
],
)
batch_stat_neg
=
helper
.
create_global_variable
(
persistable
=
True
,
dtype
=
'int64'
,
shape
=
[(
1
+
slide_steps
)
*
(
num_thresholds
+
1
)
+
1
])
shape
=
[(
1
+
slide_steps
)
*
(
num_thresholds
+
1
)
+
1
],
)
# for global auc
# Needn't maintain the batch id
stat_pos
=
helper
.
create_global_variable
(
persistable
=
True
,
dtype
=
'int64'
,
shape
=
[
1
,
num_thresholds
+
1
]
)
stat_neg
=
helper
.
create_global_variable
(
persistable
=
True
,
dtype
=
'int64'
,
shape
=
[
1
,
num_thresholds
+
1
]
)
stat_pos
=
helper
.
create_global_variable
(
persistable
=
True
,
dtype
=
'int64'
,
shape
=
[
1
,
num_thresholds
+
1
]
)
stat_neg
=
helper
.
create_global_variable
(
persistable
=
True
,
dtype
=
'int64'
,
shape
=
[
1
,
num_thresholds
+
1
]
)
for
var
in
[
batch_stat_pos
,
batch_stat_neg
,
stat_pos
,
stat_neg
]:
helper
.
set_variable_initializer
(
var
,
Constant
(
value
=
0.0
,
force_cpu
=
False
))
helper
.
set_variable_initializer
(
var
,
Constant
(
value
=
0.0
,
force_cpu
=
False
)
)
#"InsTagWeight": [ins_tag_weight]
#
"InsTagWeight": [ins_tag_weight]
# Batch AUC
helper
.
append_op
(
type
=
"auc"
,
helper
.
append_op
(
type
=
"auc"
,
inputs
=
{
"Predict"
:
[
input
],
"Label"
:
[
label
],
"StatPos"
:
[
batch_stat_pos
],
"StatNeg"
:
[
batch_stat_neg
]
"StatNeg"
:
[
batch_stat_neg
]
,
},
attrs
=
{
"curve"
:
curve
,
"num_thresholds"
:
num_thresholds
,
"slide_steps"
:
slide_steps
"slide_steps"
:
slide_steps
,
},
outputs
=
{
"AUC"
:
[
batch_auc_out
],
"StatPosOut"
:
[
batch_stat_pos
],
"StatNegOut"
:
[
batch_stat_neg
]
})
"StatNegOut"
:
[
batch_stat_neg
],
},
)
# Global AUC
helper
.
append_op
(
type
=
"auc"
,
helper
.
append_op
(
type
=
"auc"
,
inputs
=
{
"Predict"
:
[
input
],
"Label"
:
[
label
],
"StatPos"
:
[
stat_pos
],
"StatNeg"
:
[
stat_neg
]
"StatNeg"
:
[
stat_neg
]
,
},
attrs
=
{
"curve"
:
curve
,
"num_thresholds"
:
num_thresholds
,
"slide_steps"
:
0
"slide_steps"
:
0
,
},
outputs
=
{
"AUC"
:
[
auc_out
],
"StatPosOut"
:
[
stat_pos
],
"StatNegOut"
:
[
stat_neg
]
})
return
auc_out
,
batch_auc_out
,
[
batch_stat_pos
,
batch_stat_neg
,
stat_pos
,
stat_neg
]
"StatNegOut"
:
[
stat_neg
],
},
)
return
(
auc_out
,
batch_auc_out
,
[
batch_stat_pos
,
batch_stat_neg
,
stat_pos
,
stat_neg
],
)
python/paddle/fluid/layers/nn.py
View file @
dbe08e9b
This source diff could not be displayed because it is too large. You can
view the blob
instead.
python/paddle/fluid/tests/.gitignore
0 → 100644
View file @
dbe08e9b
image/
fit_a_line.model/
tmp
cuda_profiler.txt
python/paddle/fluid/tests/book/.gitignore
0 → 100644
View file @
dbe08e9b
*.inference.model
python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_c_setup.py
View file @
dbe08e9b
...
...
@@ -13,18 +13,19 @@
# limitations under the License.
import
os
from
paddle.fluid
import
core
from
distutils.sysconfig
import
get_python_lib
from
distutils.core
import
setup
,
Extension
from
setuptools
import
Extension
,
setup
from
setuptools.command.build_ext
import
build_ext
from
paddle.fluid
import
core
# refer: https://note.qidong.name/2018/03/setup-warning-strict-prototypes
# Avoid a gcc warning below:
# cc1plus: warning: command line option ‘-Wstrict-prototypes’ is valid
# for C/ObjC but not for C++
class
BuildExt
(
build_ext
):
def
build_extensions
(
self
):
if
'-Wstrict-prototypes'
in
self
.
compiler
.
compiler_so
:
self
.
compiler
.
compiler_so
.
remove
(
'-Wstrict-prototypes'
)
...
...
@@ -48,8 +49,9 @@ paddle_custom_kernel_include = [
os
.
path
.
join
(
site_packages_path
,
'paddle'
,
'include'
),
]
# include path third_party
compile_third_party_path
=
os
.
path
.
join
(
os
.
environ
[
'PADDLE_BINARY_DIR'
],
'third_party'
)
compile_third_party_path
=
os
.
path
.
join
(
os
.
environ
[
'PADDLE_BINARY_DIR'
],
'third_party'
)
paddle_custom_kernel_include
+=
[
os
.
path
.
join
(
compile_third_party_path
,
'install/gflags/include'
),
# gflags
os
.
path
.
join
(
compile_third_party_path
,
'install/glog/include'
),
# glog
...
...
@@ -69,10 +71,13 @@ custom_kernel_dot_module = Extension(
include_dirs
=
paddle_custom_kernel_include
,
library_dirs
=
paddle_custom_kernel_library_dir
,
libraries
=
libs
,
extra_compile_args
=
paddle_extra_compile_args
)
extra_compile_args
=
paddle_extra_compile_args
,
)
setup
(
name
=
'custom_kernel_dot_c'
,
setup
(
name
=
'custom_kernel_dot_c'
,
version
=
'1.0'
,
description
=
'custom kernel fot compiling'
,
cmdclass
=
{
'build_ext'
:
BuildExt
},
ext_modules
=
[
custom_kernel_dot_module
])
ext_modules
=
[
custom_kernel_dot_module
],
)
python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py
View file @
dbe08e9b
...
...
@@ -14,18 +14,17 @@
import
os
import
site
from
paddle.fluid
import
core
from
distutils.sysconfig
import
get_python_lib
from
distutils.core
import
setup
,
Extension
from
setuptools
import
Extension
,
setup
from
setuptools.command.build_ext
import
build_ext
from
paddle.fluid
import
core
# refer: https://note.qidong.name/2018/03/setup-warning-strict-prototypes
# Avoid a gcc warning below:
# cc1plus: warning: command line option ‘-Wstrict-prototypes’ is valid
# for C/ObjC but not for C++
class
BuildExt
(
build_ext
):
def
build_extensions
(
self
):
if
'-Wstrict-prototypes'
in
self
.
compiler
.
compiler_so
:
self
.
compiler
.
compiler_so
.
remove
(
'-Wstrict-prototypes'
)
...
...
@@ -46,12 +45,15 @@ if core.is_compiled_with_npu():
# include path
site_packages_path
=
site
.
getsitepackages
()
paddle_custom_kernel_include
=
list
(
map
(
lambda
path
:
os
.
path
.
join
(
path
,
'paddle'
,
'include'
),
site_packages_path
))
map
(
lambda
path
:
os
.
path
.
join
(
path
,
'paddle'
,
'include'
),
site_packages_path
)
)
# include path third_party
compile_third_party_path
=
os
.
path
.
join
(
os
.
environ
[
'PADDLE_BINARY_DIR'
],
'third_party'
)
compile_third_party_path
=
os
.
path
.
join
(
os
.
environ
[
'PADDLE_BINARY_DIR'
],
'third_party'
)
paddle_custom_kernel_include
+=
[
os
.
path
.
join
(
compile_third_party_path
,
'install/gflags/include'
),
# gflags
os
.
path
.
join
(
compile_third_party_path
,
'install/glog/include'
),
# glog
...
...
@@ -59,7 +61,8 @@ paddle_custom_kernel_include += [
# libs path
paddle_custom_kernel_library_dir
=
list
(
map
(
lambda
path
:
os
.
path
.
join
(
path
,
'paddle'
,
'fluid'
),
site_packages_path
))
map
(
lambda
path
:
os
.
path
.
join
(
path
,
'paddle'
,
'fluid'
),
site_packages_path
)
)
# libs
libs
=
[
':libpaddle.so'
]
...
...
@@ -70,10 +73,13 @@ custom_kernel_dot_module = Extension(
include_dirs
=
paddle_custom_kernel_include
,
library_dirs
=
paddle_custom_kernel_library_dir
,
libraries
=
libs
,
extra_compile_args
=
paddle_extra_compile_args
)
extra_compile_args
=
paddle_extra_compile_args
,
)
setup
(
name
=
'custom_kernel_dot'
,
setup
(
name
=
'custom_kernel_dot'
,
version
=
'1.0'
,
description
=
'custom kernel fot compiling'
,
cmdclass
=
{
'build_ext'
:
BuildExt
},
ext_modules
=
[
custom_kernel_dot_module
])
ext_modules
=
[
custom_kernel_dot_module
],
)
python/paddle/fluid/tests/custom_op/custom_relu_op.cu
View file @
dbe08e9b
...
...
@@ -44,7 +44,7 @@ __global__ void relu_cuda_double_backward_kernel(const data_t* out_data,
data_t
*
ddout_data
,
int64_t
num
)
{
int64_t
gid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
for
(
int64_t
i
=
num
;
i
<
num
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int64_t
i
=
gid
;
i
<
num
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
ddout_data
[
i
]
=
ddx_data
[
i
]
*
(
out_data
[
i
]
>
static_cast
<
data_t
>
(
0.
)
?
static_cast
<
data_t
>
(
1.
)
:
static_cast
<
data_t
>
(
0.
));
...
...
python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py
View file @
dbe08e9b
...
...
@@ -21,6 +21,7 @@ import paddle.static as static
import
tempfile
import
subprocess
import
numpy
as
np
from
paddle
import
fluid
from
paddle.vision.transforms
import
Compose
,
Normalize
from
paddle.utils.cpp_extension.extension_utils
import
run_cmd
from
paddle.fluid.framework
import
_test_eager_guard
...
...
@@ -43,12 +44,9 @@ def custom_relu_dynamic(func, device, dtype, np_x, use_func=True):
return
out
.
numpy
(),
t
.
grad
.
numpy
()
def
custom_relu_static
(
func
,
device
,
dtype
,
np_x
,
use_func
=
True
,
test_infer
=
False
):
def
custom_relu_static
(
func
,
device
,
dtype
,
np_x
,
use_func
=
True
,
test_infer
=
False
):
paddle
.
enable_static
()
paddle
.
set_device
(
device
)
...
...
@@ -62,9 +60,11 @@ def custom_relu_static(func,
exe
=
static
.
Executor
()
exe
.
run
(
static
.
default_startup_program
())
# in static mode, x data has been covered by out
out_v
=
exe
.
run
(
static
.
default_main_program
(),
out_v
=
exe
.
run
(
static
.
default_main_program
(),
feed
=
{
'X'
:
np_x
},
fetch_list
=
[
out
.
name
])
fetch_list
=
[
out
.
name
],
)
paddle
.
disable_static
()
return
out_v
...
...
@@ -87,11 +87,11 @@ def custom_relu_static_pe(func, device, dtype, np_x, use_func=True):
# in static mode, x data has been covered by out
compiled_prog
=
static
.
CompiledProgram
(
static
.
default_main_program
()
).
with_data_parallel
(
loss_name
=
out
.
name
,
places
=
places
)
out_v
=
exe
.
run
(
compiled_prog
,
feed
=
{
'X'
:
np_x
},
fetch_list
=
[
out
.
name
]
)
static
.
default_main_program
()
).
with_data_parallel
(
loss_name
=
out
.
name
,
places
=
places
)
out_v
=
exe
.
run
(
compiled_prog
,
feed
=
{
'X'
:
np_x
},
fetch_list
=
[
out
.
name
]
)
paddle
.
disable_static
()
return
out_v
...
...
@@ -103,9 +103,9 @@ def custom_relu_static_inference(func, device, np_data, np_label, path_prefix):
with
static
.
scope_guard
(
static
.
Scope
()):
with
static
.
program_guard
(
static
.
Program
()):
# simple module
data
=
static
.
data
(
name
=
'data'
,
shape
=
[
None
,
1
,
28
,
28
],
dtype
=
'float32'
)
data
=
static
.
data
(
name
=
'data'
,
shape
=
[
None
,
1
,
28
,
28
],
dtype
=
'float32'
)
label
=
static
.
data
(
name
=
'label'
,
shape
=
[
None
,
1
],
dtype
=
'int64'
)
hidden
=
static
.
nn
.
fc
(
data
,
size
=
128
)
...
...
@@ -124,23 +124,21 @@ def custom_relu_static_inference(func, device, np_data, np_label, path_prefix):
# train
for
i
in
range
(
4
):
avg_loss_v
=
exe
.
run
(
static
.
default_main_program
(),
feed
=
{
'data'
:
np_data
,
'label'
:
np_label
},
fetch_list
=
[
avg_loss
])
avg_loss_v
=
exe
.
run
(
static
.
default_main_program
(),
feed
=
{
'data'
:
np_data
,
'label'
:
np_label
},
fetch_list
=
[
avg_loss
],
)
# save inference model
static
.
save_inference_model
(
path_prefix
,
[
data
],
[
predict
],
exe
)
# get train predict value
predict_v
=
exe
.
run
(
static
.
default_main_program
(),
feed
=
{
'data'
:
np_data
,
'label'
:
np_label
},
fetch_list
=
[
predict
])
predict_v
=
exe
.
run
(
static
.
default_main_program
(),
feed
=
{
'data'
:
np_data
,
'label'
:
np_label
},
fetch_list
=
[
predict
],
)
return
predict_v
...
...
@@ -151,30 +149,37 @@ def custom_relu_double_grad_dynamic(func, device, dtype, np_x, use_func=True):
t
=
paddle
.
to_tensor
(
np_x
,
dtype
=
dtype
,
stop_gradient
=
False
)
out
=
func
(
t
)
if
use_func
else
paddle
.
nn
.
functional
.
relu
(
t
)
out
.
stop_gradient
=
False
dx
=
paddle
.
grad
(
outputs
=
[
out
]
,
inputs
=
[
t
]
,
dx
=
paddle
.
grad
(
outputs
=
out
,
inputs
=
t
,
grad_outputs
=
paddle
.
ones_like
(
t
)
,
create_graph
=
True
,
retain_graph
=
True
)
retain_graph
=
True
,
)
dx
[
0
].
backward
()
ddout
=
paddle
.
grad
(
outputs
=
dx
[
0
],
inputs
=
out
.
grad
,
grad_outputs
=
paddle
.
ones_like
(
t
),
create_graph
=
False
,
)
assert
d
x
[
0
].
grad
is
not
None
return
dx
[
0
].
numpy
(),
d
x
[
0
].
grad
.
numpy
()
assert
d
dout
[
0
].
numpy
()
is
not
None
return
dx
[
0
].
numpy
(),
d
dout
[
0
]
.
numpy
()
class
TestNewCustomOpSetUpInstall
(
unittest
.
TestCase
):
def
setUp
(
self
):
cur_dir
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
# compile, install the custom op egg into site-packages under background
if
os
.
name
==
'nt'
:
cmd
=
'cd /d {} && python custom_relu_setup.py install'
.
format
(
cur_dir
)
cur_dir
)
else
:
cmd
=
'cd {} && {} custom_relu_setup.py install'
.
format
(
cur_dir
,
sys
.
executable
)
cur_dir
,
sys
.
executable
)
run_cmd
(
cmd
)
# NOTE(Aurelius84): Normally, it's no need to add following codes for users.
...
...
@@ -190,16 +195,18 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase):
custom_egg_path
=
[
x
for
x
in
os
.
listdir
(
site_dir
)
if
'custom_relu_module_setup'
in
x
]
assert
len
(
custom_egg_path
)
==
1
,
"Matched egg number is %d."
%
len
(
custom_egg_path
)
assert
len
(
custom_egg_path
)
==
1
,
"Matched egg number is %d."
%
len
(
custom_egg_path
)
sys
.
path
.
append
(
os
.
path
.
join
(
site_dir
,
custom_egg_path
[
0
]))
# usage: import the package directly
import
custom_relu_module_setup
# `custom_relu_dup` is same as `custom_relu_dup`
self
.
custom_ops
=
[
custom_relu_module_setup
.
custom_relu
,
custom_relu_module_setup
.
custom_relu_dup
custom_relu_module_setup
.
custom_relu_dup
,
]
self
.
dtypes
=
[
'float32'
,
'float64'
]
...
...
@@ -222,13 +229,16 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase):
x
=
np
.
random
.
uniform
(
-
1
,
1
,
[
4
,
8
]).
astype
(
dtype
)
for
custom_op
in
self
.
custom_ops
:
out
=
custom_relu_static
(
custom_op
,
device
,
dtype
,
x
)
pd_out
=
custom_relu_static
(
custom_op
,
device
,
dtype
,
x
,
False
)
pd_out
=
custom_relu_static
(
custom_op
,
device
,
dtype
,
x
,
False
)
np
.
testing
.
assert_array_equal
(
out
,
pd_out
,
err_msg
=
'custom op out: {},
\n
paddle api out: {}'
.
format
(
out
,
pd_out
))
err_msg
=
'custom op out: {},
\n
paddle api out: {}'
.
format
(
out
,
pd_out
),
)
def
test_static_pe
(
self
):
for
device
in
self
.
devices
:
...
...
@@ -238,13 +248,16 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase):
x
=
np
.
random
.
uniform
(
-
1
,
1
,
[
4
,
8
]).
astype
(
dtype
)
for
custom_op
in
self
.
custom_ops
:
out
=
custom_relu_static_pe
(
custom_op
,
device
,
dtype
,
x
)
pd_out
=
custom_relu_static_pe
(
custom_op
,
device
,
dtype
,
x
,
False
)
pd_out
=
custom_relu_static_pe
(
custom_op
,
device
,
dtype
,
x
,
False
)
np
.
testing
.
assert_array_equal
(
out
,
pd_out
,
err_msg
=
'custom op out: {},
\n
paddle api out: {}'
.
format
(
out
,
pd_out
))
err_msg
=
'custom op out: {},
\n
paddle api out: {}'
.
format
(
out
,
pd_out
),
)
def
func_dynamic
(
self
):
for
device
in
self
.
devices
:
...
...
@@ -253,20 +266,26 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase):
continue
x
=
np
.
random
.
uniform
(
-
1
,
1
,
[
4
,
8
]).
astype
(
dtype
)
for
custom_op
in
self
.
custom_ops
:
out
,
x_grad
=
custom_relu_dynamic
(
custom_op
,
device
,
dtype
,
x
)
out
,
x_grad
=
custom_relu_dynamic
(
custom_op
,
device
,
dtype
,
x
)
pd_out
,
pd_x_grad
=
custom_relu_dynamic
(
custom_op
,
device
,
dtype
,
x
,
False
)
custom_op
,
device
,
dtype
,
x
,
False
)
np
.
testing
.
assert_array_equal
(
out
,
pd_out
,
err_msg
=
'custom op out: {},
\n
paddle api out: {}'
.
format
(
out
,
pd_out
))
err_msg
=
'custom op out: {},
\n
paddle api out: {}'
.
format
(
out
,
pd_out
),
)
np
.
testing
.
assert_array_equal
(
x_grad
,
pd_x_grad
,
err_msg
=
'custom op x grad: {},
\n
paddle api x grad: {}'
.
format
(
x_grad
,
pd_x_grad
))
err_msg
=
'custom op x grad: {},
\n
paddle api x grad: {}'
.
format
(
x_grad
,
pd_x_grad
),
)
def
test_dynamic
(
self
):
with
_test_eager_guard
():
...
...
@@ -279,22 +298,29 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase):
np_label
=
np
.
random
.
random
((
1
,
1
)).
astype
(
"int64"
)
path_prefix
=
"custom_op_inference/custom_relu"
for
device
in
self
.
devices
:
predict
=
custom_relu_static_inference
(
self
.
custom_ops
[
0
],
device
,
np_data
,
np_label
,
path_prefix
)
predict
=
custom_relu_static_inference
(
self
.
custom_ops
[
0
],
device
,
np_data
,
np_label
,
path_prefix
)
# load inference model
with
static
.
scope_guard
(
static
.
Scope
()):
exe
=
static
.
Executor
()
[
inference_program
,
feed_target_names
,
fetch_targets
]
=
static
.
load_inference_model
(
path_prefix
,
exe
)
predict_infer
=
exe
.
run
(
inference_program
,
[
inference_program
,
feed_target_names
,
fetch_targets
,
]
=
static
.
load_inference_model
(
path_prefix
,
exe
)
predict_infer
=
exe
.
run
(
inference_program
,
feed
=
{
feed_target_names
[
0
]:
np_data
},
fetch_list
=
fetch_targets
)
fetch_list
=
fetch_targets
,
)
np
.
testing
.
assert_array_equal
(
predict
,
predict_infer
,
err_msg
=
'custom op predict: {},
\n
custom op infer predict: {}'
.
format
(
predict
,
predict_infer
))
err_msg
=
'custom op predict: {},
\n
custom op infer predict: {}'
.
format
(
predict
,
predict_infer
),
)
paddle
.
disable_static
()
def
test_static_save_and_run_inference_predictor
(
self
):
...
...
@@ -304,62 +330,80 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase):
path_prefix
=
"custom_op_inference/custom_relu"
from
paddle.inference
import
Config
from
paddle.inference
import
create_predictor
for
device
in
self
.
devices
:
predict
=
custom_relu_static_inference
(
self
.
custom_ops
[
0
],
device
,
np_data
,
np_label
,
path_prefix
)
predict
=
custom_relu_static_inference
(
self
.
custom_ops
[
0
],
device
,
np_data
,
np_label
,
path_prefix
)
# load inference model
config
=
Config
(
path_prefix
+
".pdmodel"
,
path_prefix
+
".pdiparams"
)
config
=
Config
(
path_prefix
+
".pdmodel"
,
path_prefix
+
".pdiparams"
)
predictor
=
create_predictor
(
config
)
input_tensor
=
predictor
.
get_input_handle
(
predictor
.
get_input_names
()[
0
])
predictor
.
get_input_names
()[
0
]
)
input_tensor
.
reshape
(
np_data
.
shape
)
input_tensor
.
copy_from_cpu
(
np_data
.
copy
())
predictor
.
run
()
output_tensor
=
predictor
.
get_output_handle
(
predictor
.
get_output_names
()[
0
])
predictor
.
get_output_names
()[
0
]
)
predict_infer
=
output_tensor
.
copy_to_cpu
()
self
.
assertTrue
(
np
.
isclose
(
predict
,
predict_infer
,
rtol
=
5e-5
).
any
(),
"custom op predict: {},
\n
custom op infer predict: {}"
.
format
(
predict
,
predict_infer
))
predict
,
predict_infer
),
)
paddle
.
disable_static
()
def
test_func_double_grad_dynamic
(
self
):
def
test_double_grad_dynamic
(
self
):
fluid
.
set_flags
({
"FLAGS_retain_grad_for_all_tensor"
:
True
})
for
device
in
self
.
devices
:
for
dtype
in
self
.
dtypes
:
if
device
==
'cpu'
and
dtype
==
'float16'
:
continue
x
=
np
.
random
.
uniform
(
-
1
,
1
,
[
4
,
8
]).
astype
(
dtype
)
out
,
dx_grad
=
custom_relu_double_grad_dynamic
(
self
.
custom_ops
[
0
],
device
,
dtype
,
x
)
self
.
custom_ops
[
0
],
device
,
dtype
,
x
)
pd_out
,
pd_dx_grad
=
custom_relu_double_grad_dynamic
(
self
.
custom_ops
[
0
],
device
,
dtype
,
x
,
False
)
self
.
custom_ops
[
0
],
device
,
dtype
,
x
,
False
)
np
.
testing
.
assert_array_equal
(
out
,
pd_out
,
err_msg
=
'custom op out: {},
\n
paddle api out: {}'
.
format
(
out
,
pd_out
))
out
,
pd_out
),
)
np
.
testing
.
assert_array_equal
(
dx_grad
,
pd_dx_grad
,
err_msg
=
'custom op dx grad: {},
\n
paddle api dx grad: {}'
.
format
(
dx_grad
,
pd_dx_grad
))
err_msg
=
'custom op dx grad: {},
\n
paddle api dx grad: {}'
.
format
(
dx_grad
,
pd_dx_grad
),
)
fluid
.
set_flags
({
"FLAGS_retain_grad_for_all_tensor"
:
False
})
def
test_with_dataloader
(
self
):
for
device
in
self
.
devices
:
paddle
.
set_device
(
device
)
# data loader
transform
=
Compose
(
[
Normalize
(
mean
=
[
127.5
],
std
=
[
127.5
],
data_format
=
'CHW'
)])
train_dataset
=
paddle
.
vision
.
datasets
.
MNIST
(
mode
=
'train'
,
transform
=
transform
)
train_loader
=
paddle
.
io
.
DataLoader
(
train_dataset
,
[
Normalize
(
mean
=
[
127.5
],
std
=
[
127.5
],
data_format
=
'CHW'
)]
)
train_dataset
=
paddle
.
vision
.
datasets
.
MNIST
(
mode
=
'train'
,
transform
=
transform
)
train_loader
=
paddle
.
io
.
DataLoader
(
train_dataset
,
batch_size
=
64
,
shuffle
=
True
,
drop_last
=
True
,
num_workers
=
0
)
num_workers
=
0
,
)
for
batch_id
,
(
image
,
_
)
in
enumerate
(
train_loader
()):
out
=
self
.
custom_ops
[
0
](
image
)
...
...
@@ -368,7 +412,9 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase):
out
,
pd_out
,
err_msg
=
'custom op out: {},
\n
paddle api out: {}'
.
format
(
out
,
pd_out
))
out
,
pd_out
),
)
if
batch_id
==
5
:
break
...
...
python/paddle/fluid/tests/unittests/dygraph_to_static/test_backward_without_params.py
0 → 100644
View file @
dbe08e9b
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
unittest
import
numpy
as
np
import
paddle
class Net(paddle.nn.Layer):
    """A minimal parameter-free Layer: forward simply adds 1 to its input.

    Used to exercise backward() through a to_static-converted forward when
    the network owns no trainable parameters.
    """

    def __init__(self):
        super().__init__()

    @paddle.jit.to_static
    def forward(self, x):
        # Elementwise add of a scalar constant; no parameters involved.
        return x + 1
class TestBackwardWithoutParams(unittest.TestCase):
    """Check that gradients flow back to the input of a parameter-free net."""

    def test_run(self):
        model = Net()
        x = paddle.ones([2, 2])
        x.stop_gradient = False

        y = model(x)
        loss = paddle.mean(y)
        loss.backward()

        # mean over 4 elements of (x + 1) => d(loss)/dx is 1/4 everywhere.
        expected = np.full(x.shape, 0.25)
        np.testing.assert_equal(x.grad.numpy(), expected)
if __name__ == '__main__':
    # Run the test suite when executed as a script.
    unittest.main()
python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py
View file @
dbe08e9b
...
...
@@ -292,7 +292,6 @@ def for_tuple_as_enumerate_value(x_array):
# 20. test for function in a class
class
ForwardContainsForLayer
(
paddle
.
nn
.
Layer
):
def
__init__
(
self
):
super
(
ForwardContainsForLayer
,
self
).
__init__
()
self
.
high
=
5
...
...
@@ -328,8 +327,8 @@ def for_original_tuple():
# 23. for zip error
@
paddle
.
jit
.
to_static
(
input_spec
=
[
InputSpec
(
shape
=
[
None
,
10
]),
InputSpec
(
shape
=
[
None
,
10
])]
)
input_spec
=
[
InputSpec
(
shape
=
[
None
,
10
]),
InputSpec
(
shape
=
[
None
,
10
])]
)
def
for_zip_error
(
x
,
y
):
for
i
,
j
in
zip
(
x
,
y
):
a
=
i
+
j
...
...
@@ -338,8 +337,8 @@ def for_zip_error(x, y):
# 24. for zip
@
paddle
.
jit
.
to_static
(
input_spec
=
[
InputSpec
(
shape
=
[
2
,
10
]),
InputSpec
(
shape
=
[
2
,
10
])]
)
input_spec
=
[
InputSpec
(
shape
=
[
2
,
10
]),
InputSpec
(
shape
=
[
2
,
10
])]
)
def
for_zip
(
x
,
y
):
for
i
,
j
in
zip
(
x
,
y
):
a
=
i
+
j
...
...
@@ -347,10 +346,12 @@ def for_zip(x, y):
class
TestTransformBase
(
unittest
.
TestCase
):
def
setUp
(
self
):
self
.
place
=
fluid
.
CUDAPlace
(
0
)
if
fluid
.
is_compiled_with_cuda
()
else
fluid
.
CPUPlace
()
self
.
place
=
(
fluid
.
CUDAPlace
(
0
)
if
fluid
.
is_compiled_with_cuda
()
else
fluid
.
CPUPlace
()
)
self
.
set_input
()
self
.
set_test_func
()
...
...
@@ -359,7 +360,8 @@ class TestTransformBase(unittest.TestCase):
def
set_test_func
(
self
):
raise
NotImplementedError
(
"For Enumerate test should implement set_test_func"
)
"For Enumerate test should implement set_test_func"
)
def
_run
(
self
,
to_static
):
program_translator
.
enable
(
to_static
)
...
...
@@ -374,22 +376,21 @@ class TestTransformBase(unittest.TestCase):
class TestTransform(TestTransformBase):
    def transformed_result_compare(self):
        """Assert dygraph and static-graph outputs agree elementwise."""
        dygraph_results = self.get_dygraph_output()
        if not isinstance(dygraph_results, (tuple, list)):
            dygraph_results = (dygraph_results,)

        self.dygraph_func.eval()
        static_results = self.get_static_output()
        if not isinstance(static_results, (tuple, list)):
            static_results = (static_results,)

        for dy_res, st_res in zip(dygraph_results, static_results):
            np.testing.assert_allclose(
                dy_res.numpy(), st_res.numpy(), rtol=1e-05
            )
class
TestTransformForOriginalList
(
TestTransform
):
def
_run
(
self
,
to_static
):
program_translator
.
enable
(
to_static
)
with
fluid
.
dygraph
.
guard
():
...
...
@@ -397,7 +398,6 @@ class TestTransformForOriginalList(TestTransform):
class
TestTransformError
(
TestTransformBase
):
def
transformed_error
(
self
,
etype
):
with
self
.
assertRaises
(
etype
):
dy_out
=
self
.
get_dygraph_output
()
...
...
@@ -405,7 +405,6 @@ class TestTransformError(TestTransformBase):
class
TestForInRange
(
TestTransform
):
    def set_input(self):
        # Test input: a one-element numpy array.
        self.input = np.array([5])
...
...
@@ -417,7 +416,6 @@ class TestForInRange(TestTransform):
class
TestForIterList
(
TestTransform
):
    def set_test_func(self):
        # Select the dygraph function exercised by this case.
        self.dygraph_func = for_iter_list
...
...
@@ -426,19 +424,16 @@ class TestForIterList(TestTransform):
class TestForEnumerateSimple(TestForIterList):
    def set_test_func(self):
        # Select the dygraph function exercised by this case.
        self.dygraph_func = for_enumerate_list
class TestForInRangeWithBreak(TestForInRange):
    def set_test_func(self):
        # Select the dygraph function exercised by this case.
        self.dygraph_func = for_in_range_with_break
class
TestForIterVarNumpy
(
TestTransform
):
    def set_input(self):
        # Test input: a five-element numpy vector.
        self.input = np.array([1, 2, 3, 4, 5])
...
...
@@ -450,103 +445,86 @@ class TestForIterVarNumpy(TestTransform):
class TestForEnumerateVarNumpy(TestForIterVarNumpy):
    def set_test_func(self):
        # Select the dygraph function exercised by this case.
        self.dygraph_func = for_enumerate_var_numpy
class TestForEnumerateVarNumpyWithStart(TestForIterVarNumpy):
    def set_test_func(self):
        # Select the dygraph function exercised by this case.
        self.dygraph_func = for_enumerate_var_numpy_with_start
class TestForEnumerateVarNumpyWithBreak(TestForIterVarNumpy):
    def set_test_func(self):
        # Select the dygraph function exercised by this case.
        self.dygraph_func = for_enumerate_var_numpy_with_break
class TestForEnumerateVarNumpyWithContinue(TestForIterVarNumpy):
    def set_test_func(self):
        # Select the dygraph function exercised by this case.
        self.dygraph_func = for_enumerate_var_numpy_with_continue
class TestForEnumerateVarNumpyWithStartAndBreak(TestForIterVarNumpy):
    def set_test_func(self):
        # Select the dygraph function exercised by this case.
        self.dygraph_func = for_enumerate_var_numpy_with_start_break
class TestForEnumerateVarNumpyWithStartAndContinue(TestForIterVarNumpy):
    def set_test_func(self):
        # Select the dygraph function exercised by this case.
        self.dygraph_func = for_enumerate_var_numpy_with_start_continue
class TestForIterVar(TestForIterVarNumpy):
    def set_test_func(self):
        # Select the dygraph function exercised by this case.
        self.dygraph_func = for_iter_var
class TestForIterVarIdx(TestForIterVarNumpy):
    def set_test_func(self):
        # Select the dygraph function exercised by this case.
        self.dygraph_func = for_iter_var_idx
class TestForEnumerateVar(TestForIterVarNumpy):
    def set_test_func(self):
        # Select the dygraph function exercised by this case.
        self.dygraph_func = for_enumerate_var
class TestForEnumerateVarWithNestedRange(TestForIterVarNumpy):
    def set_test_func(self):
        # Select the dygraph function exercised by this case.
        self.dygraph_func = for_enumerate_var_with_nested_range
class TestForIterVarList(TestForInRange):
    def set_test_func(self):
        # Select the dygraph function exercised by this case.
        self.dygraph_func = for_iter_var_list
class TestForEnumerateVarList(TestForInRange):
    def set_test_func(self):
        # Select the dygraph function exercised by this case.
        self.dygraph_func = for_enumerate_var_list
class TestForTupleAsIterVar(TestForIterVarNumpy):
    def set_test_func(self):
        # Select the dygraph function exercised by this case.
        self.dygraph_func = for_tuple_as_iter_var
class TestForTupleAsEnumerateIter(TestForIterVarNumpy):
    def set_test_func(self):
        # Select the dygraph function exercised by this case.
        self.dygraph_func = for_tuple_as_enumerate_iter
class TestForTupleAsEnumerateValue(TestForIterVarNumpy):
    def set_test_func(self):
        # Select the dygraph function exercised by this case.
        self.dygraph_func = for_tuple_as_enumerate_value
class TestForwardContainsForLayer(TestForIterVarNumpy):
    def set_test_func(self):
        # Callable under test is a ForwardContainsForLayer instance
        # (invoked like the plain functions in the other cases).
        self.dygraph_func = ForwardContainsForLayer()
class
TestForOriginalList
(
TestTransformForOriginalList
):
    def set_test_func(self):
        # Select the dygraph function exercised by this case.
        self.dygraph_func = for_original_list
...
...
@@ -555,7 +533,6 @@ class TestForOriginalList(TestTransformForOriginalList):
class
TestForOriginalTuple
(
TestTransformForOriginalList
):
    def set_test_func(self):
        # Select the dygraph function exercised by this case.
        self.dygraph_func = for_original_tuple
...
...
@@ -564,7 +541,6 @@ class TestForOriginalTuple(TestTransformForOriginalList):
class
TestForZip
(
unittest
.
TestCase
):
def
setUp
(
self
):
self
.
temp_dir
=
tempfile
.
TemporaryDirectory
()
...
...
python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_activation.py
View file @
dbe08e9b
...
...
@@ -22,12 +22,10 @@ from typing import Optional, List, Callable, Dict, Any, Set
class
TrtConvertActivationTest
(
TrtLayerAutoScanTest
):
    def is_program_valid(self, program_config: ProgramConfig) -> bool:
        # Every sampled program configuration is accepted for this converter.
        return True
def
sample_program_configs
(
self
):
def
generate_input1
(
dims
,
batch
,
attrs
:
List
[
Dict
[
str
,
Any
]]):
if
dims
==
1
:
return
np
.
random
.
random
([
32
]).
astype
(
np
.
float32
)
...
...
@@ -41,11 +39,19 @@ class TrtConvertActivationTest(TrtLayerAutoScanTest):
for
dims
in
[
1
,
2
,
3
,
4
]:
for
batch
in
[
1
,
4
]:
for
op_type
in
[
"relu"
,
"sigmoid"
,
"tanh"
,
"relu6"
,
"elu"
,
"selu"
,
"softsign"
,
"stanh"
,
"thresholded_relu"
,
"softplus"
"relu"
,
"sigmoid"
,
"tanh"
,
"relu6"
,
"elu"
,
"selu"
,
"softsign"
,
"stanh"
,
"thresholded_relu"
,
"softplus"
,
]:
# few samples to reduce time
#for beta in [-0.2, 0.5, 0.67, 3]:
#
for beta in [-0.2, 0.5, 0.67, 3]:
# for alpha in [-0.2, 0.5, 0.67, 3]:
for
beta
in
[
0.67
]:
for
alpha
in
[
0.67
]:
...
...
@@ -62,33 +68,34 @@ class TrtConvertActivationTest(TrtLayerAutoScanTest):
if
op_type
==
"softplus"
:
dics
=
[{
"beta"
:
beta
}]
ops_config
=
[{
ops_config
=
[
{
"op_type"
:
op_type
,
"op_inputs"
:
{
"X"
:
[
"input_data"
]
},
"op_outputs"
:
{
"Out"
:
[
"output_data"
]
},
"op_attrs"
:
dics
[
0
]
}]
"op_inputs"
:
{
"X"
:
[
"input_data"
]},
"op_outputs"
:
{
"Out"
:
[
"output_data"
]},
"op_attrs"
:
dics
[
0
],
}
]
ops
=
self
.
generate_op_config
(
ops_config
)
program_config
=
ProgramConfig
(
ops
=
ops
,
weights
=
{},
inputs
=
{
"input_data"
:
TensorConfig
(
data_gen
=
partial
(
generate_input1
,
dims
,
batch
,
dics
))
"input_data"
:
TensorConfig
(
data_gen
=
partial
(
generate_input1
,
dims
,
batch
,
dics
)
)
},
outputs
=
[
"output_data"
])
outputs
=
[
"output_data"
],
)
yield
program_config
def
sample_predictor_configs
(
self
,
program_config
)
->
(
paddle_infer
.
Config
,
List
[
int
],
float
):
self
,
program_config
)
->
(
paddle_infer
.
Config
,
List
[
int
],
float
):
def
generate_dynamic_shape
(
attrs
):
if
self
.
dims
==
1
:
self
.
dynamic_shape
.
min_input_shape
=
{
"input_data"
:
[
1
]}
...
...
@@ -131,19 +138,23 @@ class TrtConvertActivationTest(TrtLayerAutoScanTest):
clear_dynamic_shape
()
self
.
trt_param
.
precision
=
paddle_infer
.
PrecisionType
.
Float32
yield
self
.
create_inference_config
(),
generate_trt_nodes_num
(
attrs
,
False
),
1e-5
attrs
,
False
),
1e-5
self
.
trt_param
.
precision
=
paddle_infer
.
PrecisionType
.
Half
yield
self
.
create_inference_config
(),
generate_trt_nodes_num
(
attrs
,
False
),
1e-5
attrs
,
False
),
1e-3
# for dynamic_shape
generate_dynamic_shape
(
attrs
)
self
.
trt_param
.
precision
=
paddle_infer
.
PrecisionType
.
Float32
yield
self
.
create_inference_config
(),
generate_trt_nodes_num
(
attrs
,
True
),
1e-5
attrs
,
True
),
1e-5
self
.
trt_param
.
precision
=
paddle_infer
.
PrecisionType
.
Half
yield
self
.
create_inference_config
(),
generate_trt_nodes_num
(
attrs
,
True
),
1e-5
attrs
,
True
),
1e-3
    def test(self):
        # Drive the auto-scan harness over all sampled configs.
        self.run_test()
...
...
python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_anchor_generator.py
View file @
dbe08e9b
...
...
@@ -22,60 +22,66 @@ from typing import Optional, List, Callable, Dict, Any, Set
class
TrtConvertAnchorGeneratorTest
(
TrtLayerAutoScanTest
):
    def is_program_valid(self, program_config: ProgramConfig) -> bool:
        # Every sampled program configuration is accepted for this converter.
        return True
def
sample_program_configs
(
self
):
def
generate_input1
(
batch
,
attrs
:
List
[
Dict
[
str
,
Any
]]):
return
np
.
random
.
random
([
batch
,
3
,
64
,
64
]).
astype
(
np
.
float32
)
for
batch
in
[
1
,
2
,
4
]:
for
anchor_sizes
in
[[
64.0
,
128.0
,
256.0
,
512.0
]]:
for
aspect_ratios
in
[[
0.5
,
1
,
2
],
[
0.4
,
1.2
,
3
]]:
for
variances
in
[[
1.0
,
1.0
,
1.0
,
1.0
],
[
0.5
,
1.0
,
0.5
,
1.0
]]:
for
variances
in
[
[
1.0
,
1.0
,
1.0
,
1.0
],
[
0.5
,
1.0
,
0.5
,
1.0
],
]:
for
stride
in
[[
16.0
,
16.0
],
[
16.0
,
32.0
]]:
for
offset
in
[
0.5
,
0.8
]:
dics
=
[{
dics
=
[
{
"anchor_sizes"
:
anchor_sizes
,
"aspect_ratios"
:
aspect_ratios
,
"variances"
:
variances
,
"stride"
:
stride
,
"offset"
:
offset
}]
"offset"
:
offset
,
}
]
ops_config
=
[{
ops_config
=
[
{
"op_type"
:
"anchor_generator"
,
"op_inputs"
:
{
"Input"
:
[
"input_data"
]
},
"op_inputs"
:
{
"Input"
:
[
"input_data"
]},
"op_outputs"
:
{
"Anchors"
:
[
"output_anchors"
],
"Variances"
:
[
"output_variances"
]
"Variances"
:
[
"output_variances"
]
,
},
"op_attrs"
:
dics
[
0
]
}]
"op_attrs"
:
dics
[
0
],
}
]
ops
=
self
.
generate_op_config
(
ops_config
)
program_config
=
ProgramConfig
(
ops
=
ops
,
weights
=
{},
inputs
=
{
"input_data"
:
TensorConfig
(
data_gen
=
partial
(
generate_input1
,
batch
,
dics
))
"input_data"
:
TensorConfig
(
data_gen
=
partial
(
generate_input1
,
batch
,
dics
)
)
},
outputs
=
[
"output_anchors"
,
"output_variances"
])
"output_anchors"
,
"output_variances"
,
],
)
yield
program_config
def
sample_predictor_configs
(
self
,
program_config
)
->
(
paddle_infer
.
Config
,
List
[
int
],
float
):
self
,
program_config
)
->
(
paddle_infer
.
Config
,
List
[
int
],
float
):
def
generate_dynamic_shape
(
attrs
):
self
.
dynamic_shape
.
min_input_shape
=
{
"input_data"
:
[
1
,
3
,
32
,
32
]}
self
.
dynamic_shape
.
max_input_shape
=
{
"input_data"
:
[
4
,
3
,
64
,
64
]}
...
...
@@ -100,19 +106,23 @@ class TrtConvertAnchorGeneratorTest(TrtLayerAutoScanTest):
clear_dynamic_shape
()
self
.
trt_param
.
precision
=
paddle_infer
.
PrecisionType
.
Float32
yield
self
.
create_inference_config
(),
generate_trt_nodes_num
(
attrs
,
False
),
1e-5
attrs
,
False
),
1e-5
self
.
trt_param
.
precision
=
paddle_infer
.
PrecisionType
.
Half
yield
self
.
create_inference_config
(),
generate_trt_nodes_num
(
attrs
,
False
),
1e-5
attrs
,
False
),
1e-3
# for dynamic_shape
generate_dynamic_shape
(
attrs
)
self
.
trt_param
.
precision
=
paddle_infer
.
PrecisionType
.
Float32
yield
self
.
create_inference_config
(),
generate_trt_nodes_num
(
attrs
,
True
),
1e-5
attrs
,
True
),
1e-5
self
.
trt_param
.
precision
=
paddle_infer
.
PrecisionType
.
Half
yield
self
.
create_inference_config
(),
generate_trt_nodes_num
(
attrs
,
True
),
1e-5
attrs
,
True
),
1e-3
    def test(self):
        # Drive the auto-scan harness over all sampled configs.
        self.run_test()
...
...
Prev
1
…
6
7
8
9
10
11
12
13
14
…
16
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment