Commit dbe08e9b authored by yuguo960516yuguo

2.4.2

parent b5499578
@@ -22,7 +22,6 @@ import unittest
class TrtConvertTransposeTest(TrtLayerAutoScanTest):
def is_program_valid(self, program_config: ProgramConfig) -> bool:
inputs = program_config.inputs
weights = program_config.weights
@@ -32,14 +31,13 @@ class TrtConvertTransposeTest(TrtLayerAutoScanTest):
program_config.ops[i].attrs for i in range(len(program_config.ops))
]
# The shape of input and axis should be equal.
if len(inputs['transpose_input'].shape) != len(attrs[0]['axis']):
return False
return True
def sample_program_configs(self):
def generate_input1(attrs: List[Dict[str, Any]], batch):
if self.dims == 4:
return np.ones([batch, 3, 24, 24]).astype(np.float32)
@@ -50,37 +48,43 @@ class TrtConvertTransposeTest(TrtLayerAutoScanTest):
for dims in [2, 3, 4]:
for batch in [1, 2, 4]:
for axis in [
[0, 1, 3, 2],
[0, 3, 2, 1],
[3, 2, 0, 1],
[0, 1, 2, 3],
[0, 1, 2],
[2, 0, 1],
[1, 0],
[0, 1],
]:
self.dims = dims
dics = [{"axis": axis}, {}]
ops_config = [
{
"op_type": "transpose",
"op_inputs": {"X": ["transpose_input"]},
"op_outputs": {"Out": ["transpose_out"]},
"op_attrs": dics[0],
}
]
ops = self.generate_op_config(ops_config)
program_config = ProgramConfig(
ops=ops,
weights={},
inputs={
"transpose_input":
TensorConfig(
data_gen=partial(generate_input1, dics, batch))
"transpose_input": TensorConfig(
data_gen=partial(generate_input1, dics, batch)
)
},
outputs=["transpose_out"])
outputs=["transpose_out"],
)
yield program_config
def sample_predictor_configs(
self, program_config
) -> (paddle_infer.Config, List[int], float):
def generate_dynamic_shape(attrs):
if self.dims == 4:
self.dynamic_shape.min_input_shape = {
@@ -134,19 +138,23 @@ class TrtConvertTransposeTest(TrtLayerAutoScanTest):
clear_dynamic_shape()
self.trt_param.precision = paddle_infer.PrecisionType.Float32
yield self.create_inference_config(), generate_trt_nodes_num(
attrs, False
), 1e-5
self.trt_param.precision = paddle_infer.PrecisionType.Half
yield self.create_inference_config(), generate_trt_nodes_num(
attrs, False
), 1e-3
# for dynamic_shape
generate_dynamic_shape(attrs)
self.trt_param.precision = paddle_infer.PrecisionType.Float32
yield self.create_inference_config(), generate_trt_nodes_num(
attrs, True
), 1e-5
self.trt_param.precision = paddle_infer.PrecisionType.Half
yield self.create_inference_config(), generate_trt_nodes_num(
attrs, True
), 1e-3
def test(self):
self.run_test()
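# A minimal NumPy sketch of the validity rule enforced above: `axis` must be
# a permutation whose length equals the input rank (illustration only, not
# part of the original test suite):
_x = np.ones([2, 3, 24, 24], dtype=np.float32)
assert np.transpose(_x, axes=[0, 3, 2, 1]).shape == (2, 24, 24, 3)
try:
    np.transpose(_x, axes=[0, 2, 1])  # a 3-element axis cannot permute 4-D
except ValueError:
    pass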
@@ -22,7 +22,6 @@ from typing import Optional, List, Callable, Dict, Any, Set
class TrtConvertActivationTest(TrtLayerAutoScanTest):
def is_program_valid(self, program_config: ProgramConfig) -> bool:
return True
@@ -42,40 +41,54 @@ class TrtConvertActivationTest(TrtLayerAutoScanTest):
for dims in [1, 2, 3, 4]:
for batch in [1, 4]:
for op_type in [
"exp", "log", "sqrt", "abs", "sin", "cos", "tan",
"sinh", "cosh", "asin", "acos", "atan", "asinh",
"atanh", "ceil", "floor"
"exp",
"log",
"sqrt",
"abs",
"sin",
"cos",
"tan",
"sinh",
"cosh",
"asin",
"acos",
"atan",
"asinh",
"atanh",
"ceil",
"floor",
]:
self.dims = dims
dics = [{}]
ops_config = [
{
"op_type": op_type,
"op_inputs": {"X": ["input_data"]},
"op_outputs": {"Out": ["output_data"]},
"op_attrs": dics[0],
}
]
ops = self.generate_op_config(ops_config)
program_config = ProgramConfig(
ops=ops,
weights={},
inputs={
"input_data":
TensorConfig(data_gen=partial(
generate_input1, dims, batch, dics))
"input_data": TensorConfig(
data_gen=partial(
generate_input1, dims, batch, dics
)
)
},
outputs=["output_data"])
outputs=["output_data"],
)
yield program_config
def sample_predictor_configs(
self, program_config
) -> (paddle_infer.Config, List[int], float):
def generate_dynamic_shape(attrs):
if self.dims == 1:
self.dynamic_shape.min_input_shape = {"input_data": [1]}
@@ -118,19 +131,23 @@ class TrtConvertActivationTest(TrtLayerAutoScanTest):
clear_dynamic_shape()
self.trt_param.precision = paddle_infer.PrecisionType.Float32
yield self.create_inference_config(), generate_trt_nodes_num(
attrs, False
), 1e-5
self.trt_param.precision = paddle_infer.PrecisionType.Half
yield self.create_inference_config(), generate_trt_nodes_num(
attrs, False
), 1e-3
# for dynamic_shape
generate_dynamic_shape(attrs)
self.trt_param.precision = paddle_infer.PrecisionType.Float32
yield self.create_inference_config(), generate_trt_nodes_num(
attrs, True
), 1e-5
self.trt_param.precision = paddle_infer.PrecisionType.Half
yield self.create_inference_config(), generate_trt_nodes_num(
attrs, True
), 1e-3
def test(self):
self.run_test()
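# Why the FP16 runs above tolerate 1e-3 while FP32 keeps 1e-5: rounding an
# activation's result to half precision alone introduces errors of order
# 1e-3 for values near 1. A hedged NumPy sketch, not part of the original
# test:
_x = np.random.uniform(0.1, 1.0, 1000).astype(np.float32)
_fp16_err = np.abs(np.exp(_x) - np.exp(_x.astype(np.float16)).astype(np.float32))
assert _fp16_err.max() < 1e-2  # typically around 1e-3, far above FP32 noise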
@@ -22,46 +22,46 @@ import unittest
class TrtConvertUnfold(TrtLayerAutoScanTest):
def is_program_valid(self, program_config: ProgramConfig) -> bool:
return True
def sample_program_configs(self):
def generate_input1():
return np.random.random([1, 3, 24, 24]).astype(np.float32)
ops_config = [
{
"op_type": "unfold",
"op_inputs": {
"X": ["input_data"],
},
"op_outputs": {"Y": ["output_data"]},
"op_attrs": {
"dilations": [1, 1],
"kernel_sizes": [4, 4],
"paddings": [0, 0, 0, 0],
"strides": [1, 1],
},
}
]
ops = self.generate_op_config(ops_config)
for i in range(10):
program_config = ProgramConfig(
ops=ops,
weights={},
inputs={
"input_data":
TensorConfig(data_gen=partial(generate_input1)),
"input_data": TensorConfig(
data_gen=partial(generate_input1)
),
},
outputs=["output_data"])
outputs=["output_data"],
)
yield program_config
def sample_predictor_configs(
self, program_config
) -> (paddle_infer.Config, List[int], float):
def generate_dynamic_shape(attrs):
self.dynamic_shape.min_input_shape = {
"input_data": [1, 3, 4, 4],
@@ -87,14 +87,14 @@ class TrtConvertUnfold(TrtLayerAutoScanTest):
self.trt_param.precision = paddle_infer.PrecisionType.Float32
yield self.create_inference_config(), (0, 3), 1e-5
self.trt_param.precision = paddle_infer.PrecisionType.Half
yield self.create_inference_config(), (0, 3), 1e-3
# for dynamic_shape
generate_dynamic_shape(attrs)
self.trt_param.precision = paddle_infer.PrecisionType.Float32
yield self.create_inference_config(), (1, 2), 1e-5
self.trt_param.precision = paddle_infer.PrecisionType.Half
yield self.create_inference_config(), (1, 2), 1e-3
def test(self):
self.run_test()
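# For reference, `unfold` is im2col: each output column stacks one
# C * kh * kw patch. With the attributes above (4x4 kernel, stride 1, no
# padding, dilation 1) on the [1, 3, 24, 24] input, the shape arithmetic
# works out as follows (illustrative sketch only):
_N, _C, _H, _W = 1, 3, 24, 24
_out_h = _H - 4 + 1  # 21 vertical kernel positions
_out_w = _W - 4 + 1  # 21 horizontal kernel positions
assert (_N, _C * 4 * 4, _out_h * _out_w) == (1, 48, 441)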
@@ -22,7 +22,6 @@ from typing import Optional, List, Callable, Dict, Any, Set
class TrtConvertSplitTest(TrtLayerAutoScanTest):
def is_program_valid(self, program_config: ProgramConfig) -> bool:
return True
@@ -34,17 +33,17 @@ class TrtConvertSplitTest(TrtLayerAutoScanTest):
self.dims = dims
self.axes = axes
dics = [{"axes": axes}]
ops_config = [
{
"op_type": "unsqueeze2",
"op_inputs": {"X": ["in_data"]},
"op_outputs": {
"Out": ["out_data"],
"XShape": ["XShape_data"],
},
"op_attrs": dics[0],
}
]
# generate input data
self.input_shape = [1] * dims
@@ -54,24 +53,26 @@ class TrtConvertSplitTest(TrtLayerAutoScanTest):
def generate_input1(attrs: List[Dict[str, Any]], batch):
self.input_shape[0] = batch
return np.random.random(self.input_shape).astype(
np.float32
)
ops = self.generate_op_config(ops_config)
program_config = ProgramConfig(
ops=ops,
weights={},
inputs={
"in_data":
TensorConfig(
data_gen=partial(generate_input1, dics, batch))
"in_data": TensorConfig(
data_gen=partial(generate_input1, dics, batch)
)
},
outputs=["out_data"])
outputs=["out_data"],
)
yield program_config
def sample_predictor_configs(
self, program_config
) -> (paddle_infer.Config, List[int], float):
def generate_dynamic_shape(attrs):
max_shape = list(self.input_shape)
min_shape = list(self.input_shape)
@@ -98,19 +99,23 @@ class TrtConvertSplitTest(TrtLayerAutoScanTest):
clear_dynamic_shape()
self.trt_param.precision = paddle_infer.PrecisionType.Float32
yield self.create_inference_config(), generate_trt_nodes_num(
attrs, False
), 1e-5
self.trt_param.precision = paddle_infer.PrecisionType.Half
yield self.create_inference_config(), generate_trt_nodes_num(
attrs, False
), 1e-3
# for dynamic_shape
generate_dynamic_shape(attrs)
self.trt_param.precision = paddle_infer.PrecisionType.Float32
yield self.create_inference_config(), generate_trt_nodes_num(
attrs, True
), 1e-5
self.trt_param.precision = paddle_infer.PrecisionType.Half
yield self.create_inference_config(), generate_trt_nodes_num(
attrs, True
), 1e-3
def add_skip_trt_case(self):
pass
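# `unsqueeze2` inserts size-1 dimensions at the given `axes`; the extra
# XShape output only records the original shape for the backward pass.
# NumPy's expand_dims has the same shape semantics (sketch only, not part
# of the original test):
_x = np.random.random([4, 8]).astype(np.float32)
assert np.expand_dims(_x, axis=0).shape == (1, 4, 8)  # axes=[0]
assert np.expand_dims(_x, axis=2).shape == (4, 8, 1)  # axes=[2]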
@@ -47,7 +47,8 @@ class TestSqueeze2MatmulFusePass(PassAutoScanTest):
min_subgraph_size=0,
precision_mode=paddle_infer.PrecisionType.Float32,
use_static=False,
use_calib_mode=False,
)
yield config, ['mul', 'elementwise_add'], (1e-4, 1e-1)
def add_ignore_pass_case(self):
@@ -70,9 +71,10 @@ class TestSqueeze2MatmulFusePass(PassAutoScanTest):
def sample_program_config(self, draw):
# 1. Generate shape of input:X of squeeze2
x_shape = draw(
st.lists(
st.integers(min_value=1, max_value=8), min_size=2, max_size=2
)
)
# axes of squeeze2 == [2, 3]
x_shape += [1, 1]
axes = [2, 3]
@@ -84,9 +86,10 @@ class TestSqueeze2MatmulFusePass(PassAutoScanTest):
# 3. Generate legal shape of input:Y of matmul
y_shape = draw(
st.lists(
st.integers(min_value=1, max_value=8), min_size=2, max_size=2
)
)
y_shape[0] = x_shape[1]
# 4. Generate legal attr:axis of elementwise_add
@@ -108,17 +111,11 @@ class TestSqueeze2MatmulFusePass(PassAutoScanTest):
"X": ["squeeze2_x"],
},
axes=axes,
outputs={"Out": ["squeeze2_out"], "XShape": ["xshape"]},
)
matmul_op = OpConfig(
"matmul",
inputs={"X": ["squeeze2_out"], "Y": ["matmul_y"]},
outputs={"Out": ["matmul_out"]},
alpha=alpha,
transpose_X=transpose_X,
@@ -133,10 +130,7 @@ class TestSqueeze2MatmulFusePass(PassAutoScanTest):
add_op = OpConfig(
"elementwise_add",
inputs={"X": ["matmul_out"], "Y": ["bias"]},
outputs={"Out": ["add_out"]},
axis=axis,
)
@@ -157,9 +151,11 @@ class TestSqueeze2MatmulFusePass(PassAutoScanTest):
return program_config
def test(self):
self.run_and_statis(
quant=False,
max_examples=25,
passes=["trt_squeeze2_matmul_fuse_pass"],
)
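# The fused pattern exercised above, squeeze2(axes=[2, 3]) -> matmul ->
# elementwise_add, is numerically just `mul` plus a bias, which is why the
# rewritten program is compared against ['mul', 'elementwise_add']. A NumPy
# sketch of the equivalence (illustration only):
_x = np.random.random([5, 7, 1, 1]).astype(np.float32)
_y = np.random.random([7, 3]).astype(np.float32)
_b = np.random.random([3]).astype(np.float32)
_fused = np.squeeze(_x, axis=(2, 3)) @ _y + _b
np.testing.assert_allclose(_fused, _x.reshape(5, 7) @ _y + _b, rtol=1e-6)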
if __name__ == "__main__":
@@ -24,13 +24,15 @@ paddle.enable_static()
np.random.seed(0)
@unittest.skipIf(
not paddle.is_compiled_with_cuda()
or paddle.get_cudnn_version() < 8000
or paddle.device.cuda.get_device_capability()[0] < 7
or paddle.device.cuda.get_device_capability()[0] >= 9,
"only support with cuda and cudnn version is at least 8.0 "
"and device's compute capability is at least 7.0 and less than 9.0",
)
class TestFuseResNetUnit(unittest.TestCase):
def test_fuse_resenet_unit(self):
place = paddle.CUDAPlace(0)
program = paddle.static.Program()
@@ -38,14 +40,12 @@ class TestFuseResNetUnit(unittest.TestCase):
with paddle.static.amp.fp16_guard():
with paddle.static.program_guard(program, startup_program):
x = paddle.static.data("x", [1, 64, 64, 8])
conv2d = paddle.nn.Conv2D(
8, 32, 1, bias_attr=False, data_format='NHWC'
)
batch_norm = paddle.nn.BatchNorm(
32, act='relu', data_layout='NHWC'
)
out = batch_norm(conv2d(x))
graph = core.Graph(program.desc)
core.get_pass("fuse_resnet_unit").apply(graph)
@@ -54,15 +54,15 @@ class TestFuseResNetUnit(unittest.TestCase):
after_params = paddle.static.amp.cast_model_to_fp16(after_program)
exe = paddle.static.Executor(place)
exe.run(startup_program)
paddle.static.amp.cast_parameters_to_fp16(
place, program, to_fp16_var_names=params
)
paddle.static.amp.cast_parameters_to_fp16(
place, after_program, to_fp16_var_names=after_params
)
feed = {"x": np.random.randn(1, 64, 64, 8).astype("float16")}
before_out = exe.run(program, feed=feed, fetch_list=[out.name])
after_out = exe.run(after_program, feed=feed, fetch_list=[out.name])
np.testing.assert_allclose(
before_out[0], after_out[0], rtol=1e-05, atol=0.005
)
@@ -34,7 +34,10 @@ import unittest
from multiprocessing import Process
import paddle.fluid.layers as layers
from functools import reduce
from test_sync_batch_norm_base_mlu import (
TestSyncBatchNormRunnerBase,
runtime_main,
)
from op_test import OpTest, _set_use_system_allocator
from test_sync_batch_norm_op import create_or_get_tensor
@@ -44,11 +47,11 @@ paddle.enable_static()
class TestSyncBatchNormOpTraining(TestSyncBatchNormRunnerBase):
def __init__(self):
self.global_ring_id = 0
self.dtype = np.float32
self.bn_dtype = np.float32
self.N = 8
self.C = 16
self.H = 32
@@ -56,29 +59,36 @@ class TestSyncBatchNormOpTraining(TestSyncBatchNormRunnerBase):
self.dshape = [self.N, self.C, self.H, self.W]
self.atol = 1e-3
def get_model(
self,
main,
startup,
place,
layout,
seed,
sync_bn=False,
only_forward=False,
):
"""Build program."""
use_cudnn = False
with fluid.unique_name.guard():
with fluid.program_guard(main, startup):
data = fluid.layers.data(
name='input',
shape=self.dshape,
dtype=self.dtype,
append_batch_size=False,
)
conv = fluid.layers.conv2d(
input=data,
num_filters=32,
filter_size=1,
param_attr=fluid.ParamAttr(name='conv2d_weight'),
bias_attr=False,
use_cudnn=use_cudnn,
)
if self.bn_dtype == np.float16:
conv = fluid.layers.cast(conv, 'float16')
bn = fluid.layers.batch_norm(
conv,
param_attr=fluid.ParamAttr(name='bn_scale'),
@@ -86,9 +96,10 @@ class TestSyncBatchNormOpTraining(TestSyncBatchNormRunnerBase):
moving_mean_name='bn_moving_mean',
moving_variance_name='bn_moving_variance',
data_layout=layout,
is_test=only_forward,
)
if self.bn_dtype == np.float16:
bn = fluid.layers.cast(bn, 'float32')
sigmoid = fluid.layers.sigmoid(bn)
out = fluid.layers.reduce_sum(sigmoid)
# if not sync_bn:
@@ -41,10 +41,10 @@ def DataTypeCast(date_type):
class TestCollectiveAPIRunnerBase(object):
def get_model(self, train_prog, startup_prog, rank, indata=None):
raise NotImplementedError(
"get model should be implemented by child class.")
"get model should be implemented by child class."
)
def run_trainer(self, args):
train_prog = fluid.Program()
@@ -66,12 +66,12 @@ class TestCollectiveAPIRunnerBase(object):
fetch_list = []
for elem in result:
fetch_list.append(elem.name)
out = exe.run(
train_prog, feed={'tindata': indata}, fetch_list=fetch_list
)
else:
out = self.get_model(train_prog, startup_prog, rank, indata)
# print(out, sys.stderr)
sys.stdout.buffer.write(pickle.dumps(out))
@@ -96,19 +96,20 @@ from contextlib import closing
class TestDistBase(unittest.TestCase):
def setUp(self):
self._port_set = set()
self._trainers = 2
self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
self._find_free_port(),
self._find_free_port(),
)
self._python_interp = sys.executable
def _find_free_port(self):
def __free_port():
with closing(
socket.socket(socket.AF_INET, socket.SOCK_STREAM)
) as s:
s.bind(('', 0))
return s.getsockname()[1]
@@ -121,13 +122,13 @@ class TestDistBase(unittest.TestCase):
def _run_cluster(self, model_file, envs):
worker_endpoints = self._ps_endpoints.split(",")
w0_ep, w1_ep = worker_endpoints
#print("w0_ep:",w0_ep," w1_ep:",w1_ep)
# print("w0_ep:",w0_ep," w1_ep:",w1_ep)
env0 = {
"FLAGS_selected_mlus": "0",
"PADDLE_TRAINER_ID": "0",
"PADDLE_TRAINERS_NUM": "2",
"PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints,
"PADDLE_CURRENT_ENDPOINT": w0_ep
"PADDLE_CURRENT_ENDPOINT": w0_ep,
}
env1 = {
@@ -135,9 +136,9 @@ class TestDistBase(unittest.TestCase):
"PADDLE_TRAINER_ID": "1",
"PADDLE_TRAINERS_NUM": "2",
"PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints,
"PADDLE_CURRENT_ENDPOINT": w1_ep
"PADDLE_CURRENT_ENDPOINT": w1_ep,
}
# update environment
env0.update(envs)
env1.update(envs)
if os.getenv('WITH_COVERAGE', 'OFF') == 'ON':
@@ -148,16 +149,20 @@ class TestDistBase(unittest.TestCase):
tr1_cmd = tr_cmd % (self._python_interp, model_file)
tr0_pipe = open("/tmp/tr0_err_%d.log" % os.getpid(), "w")
tr1_pipe = open("/tmp/tr1_err_%d.log" % os.getpid(), "w")
# print(tr0_cmd)
tr0_proc = subprocess.Popen(
tr0_cmd.strip().split(),
stdout=subprocess.PIPE,
stderr=tr0_pipe,
env=env0,
)
tr1_proc = subprocess.Popen(
tr0_cmd.strip().split(),
stdout=subprocess.PIPE,
stderr=tr1_pipe,
env=env1,
)
tr0_out, tr0_err = tr0_proc.communicate()
tr1_out, tr1_err = tr1_proc.communicate()
@@ -170,17 +175,23 @@ class TestDistBase(unittest.TestCase):
sys.stderr.write('trainer 0 stderr file: %s\n' % f.read())
with open("/tmp/tr1_err_%d.log" % os.getpid(), "r") as f:
sys.stderr.write('trainer 1 stderr file: %s\n' % f.read())
return (
pickle.loads(tr0_out),
pickle.loads(tr1_out),
tr0_proc.pid,
tr1_proc.pid,
)
def check_with_place(
self,
model_file,
col_type,
data_type,
path_id="0",
static_mode="1",
check_error_log=False,
need_envs={},
):
required_envs = {
"FLAGS_fraction_of_gpu_memory_to_use": "0.15",
"FLAGS_eager_delete_tensor_gb": "0.0",
@@ -194,7 +205,7 @@ class TestDistBase(unittest.TestCase):
"PADDLE_WITH_GLOO": '0',
"BACKEND": "cncl",
"PATH_ID": path_id,
"DATA_TYPE": data_type
"DATA_TYPE": data_type,
}
required_envs.update(need_envs)
if check_error_log:
@@ -202,7 +213,8 @@ class TestDistBase(unittest.TestCase):
required_envs["GLOG_logtostderr"] = "1"
required_envs["GLOO_LOG_LEVEL"] = "TRACE"
tr0_out, tr1_out, pid0, pid1 = self._run_cluster(
model_file, required_envs
)
np_data_type = DataTypeCast(data_type)
np.random.seed(pid0)
input1 = np.random.random((10, 1000)).astype(np_data_type)
@@ -210,21 +222,19 @@ class TestDistBase(unittest.TestCase):
input2 = np.random.random((10, 1000)).astype(np_data_type)
if col_type == "broadcast":
need_result = input2
np.testing.assert_allclose(tr0_out[0], need_result)
np.testing.assert_allclose(tr1_out[0], need_result)
elif col_type == "allreduce":
need_result = input1 + input2
np.testing.assert_allclose(
tr0_out[0], need_result, rtol=1e-05, atol=1e-05
)
np.testing.assert_allclose(
tr1_out[0], need_result, rtol=1e-05, atol=1e-05
)
elif col_type == "reduce":
need_result = input1 + input2
np.testing.assert_allclose(tr0_out[0], need_result)
elif col_type == "allgather":
need_result = np.vstack((input1, input2))
tr_out0 = np.vstack((tr0_out[0], tr0_out[1]))
@@ -53,10 +53,10 @@ def DataTypeCast(date_type):
class TestCollectiveRunnerBase(object):
def get_model(self, train_prog, startup_prog, col_type):
raise NotImplementedError(
"get model should be implemented by child class.")
"get model should be implemented by child class."
)
def wait_server_ready(self, endpoints):
while True:
@@ -64,13 +64,15 @@ class TestCollectiveRunnerBase(object):
not_ready_endpoints = []
for ep in endpoints:
ip_port = ep.split(":")
with closing(
socket.socket(socket.AF_INET, socket.SOCK_STREAM)
) as sock:
sock.settimeout(2)
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
if hasattr(socket, 'SO_REUSEPORT'):
sock.setsockopt(
socket.SOL_SOCKET, socket.SO_REUSEPORT, 1
)
result = sock.connect_ex((ip_port[0], int(ip_port[1])))
if result != 0:
@@ -78,44 +80,51 @@ class TestCollectiveRunnerBase(object):
not_ready_endpoints.append(ep)
if not all_ok:
sys.stderr.write("server not ready, wait 3 sec to retry...\n")
sys.stderr.write("not ready endpoints:" +
str(not_ready_endpoints) + "\n")
sys.stderr.write(
"not ready endpoints:" + str(not_ready_endpoints) + "\n"
)
sys.stderr.flush()
time.sleep(3)
else:
break
# endpoints should be ["ip1:port1","ip2:port2"]
def initCommunicator(
self, program, rank, nranks, wait_port, current_endpoint, endpoints
):
other_endpoints = endpoints[:]
other_endpoints.remove(current_endpoint)
if rank == 0 and wait_port:
self.wait_server_ready(other_endpoints)
block = program.global_block()
cncl_id_var = block.create_var(
name=nameGen.generate('cncl_id'),
persistable=True,
type=core.VarDesc.VarType.RAW,
)
block.append_op(
type='c_gen_cncl_id',
inputs={},
outputs={'Out': cncl_id_var},
attrs={
'rank': rank,
'endpoint': current_endpoint,
'other_endpoints': other_endpoints,
},
)
block.append_op(
type='c_comm_init',
inputs={'X': cncl_id_var},
outputs={},
attrs={
'nranks': nranks,
'rank': rank,
'ring_id': self.global_ring_id,
},
)
def run_trainer(self, args):
train_prog = fluid.Program()
@@ -124,8 +133,9 @@ class TestCollectiveRunnerBase(object):
rank = args["trainerid"]
current_endpoint = args["currentendpoint"]
nranks = 2
self.initCommunicator(
startup_prog, rank, nranks, True, current_endpoint, endpoints
)
self.rank = rank
result = self.get_model(train_prog, startup_prog, args["col_type"])
device_id = int(os.getenv("FLAGS_selected_mlus", "0"))
@@ -135,9 +145,9 @@ class TestCollectiveRunnerBase(object):
np.random.seed(os.getpid())
np_data_type = DataTypeCast(args["data_type"])
indata = np.random.random((10, 1000)).astype(np_data_type)
out = exe.run(
train_prog, feed={'tindata': indata}, fetch_list=[result.name]
)
sys.stdout.buffer.write(pickle.dumps(out))
@@ -160,19 +170,20 @@ from contextlib import closing
class TestDistBase(unittest.TestCase):
def setUp(self):
self._port_set = set()
self._trainers = 2
self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
self._find_free_port(),
self._find_free_port(),
)
self._python_interp = sys.executable
def _find_free_port(self):
def __free_port():
with closing(
socket.socket(socket.AF_INET, socket.SOCK_STREAM)
) as s:
s.bind(('', 0))
return s.getsockname()[1]
@@ -191,7 +202,7 @@ class TestDistBase(unittest.TestCase):
"PADDLE_TRAINER_ID": "0",
"PADDLE_TRAINERS_NUM": "2",
"PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints,
"PADDLE_CURRENT_ENDPOINT": w0_ep
"PADDLE_CURRENT_ENDPOINT": w0_ep,
}
env1 = {
@@ -199,9 +210,9 @@ class TestDistBase(unittest.TestCase):
"PADDLE_TRAINER_ID": "1",
"PADDLE_TRAINERS_NUM": "2",
"PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints,
"PADDLE_CURRENT_ENDPOINT": w1_ep
"PADDLE_CURRENT_ENDPOINT": w1_ep,
}
# update environment
env0.update(envs)
env1.update(envs)
tr_cmd = "%s %s"
@@ -210,15 +221,19 @@ class TestDistBase(unittest.TestCase):
tr0_pipe = open("/tmp/tr0_err.log", "wb")
tr1_pipe = open("/tmp/tr1_err.log", "wb")
tr0_proc = subprocess.Popen(
tr0_cmd.strip().split(),
stdout=subprocess.PIPE,
stderr=tr0_pipe,
env=env0,
)
tr1_proc = subprocess.Popen(
tr0_cmd.strip().split(),
stdout=subprocess.PIPE,
stderr=tr1_pipe,
env=env1,
)
tr0_out, tr0_err = tr0_proc.communicate()
tr1_out, tr1_err = tr1_proc.communicate()
@@ -227,15 +242,21 @@ class TestDistBase(unittest.TestCase):
# close trainer file
tr0_pipe.close()
tr1_pipe.close()
return (
pickle.loads(tr0_out),
pickle.loads(tr1_out),
tr0_proc.pid,
tr1_proc.pid,
)
def check_with_place(
self,
model_file,
col_type,
data_type,
check_error_log=False,
need_envs={},
):
required_envs = {
"FLAGS_eager_delete_tensor_gb": "0.0",
"PATH": os.getenv("PATH"),
@@ -251,7 +272,8 @@ class TestDistBase(unittest.TestCase):
required_envs["GLOG_v"] = "3"
required_envs["GLOG_logtostderr"] = "1"
tr0_out, tr1_out, pid0, pid1 = self._run_cluster(
model_file, required_envs
)
np_data_type = DataTypeCast(data_type)
np.random.seed(pid0)
input1 = np.random.random((10, 1000)).astype(np_data_type)
@@ -259,63 +281,55 @@ class TestDistBase(unittest.TestCase):
input2 = np.random.random((10, 1000)).astype(np_data_type)
if col_type == "broadcast":
need_result = input2
np.testing.assert_allclose(tr0_out[0], need_result)
np.testing.assert_allclose(tr1_out[0], need_result)
elif col_type == "allreduce_sum":
need_result = input1 + input2
np.testing.assert_allclose(
tr0_out[0], need_result, rtol=1e-05, atol=1e-05
)
np.testing.assert_allclose(
tr1_out[0], need_result, rtol=1e-05, atol=1e-05
)
elif col_type == "allreduce_prod":
need_result = input1 * input2
np.testing.assert_allclose(
tr0_out[0], need_result, rtol=1e-05, atol=1e-05
)
np.testing.assert_allclose(
tr1_out[0], need_result, rtol=1e-05, atol=1e-05
)
elif col_type == "allreduce_max":
need_result = np.maximum(input1, input2)
np.testing.assert_allclose(
tr0_out[0], need_result, rtol=1e-05, atol=1e-05
)
np.testing.assert_allclose(
tr1_out[0], need_result, rtol=1e-05, atol=1e-05
)
elif col_type == "allreduce_min":
need_result = np.minimum(input1, input2)
np.testing.assert_allclose(
tr0_out[0], need_result, rtol=1e-05, atol=1e-05
)
np.testing.assert_allclose(
tr1_out[0], need_result, rtol=1e-05, atol=1e-05
)
elif col_type == "reduce_sum":
need_result = input1 + input2
np.testing.assert_allclose(tr1_out[0], need_result)
elif col_type == "reduce_prod":
need_result = input1 * input2
np.testing.assert_allclose(tr1_out[0], need_result)
elif col_type == "reduce_max":
need_result = np.maximum(input1, input2)
np.testing.assert_allclose(tr1_out[0], need_result)
elif col_type == "reduce_min":
need_result = np.minimum(input1, input2)
np.testing.assert_allclose(tr1_out[0], need_result)
elif col_type == "allgather":
need_result = np.vstack((input1, input2))
np.testing.assert_allclose(tr0_out[0], need_result)
np.testing.assert_allclose(tr1_out[0], need_result)
else:
pass
@@ -29,26 +29,44 @@ SEED = 2022
class TestDropoutOp(OpTest):
def setUp(self):
self.op_type = "dropout"
self.set_mlu()
self.init_dtype()
self.init_inputs_shape()
self.init_attrs()
self.op_type = 'dropout'
self.inputs = {'X': np.random.random(self.shape).astype(self.dtype)}
self.attrs = {
'dropout_prob': self.dropout_prob,
'fix_seed': self.fix_seed,
'is_test': self.is_test,
'dropout_implementation': self.dropout_implementation,
}
out = self.inputs['X'] * (1.0 - self.dropout_prob)
if self.is_test == False:
mask = None
if self.dropout_prob == 0.0:
mask = np.ones(self.shape).astype('uint8')
elif self.dropout_prob == 1.0:
mask = np.zeros(self.shape).astype('uint8')
self.outputs = {'Out': out, 'Mask': mask}
else:
self.outputs = {'Out': out}
def init_dtype(self):
self.dtype = np.float32
def init_inputs_shape(self):
self.shape = [32, 64]
def init_attrs(self):
self.__class__.no_need_check_grad = False
self.dropout_prob = 0.0
self.fix_seed = True
self.is_test = False
self.dropout_implementation = "upscale_in_train"
def set_mlu(self):
self.__class__.use_mlu = True
self.place = paddle.device.MLUPlace(0)
@@ -57,84 +75,107 @@ class TestDropoutOp(OpTest):
self.check_output_with_place(self.place)
def test_check_grad_normal(self):
if (
hasattr(self.__class__, "no_need_check_grad")
and self.__class__.no_need_check_grad == True
):
return
self.check_grad_with_place(self.place, ['X'], 'Out')
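# The expected outputs built in setUp() follow the deterministic dropout
# cases only: in test mode, `upscale_in_train` is the identity while
# `downgrade_in_infer` scales by (1 - p); in training mode only p == 0
# (keep all, mask of ones) and p == 1 (drop all, mask of zeros) are
# deterministic. A hedged reference sketch, not used by the tests:
def _dropout_expected(x, p, is_test, mode):
    if is_test:
        return x if mode == "upscale_in_train" else x * (1.0 - p)
    assert p in (0.0, 1.0)  # only these training cases are deterministic
    return np.zeros_like(x) if p == 1.0 else x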
class TestDropoutOpInput1d(TestDropoutOp):
# the input is 1-D
def init_inputs_shape(self):
self.shape = [2000]
class TestDropoutOp2(TestDropoutOp):
# the dropout_prob is 1.0
def init_inputs_shape(self):
self.shape = [32, 64]
def init_attrs(self):
self.dropout_prob = 1.0
self.fix_seed = True
self.is_test = False
self.dropout_implementation = "upscale_in_train"
class TestDropoutOp3(TestDropoutOp):
# the input dim is 3
def init_inputs_shape(self):
self.shape = [32, 64, 2]
class TestDropoutOp4(TestDropoutOp):
def init_attrs(self):
self.__class__.no_need_check_grad = True
self.dropout_prob = 0.35
self.fix_seed = True
self.is_test = True
self.dropout_implementation = "downgrade_in_infer"
class TestDropoutOp5(TestDropoutOp):
def init_inputs_shape(self):
self.shape = [32, 64, 3]
def init_attrs(self):
self.__class__.no_need_check_grad = True
self.dropout_prob = 0.75
self.fix_seed = True
self.is_test = True
self.dropout_implementation = "downgrade_in_infer"
class TestDropoutOp6(TestDropoutOp):
def init_attrs(self):
self.__class__.no_need_check_grad = True
self.dropout_prob = 0.0
self.fix_seed = True
self.is_test = False
self.dropout_implementation = "downgrade_in_infer"
class TestDropoutOpWithSeed(TestDropoutOp):
# the seed is a Tensor
def setUp(self):
self.op_type = "dropout"
self.set_mlu()
self.dtype = np.float32
self.inputs = {
"X": np.random.random((32, 64)).astype(self.dtype),
"Seed": np.asarray([125], dtype="int32"),
}
self.attrs = {
'dropout_prob': 0.0,
'fix_seed': True,
'is_test': False,
'dropout_implementation': 'upscale_in_train',
}
self.outputs = {
'Out': self.inputs['X'],
'Mask': np.ones((32, 64)).astype('uint8'),
}
def set_mlu(self):
self.__class__.use_mlu = True
self.place = paddle.device.MLUPlace(0)
def test_check_output(self):
self.check_output_with_place(self.place)
def test_check_grad_normal(self):
self.check_grad_with_place(self.place, ['X'], 'Out')
class TestDropoutOpFp16(TestDropoutOp):
# float16
def init_dtype(self):
self.dtype = np.float16
def set_mlu(self):
self.__class__.use_mlu = True
self.place = paddle.device.MLUPlace(0)
self.__class__.no_need_check_grad = True
@skip_check_grad_ci(reason="For inference, check_grad is not required.")
class TestDropoutOpInference(OpTest):
@@ -148,7 +189,7 @@ class TestDropoutOpInference(OpTest):
'dropout_prob': 0.35,
'fix_seed': True,
'is_test': True,
'dropout_implementation': 'upscale_in_train',
}
self.outputs = {'Out': self.inputs['X']}
@@ -165,7 +206,6 @@ class TestDropoutOpInference(OpTest):
@skip_check_grad_ci(reason="For inference, check_grad is not required.")
class TestDropoutOpInference2(TestDropoutOpInference):
def setUp(self):
self.op_type = "dropout"
self.set_mlu()
@@ -174,45 +214,12 @@ class TestDropoutOpInference2(TestDropoutOpInference):
self.attrs = {
'dropout_prob': 0.75,
'is_test': True,
'dropout_implementation': 'upscale_in_train',
}
self.outputs = {'Out': self.inputs['X']}
class TestDropoutAPI(unittest.TestCase):
def setUp(self):
np.random.seed(123)
self.places = [fluid.CPUPlace(), paddle.device.MLUPlace(0)]
@@ -220,43 +227,44 @@ class TestDropoutAPI(unittest.TestCase):
def check_static_result(self, place):
with fluid.program_guard(fluid.Program(), fluid.Program()):
input = fluid.data(name="input", shape=[40, 40], dtype="float32")
res1 = paddle.nn.functional.dropout(
x=input, p=0.0, training=False, mode='upscale_in_train'
)
res2 = paddle.nn.functional.dropout(
x=input, p=0.0, axis=0, training=True, mode='upscale_in_train'
)
res3 = paddle.nn.functional.dropout(
x=input, p=0.0, axis=0, training=False, mode='upscale_in_train'
)
res4 = paddle.nn.functional.dropout(
x=input,
p=0.0,
axis=[0, 1],
training=True,
mode='upscale_in_train',
)
res5 = paddle.nn.functional.dropout(
x=input,
p=0.0,
axis=[0, 1],
training=False,
mode='upscale_in_train',
)
res6 = paddle.nn.functional.dropout(
x=input, p=1.0, training=True, mode='upscale_in_train'
)
res7 = paddle.fluid.layers.dropout(
x=input,
dropout_prob=0.0,
dropout_implementation='upscale_in_train',
)
res8 = paddle.nn.functional.dropout(
x=input,
p=0.0,
axis=(0, 1),
training=False,
mode='upscale_in_train',
)
in_np = np.random.random([40, 40]).astype("float32")
res_np = in_np
@@ -265,13 +273,17 @@ class TestDropoutAPI(unittest.TestCase):
exe = fluid.Executor(place)
res_list = [res1, res2, res3, res4, res5, res7, res8]
for res in res_list:
fetches = exe.run(
fluid.default_main_program(),
feed={"input": in_np},
fetch_list=[res],
)
np.testing.assert_allclose(fetches[0], res_np)
fetches2 = exe.run(
fluid.default_main_program(),
feed={"input": in_np},
fetch_list=[res6],
)
np.testing.assert_allclose(fetches2[0], res_np2)
def test_static(self):
@@ -28,12 +28,15 @@ def AffineGrid(theta, grid_shape):
n = grid_shape[0]
h = grid_shape[1]
w = grid_shape[2]
h_idx = np.repeat(np.linspace(-1, 1, h)[np.newaxis, :], w, axis=0).T[
:, :, np.newaxis
]
w_idx = np.repeat(np.linspace(-1, 1, w)[np.newaxis, :], h, axis=0)[
:, :, np.newaxis
]
grid = np.concatenate(
[w_idx, h_idx, np.ones([h, w, 1])], axis=2
) # h * w * 3
grid = np.repeat(grid[np.newaxis, :], n, axis=0) # n * h * w *3
ret = np.zeros([n, h * w, 2])
@@ -53,13 +56,17 @@ def getGridPointValue(data, x, y):
out_H = x.shape[1]
out_W = x.shape[2]
# out = np.zeros(data_shape, dtype='float32')
out = np.zeros([N, C, out_H, out_W], dtype='float32')
for i in range(N):
for j in range(out_H):
for k in range(out_W):
if (
y[i, j, k] < 0
or y[i, j, k] > in_H - 1
or x[i, j, k] < 0
or x[i, j, k] > in_W - 1
):
out[i, :, j, k] = 0
else:
out[i, :, j, k] = data[i, :, y[i, j, k], x[i, j, k]]
@@ -75,27 +82,28 @@ def unnormalizeAndClip(grid_slice, max_val, align_corners, padding_mode):
if align_corners:
grid_slice = 0.5 * ((grid_slice.astype('float32') + 1.0) * max_val)
else:
grid_slice = (
0.5 * ((grid_slice.astype('float32') + 1.0) * (max_val + 1)) - 0.5
)
if padding_mode == "border":
grid_slice = clip(grid_slice, 0, max_val)
elif padding_mode == "reflection":
double_range = 2 * max_val if align_corners else (max_val + 1) * 2
grid_abs = (
np.abs(grid_slice) if align_corners else np.abs(grid_slice + 0.5)
)
extra = grid_abs - np.floor(grid_abs / double_range) * double_range
grid_slice = np.minimum(extra, double_range - extra)
grid_slice = (
grid_slice if align_corners else clip(grid_slice - 0.5, 0, max_val)
)
return grid_slice
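# Grid coordinates arrive normalized to [-1, 1]; unnormalizeAndClip() maps
# them to pixel indices. With align_corners=True the endpoints land exactly
# on the corner pixels; otherwise the mapping is shifted by half a pixel and
# clipped afterwards. A sketch for a width-5 axis (max_val = 4), for
# illustration only:
_g = np.array([-1.0, 0.0, 1.0])
assert np.allclose(0.5 * (_g + 1.0) * 4, [0.0, 2.0, 4.0])         # aligned
assert np.allclose(0.5 * (_g + 1.0) * 5 - 0.5, [-0.5, 2.0, 4.5])  # shifted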
def GridSampler(
data, grid, align_corners=True, mode="bilinear", padding_mode="zeros"
):
dims = data.shape
N = dims[0]
in_C = dims[1]
@@ -119,14 +127,18 @@ def GridSampler(data,
y0 = np.floor(y).astype('int32')
y1 = y0 + 1
wa = np.tile(
((x1 - x) * (y1 - y)).reshape((N, 1, out_H, out_W)), (1, in_C, 1, 1)
)
wb = np.tile(
((x1 - x) * (y - y0)).reshape((N, 1, out_H, out_W)), (1, in_C, 1, 1)
)
wc = np.tile(
((x - x0) * (y1 - y)).reshape((N, 1, out_H, out_W)), (1, in_C, 1, 1)
)
wd = np.tile(
((x - x0) * (y - y0)).reshape((N, 1, out_H, out_W)), (1, in_C, 1, 1)
)
va = getGridPointValue(data, x0, y0)
vb = getGridPointValue(data, x0, y1)
@@ -142,7 +154,6 @@ class TestGridSamplerOp(OpTest):
class TestGridSamplerOp(OpTest):
def setUp(self):
self.place = paddle.device.MLUPlace(0)
self.__class__.use_mlu = True
@@ -166,12 +177,12 @@ class TestGridSamplerOp(OpTest):
'use_cudnn': False,
"align_corners": self.align_corners,
"padding_mode": self.padding_mode,
"mode": self.mode
"mode": self.mode,
}
self.outputs = {
'Output': GridSampler(
x, grid, self.align_corners, self.mode, self.padding_mode
)
}
def test_check_output(self):
@@ -186,20 +197,17 @@ class TestGridSamplerOp(OpTest):
self.mode = "bilinear"
class Case1(TestGridSamplerOp):
def initTestCase(self):
self.x_shape = (2, 3, 5, 6)
self.grid_shape = (2, 8, 9, 2)
self.theta_shape = (2, 2, 3)
self.align_corners = True
self.padding_mode = "zeros"
self.mode = "bilinear"
class LargeInputCase(TestGridSamplerOp):
def initTestCase(self):
self.x_shape = (2, 3, 128, 128)
self.grid_shape = (2, 130, 130, 2)
@@ -209,16 +217,15 @@ class LargeInputCase(TestGridSamplerOp):
self.mode = "bilinear"
class Case2(LargeInputCase):
def initTestCase(self):
self.x_shape = (2, 3, 128, 128)
self.grid_shape = (2, 130, 130, 2)
self.theta_shape = (2, 2, 3)
self.align_corners = True
self.padding_mode = "zeros"
self.mode = "bilinear"
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import sys
sys.path.append("..")
from op_test import OpTest
import paddle
import paddle.fluid as fluid
from paddle.fluid import compiler, Program, program_guard
paddle.enable_static()
def huber_loss_forward(val, delta):
abs_val = abs(val)
if abs_val <= delta:
return 0.5 * val * val
else:
return delta * (abs_val - 0.5 * delta)
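# huber_loss_forward() above is the piecewise Huber definition: quadratic
# for |r| <= delta, linear beyond it. A vectorized NumPy sketch with two
# worked values (illustration only, not used by the test):
def _huber_reference(residual, delta=1.0):
    abs_r = np.abs(residual)
    return np.where(
        abs_r <= delta,
        0.5 * residual * residual,      # quadratic branch
        delta * (abs_r - 0.5 * delta),  # linear branch
    )

assert np.isclose(_huber_reference(np.float64(0.5)), 0.125)  # 0.5 * 0.5**2
assert np.isclose(_huber_reference(np.float64(2.0)), 1.5)    # 1 * (2 - 0.5)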
class TestHuberLossOp(OpTest):
def setUp(self):
self.op_type = 'huber_loss'
self.set_mlu()
self.python_api = paddle.fluid.layers.huber_loss
self.python_out_sig = ["Out"]
self.delta = 1.0
self.init_input()
shape = self.set_shape()
residual = self.inputs['Y'] - self.inputs['X']
loss = np.vectorize(huber_loss_forward)(residual, self.delta).astype(
'float32'
)
self.attrs = {'delta': self.delta}
self.outputs = {'Residual': residual, 'Out': loss.reshape(shape)}
def init_input(self):
shape = self.set_shape()
self.inputs = {
'X': np.random.uniform(0, 1.0, shape).astype('float32'),
'Y': np.random.uniform(0, 1.0, shape).astype('float32'),
}
def set_mlu(self):
self.__class__.use_mlu = True
self.place = paddle.MLUPlace(0)
def set_shape(self):
return (100, 1)
def test_check_output(self):
self.check_output_with_place(self.place, atol=1e-3)
def test_check_grad_normal(self):
self.check_grad_with_place(self.place, ['X', 'Y'], 'Out')
def test_check_grad_ingore_x(self):
self.check_grad_with_place(
self.place,
['Y'],
'Out',
max_relative_error=0.008,
no_grad_set=set("residual"),
)
def test_check_grad_ingore_y(self):
self.check_grad_with_place(
self.place,
['X'],
'Out',
max_relative_error=0.008,
no_grad_set=set('residual'),
)
class TestHuberLossOp1(TestHuberLossOp):
def set_shape(self):
return 64
class TestHuberLossOp2(TestHuberLossOp):
def set_shape(self):
return (6, 6)
class TestHuberLossOp3(TestHuberLossOp):
def set_shape(self):
return (6, 6, 1)
class TestHuberLossOpError(unittest.TestCase):
def test_errors(self):
with program_guard(Program(), Program()):
# the input and label must be Variable
xw = np.random.random((6, 6)).astype("float32")
xr = fluid.data(name='xr', shape=[None, 6], dtype="float32")
lw = np.random.random((6, 6)).astype("float32")
lr = fluid.data(name='lr', shape=[None, 6], dtype="float32")
delta = 1.0
self.assertRaises(TypeError, fluid.layers.huber_loss, xr, lw, delta)
self.assertRaises(TypeError, fluid.layers.huber_loss, xw, lr, delta)
# the dtype of input and label must be float32 or float64
xw2 = fluid.data(name='xw2', shape=[None, 6], dtype="int32")
lw2 = fluid.data(name='lw2', shape=[None, 6], dtype="int32")
self.assertRaises(
TypeError, fluid.layers.huber_loss, xw2, lr, delta
)
self.assertRaises(
TypeError, fluid.layers.huber_loss, xr, lw2, delta
)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
sys.path.append('..')
import unittest
import paddle
import numpy as np
from paddle import _C_ops, _legacy_C_ops
from paddle.fluid.framework import in_dygraph_mode
def run_adam_op(
params,
grads,
lrs,
moment1s,
moment2s,
beta1_pows,
beta2_pows,
master_params,
epsilon,
beta1,
beta2,
place,
multi_precision=False,
use_merged=False,
):
assert len(params) == len(grads)
assert len(params) == len(lrs)
assert len(params) == len(moment1s)
assert len(params) == len(moment2s)
assert len(params) == len(beta1_pows)
assert len(params) == len(beta2_pows)
assert len(params) == len(master_params)
paddle.disable_static()
# paddle.set_device(place)
param_vars = [paddle.fluid.dygraph.to_variable(p) for p in params]
grad_vars = [paddle.fluid.dygraph.to_variable(g) for g in grads]
lr_vars = [paddle.fluid.dygraph.to_variable(l) for l in lrs]
moment1_vars = [paddle.fluid.dygraph.to_variable(m) for m in moment1s]
moment2_vars = [paddle.fluid.dygraph.to_variable(m) for m in moment2s]
beta1_pow_vars = [paddle.fluid.dygraph.to_variable(b) for b in beta1_pows]
beta2_pow_vars = [paddle.fluid.dygraph.to_variable(b) for b in beta2_pows]
master_param_vars = [
paddle.fluid.dygraph.to_variable(m_p) for m_p in master_params
]
if not use_merged:
for i in range(len(param_vars)):
_, _, _, _, _, _ = _legacy_C_ops.adam(
param_vars[i],
grad_vars[i],
lr_vars[i],
moment1_vars[i],
moment2_vars[i],
beta1_pow_vars[i],
beta2_pow_vars[i],
master_param_vars[i],
param_vars[i],
moment1_vars[i],
moment2_vars[i],
beta1_pow_vars[i],
beta2_pow_vars[i],
master_param_vars[i],
'epsilon',
epsilon,
'beta1',
beta1,
'beta2',
beta2,
'multi_precision',
multi_precision,
)
else:
if in_dygraph_mode():
_, _, _, _, _, _ = _C_ops.merged_adam_(
param_vars,
grad_vars,
lr_vars,
moment1_vars,
moment2_vars,
beta1_pow_vars,
beta2_pow_vars,
master_param_vars,
beta1,
beta2,
epsilon,
multi_precision,
False,
)
else:
_, _, _, _, _, _ = _legacy_C_ops.merged_adam(
param_vars,
grad_vars,
lr_vars,
moment1_vars,
moment2_vars,
beta1_pow_vars,
beta2_pow_vars,
master_param_vars,
param_vars,
moment1_vars,
moment2_vars,
beta1_pow_vars,
beta2_pow_vars,
master_param_vars,
'epsilon',
epsilon,
'beta1',
beta1,
'beta2',
beta2,
'multi_precision',
multi_precision,
)
outputs = {
'ParamOut': param_vars,
'Moment1Out': moment1_vars,
'Moment2Out': moment2_vars,
'Beta1PowOut': beta1_pow_vars,
'Beta2PowOut': beta2_pow_vars,
'MasterParamOut': master_param_vars,
}
return outputs
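# For orientation: `adam` and `merged_adam` apply the same per-parameter
# update, merged_adam just handles all tensors in one call. A textbook
# bias-corrected Adam step in NumPy -- a sketch of the semantics, not a
# bit-exact reimplementation of the Paddle kernels:
def _adam_step(p, g, lr, m1, m2, beta1_pow, beta2_pow,
               beta1=0.9, beta2=0.99, epsilon=0.9):
    m1 = beta1 * m1 + (1.0 - beta1) * g      # first-moment estimate
    m2 = beta2 * m2 + (1.0 - beta2) * g * g  # second-moment estimate
    beta1_pow, beta2_pow = beta1_pow * beta1, beta2_pow * beta2
    lr_t = lr * np.sqrt(1.0 - beta2_pow) / (1.0 - beta1_pow)
    p = p - lr_t * m1 / (np.sqrt(m2) + epsilon)
    return p, m1, m2, beta1_pow, beta2_pow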
class TestMergedAdam(unittest.TestCase):
def setUp(self):
paddle.disable_static()
self.shapes = [[3, 4], [2, 7], [5, 6], [7, 8]]
self.seed = 10
self.place = paddle.device.MLUPlace(0)
self.__class__.use_mlu = True
def gen_rand_data(self, shapes, dtype):
return [np.random.random(s).astype(dtype) for s in shapes]
def prepare_data(self, shapes, multi_precision, seed, place):
np.random.seed(seed)
mp_dtype = np.float32
# dtype = np.float16 if multi_precision and place == 'mlu' else np.float32
dtype = np.float32
params = self.gen_rand_data(shapes, dtype)
grads = self.gen_rand_data(shapes, dtype)
lrs = self.gen_rand_data([[1], [1], [1], [1]], mp_dtype)
moment1s = self.gen_rand_data(shapes, mp_dtype)
moment2s = self.gen_rand_data(shapes, mp_dtype)
beta1_pows = self.gen_rand_data([[1], [1], [1], [1]], mp_dtype)
beta2_pows = self.gen_rand_data([[1], [1], [1], [1]], mp_dtype)
master_params = [p.astype(mp_dtype) for p in params]
return (
params,
grads,
lrs,
moment1s,
moment2s,
beta1_pows,
beta2_pows,
master_params,
)
def check_with_place(self, place, multi_precision):
(
params,
grads,
lrs,
moment1s,
moment2s,
beta1_pows,
beta2_pows,
master_params,
) = self.prepare_data(self.shapes, multi_precision, self.seed, place)
def run_op(use_merged):
return run_adam_op(
params=params,
grads=grads,
lrs=lrs,
moment1s=moment1s,
moment2s=moment2s,
beta1_pows=beta1_pows,
beta2_pows=beta2_pows,
master_params=master_params,
epsilon=0.9,
beta1=0.9,
beta2=0.99,
place=place,
multi_precision=multi_precision,
use_merged=use_merged,
)
outs1 = run_op(True)
outs2 = run_op(False)
self.assertEqual(len(outs1), len(outs2))
for key in outs1.keys():
value1 = outs1[key]
value2 = outs2[key]
for i in range(len(value1)):
if place == 'mlu':
np.testing.assert_array_equal(value1[i], value2[i])
else:
np.testing.assert_allclose(
value1[i], value2[i], rtol=1e-05, atol=1e-07
)
def test_main(self):
for multi_precision in [False, True]:
self.check_with_place(self.place, multi_precision)
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import sys
sys.path.append('..')
import numpy as np
from op_test import OpTest
import paddle.fluid as fluid
import paddle
import math
paddle.enable_static()
class TestMLUPriorBox(OpTest):
def setUp(self):
self.op_type = "prior_box"
self.set_mlu()
self.init_dtype()
self.set_data()
def test_check_output(self):
self.check_output_with_place(self.place)
def set_mlu(self):
self.__class__.use_mlu = True
self.place = paddle.MLUPlace(0)
def init_dtype(self):
self.dtype = np.float32
def set_data(self):
self.init_test_params()
self.init_test_input()
self.init_test_output()
self.inputs = {'Input': self.input, 'Image': self.image}
self.attrs = {
'min_sizes': self.min_sizes,
'aspect_ratios': self.aspect_ratios,
'variances': self.variances,
'flip': self.flip,
'clip': self.clip,
'min_max_aspect_ratios_order': self.min_max_aspect_ratios_order,
'step_w': self.step_w,
'step_h': self.step_h,
'offset': self.offset,
}
if len(self.max_sizes) > 0:
self.attrs['max_sizes'] = self.max_sizes
self.outputs = {'Boxes': self.out_boxes, 'Variances': self.out_var}
def set_max_sizes(self):
max_sizes = [5, 10]
self.max_sizes = np.array(max_sizes).astype('float32').tolist()
def set_min_max_aspect_ratios_order(self):
self.min_max_aspect_ratios_order = True
def init_test_params(self):
self.layer_w = 32
self.layer_h = 32
self.image_w = 40
self.image_h = 40
self.step_w = float(self.image_w) / float(self.layer_w)
self.step_h = float(self.image_h) / float(self.layer_h)
self.input_channels = 2
self.image_channels = 3
self.batch_size = 10
self.min_sizes = [2, 4]
self.min_sizes = np.array(self.min_sizes).astype('float32').tolist()
self.set_max_sizes()
self.aspect_ratios = [2.0, 3.0]
self.flip = True
self.set_min_max_aspect_ratios_order()
self.real_aspect_ratios = [1, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0]
self.aspect_ratios = np.array(
self.aspect_ratios, dtype=np.float64
).flatten()
self.variances = [0.1, 0.1, 0.2, 0.2]
self.variances = np.array(self.variances, dtype=np.float64).flatten()
self.clip = True
self.num_priors = len(self.real_aspect_ratios) * len(self.min_sizes)
if len(self.max_sizes) > 0:
self.num_priors += len(self.max_sizes)
self.offset = 0.5
def init_test_input(self):
self.image = np.random.random(
(self.batch_size, self.image_channels, self.image_w, self.image_h)
).astype('float32')
self.input = np.random.random(
(self.batch_size, self.input_channels, self.layer_w, self.layer_h)
).astype('float32')
def init_test_output(self):
out_dim = (self.layer_h, self.layer_w, self.num_priors, 4)
out_boxes = np.zeros(out_dim).astype('float32')
out_var = np.zeros(out_dim).astype('float32')
idx = 0
for h in range(self.layer_h):
for w in range(self.layer_w):
c_x = (w + self.offset) * self.step_w
c_y = (h + self.offset) * self.step_h
idx = 0
for s in range(len(self.min_sizes)):
min_size = self.min_sizes[s]
if not self.min_max_aspect_ratios_order:
# rest of priors
for r in range(len(self.real_aspect_ratios)):
ar = self.real_aspect_ratios[r]
c_w = min_size * math.sqrt(ar) / 2
c_h = (min_size / math.sqrt(ar)) / 2
out_boxes[h, w, idx, :] = [
(c_x - c_w) / self.image_w,
(c_y - c_h) / self.image_h,
(c_x + c_w) / self.image_w,
(c_y + c_h) / self.image_h,
]
idx += 1
if len(self.max_sizes) > 0:
max_size = self.max_sizes[s]
                            # second prior: aspect_ratio = 1, size = sqrt(min_size * max_size)
c_w = c_h = math.sqrt(min_size * max_size) / 2
out_boxes[h, w, idx, :] = [
(c_x - c_w) / self.image_w,
(c_y - c_h) / self.image_h,
(c_x + c_w) / self.image_w,
(c_y + c_h) / self.image_h,
]
idx += 1
else:
c_w = c_h = min_size / 2.0
out_boxes[h, w, idx, :] = [
(c_x - c_w) / self.image_w,
(c_y - c_h) / self.image_h,
(c_x + c_w) / self.image_w,
(c_y + c_h) / self.image_h,
]
idx += 1
if len(self.max_sizes) > 0:
max_size = self.max_sizes[s]
                            # second prior: aspect_ratio = 1, size = sqrt(min_size * max_size)
c_w = c_h = math.sqrt(min_size * max_size) / 2
out_boxes[h, w, idx, :] = [
(c_x - c_w) / self.image_w,
(c_y - c_h) / self.image_h,
(c_x + c_w) / self.image_w,
(c_y + c_h) / self.image_h,
]
idx += 1
# rest of priors
for r in range(len(self.real_aspect_ratios)):
ar = self.real_aspect_ratios[r]
if abs(ar - 1.0) < 1e-6:
continue
c_w = min_size * math.sqrt(ar) / 2
c_h = (min_size / math.sqrt(ar)) / 2
out_boxes[h, w, idx, :] = [
(c_x - c_w) / self.image_w,
(c_y - c_h) / self.image_h,
(c_x + c_w) / self.image_w,
(c_y + c_h) / self.image_h,
]
idx += 1
        # clip the prior's coordinates so that they are within [0, 1]
if self.clip:
out_boxes = np.clip(out_boxes, 0.0, 1.0)
# set the variance.
out_var = np.tile(
self.variances, (self.layer_h, self.layer_w, self.num_priors, 1)
)
self.out_boxes = out_boxes.astype('float32')
self.out_var = out_var.astype('float32')
class TestMLUPriorBoxWithoutMaxSize(TestMLUPriorBox):
def set_max_sizes(self):
self.max_sizes = []
class TestMLUPriorBoxWithoutSpecifiedOutOrder(TestMLUPriorBox):
def set_min_max_aspect_ratios_order(self):
self.min_max_aspect_ratios_order = False
if __name__ == '__main__':
unittest.main()
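# Editorial sketch (not part of the commit): how the loop above derives the
# first prior box of cell (h=0, w=0) under TestMLUPriorBox's parameters --
# a 32x32 feature map over a 40x40 image, offset 0.5, min_size 2, and
# min_max_aspect_ratios_order=True (so the first prior uses aspect_ratio 1).
def _prior_box_cell_sketch():
    image_w = image_h = 40.0
    step = 40.0 / 32.0                       # 1.25 image pixels per cell
    c_x = c_y = (0 + 0.5) * step             # cell center: 0.625
    c_w = c_h = 2.0 / 2.0                    # first prior: min_size / 2
    box = [(c_x - c_w) / image_w, (c_y - c_h) / image_h,
           (c_x + c_w) / image_w, (c_y + c_h) / image_h]
    box = [min(max(v, 0.0), 1.0) for v in box]   # clip=True clamps to [0, 1]
    assert box == [0.0, 0.0, 0.040625, 0.040625]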
......@@ -26,7 +26,6 @@ paddle.enable_static()
class TestMLUReduceSumOp(OpTest):
def setUp(self):
self.init_op_type()
self.initTestCase()
......@@ -34,16 +33,16 @@ class TestMLUReduceSumOp(OpTest):
self.attrs = {
'dim': self.axis,
'keep_dim': self.keep_dim,
'reduce_all': self.reduce_all
'reduce_all': self.reduce_all,
}
self.inputs = {'X': np.random.random(self.shape).astype("float32")}
if self.attrs['reduce_all']:
self.outputs = {'Out': self.inputs['X'].sum()}
else:
self.outputs = {
'Out':
self.inputs['X'].sum(axis=self.axis,
keepdims=self.attrs['keep_dim'])
'Out': self.inputs['X'].sum(
axis=self.axis, keepdims=self.attrs['keep_dim']
)
}
def set_mlu(self):
......@@ -64,100 +63,92 @@ class TestMLUReduceSumOp(OpTest):
def initTestCase(self):
self.shape = (5, 6, 10)
self.axis = (0, )
self.axis = (0,)
class TestSumOp5D(TestMLUReduceSumOp):
def initTestCase(self):
self.shape = (1, 2, 5, 6, 10)
self.axis = (0, )
self.axis = (0,)
class TestSumOp6D(TestMLUReduceSumOp):
def initTestCase(self):
self.shape = (1, 1, 2, 5, 6, 10)
self.axis = (0, )
self.axis = (0,)
class TestSumOp8D(TestMLUReduceSumOp):
def initTestCase(self):
self.shape = (1, 3, 1, 2, 1, 4, 3, 10)
self.axis = (0, 3)
class Test1DReduce(TestMLUReduceSumOp):
def initTestCase(self):
self.shape = 120
self.axis = (0, )
self.axis = (0,)
class Test2DReduce0(TestMLUReduceSumOp):
def initTestCase(self):
self.shape = (20, 10)
self.axis = (0, )
self.axis = (0,)
class Test2DReduce1(TestMLUReduceSumOp):
def initTestCase(self):
self.shape = (20, 10)
self.axis = (1, )
self.axis = (1,)
class Test3DReduce0(TestMLUReduceSumOp):
def initTestCase(self):
self.shape = (5, 6, 7)
self.axis = (1, )
self.axis = (1,)
class Test3DReduce1(TestMLUReduceSumOp):
def initTestCase(self):
self.shape = (5, 6, 7)
self.axis = (2, )
self.axis = (2,)
class Test3DReduce2(TestMLUReduceSumOp):
def initTestCase(self):
self.shape = (5, 6, 7)
self.axis = (-2, )
self.axis = (-2,)
class Test3DReduce3(TestMLUReduceSumOp):
def initTestCase(self):
self.shape = (5, 6, 7)
self.axis = (1, 2)
class TestKeepDimReduce(TestMLUReduceSumOp):
def initTestCase(self):
self.shape = (5, 6, 10)
self.axis = (1, )
self.axis = (1,)
self.keep_dim = True
class TestKeepDim8DReduce(TestMLUReduceSumOp):
def initTestCase(self):
self.shape = (2, 5, 3, 2, 2, 3, 4, 2)
self.axis = (3, 4, 5)
self.keep_dim = True
def test_check_grad(self):
self.check_grad_with_place(
self.place, ['X'], 'Out', max_relative_error=0.03
)
class TestReduceAll(TestMLUReduceSumOp):
def initTestCase(self):
self.shape = (5, 6, 2, 10)
self.axis = (0, )
self.axis = (0,)
self.reduce_all = True
......
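# Editorial sketch (not part of the commit): the numpy reference the reduce_sum
# cases above construct -- the 'dim' attr maps to numpy's axis, 'keep_dim' to
# keepdims, and reduce_all=True sums every element into a scalar.
def _reduce_sum_reference_sketch():
    import numpy as np
    x = np.random.random((5, 6, 10)).astype("float32")
    kept = x.sum(axis=(1,), keepdims=True)   # TestKeepDimReduce: (5, 1, 10)
    total = x.sum()                          # TestReduceAll: 0-d scalar
    assert kept.shape == (5, 1, 10) and total.shape == ()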
......@@ -31,7 +31,6 @@ paddle.enable_static()
# Situation 1: starts(list, no tensor), ends(list, no tensor)
# 1.1 without attr(decrease)
class TestSliceOp(OpTest):
def setUp(self):
self.op_type = "slice"
self.set_mlu()
......@@ -42,7 +41,7 @@ class TestSliceOp(OpTest):
'axes': self.axes,
'starts': self.starts,
'ends': self.ends,
'infer_flags': self.infer_flags
'infer_flags': self.infer_flags,
}
def config(self):
......@@ -57,9 +56,9 @@ class TestSliceOp(OpTest):
self.check_output_with_place(self.place)
def test_check_grad_normal(self):
self.check_grad_with_place(self.place, ['Input'],
'Out',
max_relative_error=0.006)
self.check_grad_with_place(
self.place, ['Input'], 'Out', max_relative_error=0.006
)
def set_mlu(self):
self.__class__.use_mlu = True
......@@ -67,7 +66,6 @@ class TestSliceOp(OpTest):
class TestCase1(TestSliceOp):
def config(self):
self.input = np.random.random([3, 4, 5, 6]).astype("float32")
self.starts = [-3, 0, 2]
......@@ -78,7 +76,6 @@ class TestCase1(TestSliceOp):
class TestCase2(TestSliceOp):
def config(self):
self.input = np.random.random([3, 4, 5, 6]).astype("float32")
self.starts = [-3, 0, 2]
......@@ -90,7 +87,6 @@ class TestCase2(TestSliceOp):
# 1.2 with attr(decrease)
class TestSliceOp_decs_dim(OpTest):
def setUp(self):
self.op_type = "slice"
self.set_mlu()
......@@ -118,9 +114,9 @@ class TestSliceOp_decs_dim(OpTest):
self.check_output_with_place(self.place)
def test_check_grad_normal(self):
self.check_grad_with_place(self.place, ['Input'],
'Out',
max_relative_error=0.006)
self.check_grad_with_place(
self.place, ['Input'], 'Out', max_relative_error=0.006
)
def set_mlu(self):
self.__class__.use_mlu = True
......@@ -128,7 +124,6 @@ class TestSliceOp_decs_dim(OpTest):
class TestSliceOp_decs_dim_2(TestSliceOp_decs_dim):
def config(self):
self.input = np.random.random([3, 4, 5, 6]).astype("float32")
self.starts = [1, 0, 2]
......@@ -140,7 +135,6 @@ class TestSliceOp_decs_dim_2(TestSliceOp_decs_dim):
class TestSliceOp_decs_dim_3(TestSliceOp_decs_dim):
def config(self):
self.input = np.random.random([3, 4, 5, 6]).astype("float32")
self.starts = [-1, 0, 2]
......@@ -152,7 +146,6 @@ class TestSliceOp_decs_dim_3(TestSliceOp_decs_dim):
class TestSliceOp_decs_dim_4(TestSliceOp_decs_dim):
def config(self):
self.input = np.random.random([3, 4, 5, 7]).astype("float32")
self.starts = [0, 1, 2, 3]
......@@ -164,7 +157,6 @@ class TestSliceOp_decs_dim_4(TestSliceOp_decs_dim):
class TestSliceOp_decs_dim_5(TestSliceOp_decs_dim):
def config(self):
self.input = np.random.random([3, 4, 5, 6]).astype("float32")
self.starts = [-1]
......@@ -176,7 +168,6 @@ class TestSliceOp_decs_dim_5(TestSliceOp_decs_dim):
class TestSliceOp_decs_dim_6(TestSliceOp_decs_dim):
def config(self):
self.input = np.random.random([3, 4, 5, 6]).astype("float32")
self.starts = [0, 1, 2, 3]
......@@ -190,7 +181,6 @@ class TestSliceOp_decs_dim_6(TestSliceOp_decs_dim):
# Situation 2: starts(list, have tensor), ends(list, no tensor)
# without attr(decrease)
class TestSliceOp_starts_ListTensor(OpTest):
def setUp(self):
self.op_type = "slice"
self.set_mlu()
......@@ -198,8 +188,9 @@ class TestSliceOp_starts_ListTensor(OpTest):
starts_tensor = []
for index, ele in enumerate(self.starts):
starts_tensor.append(("x" + str(index), np.ones(
(1)).astype('int64') * ele))
starts_tensor.append(
("x" + str(index), np.ones((1)).astype('int64') * ele)
)
self.inputs = {'Input': self.input, 'StartsTensorList': starts_tensor}
self.outputs = {'Out': self.out}
......@@ -207,7 +198,7 @@ class TestSliceOp_starts_ListTensor(OpTest):
'axes': self.axes,
'starts': self.starts_infer,
'ends': self.ends,
'infer_flags': self.infer_flags
'infer_flags': self.infer_flags,
}
def config(self):
......@@ -224,9 +215,9 @@ class TestSliceOp_starts_ListTensor(OpTest):
self.check_output_with_place(self.place)
def test_check_grad_normal(self):
self.check_grad_with_place(self.place, ['Input'],
'Out',
max_relative_error=0.006)
self.check_grad_with_place(
self.place, ['Input'], 'Out', max_relative_error=0.006
)
def set_mlu(self):
self.__class__.use_mlu = True
......@@ -236,7 +227,6 @@ class TestSliceOp_starts_ListTensor(OpTest):
# Situation 2: starts(list, have tensor), ends(list, no tensor)
# with attr(decrease)
class TestSliceOp_decs_dim_starts_ListTensor(OpTest):
def setUp(self):
self.op_type = "slice"
self.set_mlu()
......@@ -244,8 +234,9 @@ class TestSliceOp_decs_dim_starts_ListTensor(OpTest):
starts_tensor = []
for index, ele in enumerate(self.starts):
starts_tensor.append(("x" + str(index), np.ones(
(1)).astype('int32') * ele))
starts_tensor.append(
("x" + str(index), np.ones((1)).astype('int32') * ele)
)
self.inputs = {'Input': self.input, 'StartsTensorList': starts_tensor}
......@@ -273,9 +264,9 @@ class TestSliceOp_decs_dim_starts_ListTensor(OpTest):
self.check_output_with_place(self.place)
def test_check_grad_normal(self):
self.check_grad_with_place(self.place, ['Input'],
'Out',
max_relative_error=0.006)
self.check_grad_with_place(
self.place, ['Input'], 'Out', max_relative_error=0.006
)
def set_mlu(self):
self.__class__.use_mlu = True
......@@ -283,8 +274,8 @@ class TestSliceOp_decs_dim_starts_ListTensor(OpTest):
class TestSliceOp_decs_dim_5_starts_ListTensor(
TestSliceOp_decs_dim_starts_ListTensor):
TestSliceOp_decs_dim_starts_ListTensor
):
def config(self):
self.input = np.random.random([3, 4, 5, 6]).astype("float32")
self.starts = [-1]
......@@ -300,7 +291,6 @@ class TestSliceOp_decs_dim_5_starts_ListTensor(
# Situation 3: starts(tensor), ends(list, no tensor)
# with attr(decrease)
class TestSliceOp_decs_dim_starts_OneTensor(OpTest):
def setUp(self):
self.op_type = "slice"
self.__class__.use_mlu = True
......@@ -308,7 +298,7 @@ class TestSliceOp_decs_dim_starts_OneTensor(OpTest):
self.config()
self.inputs = {
'Input': self.input,
"StartsTensor": np.array(self.starts, dtype="int32")
"StartsTensor": np.array(self.starts, dtype="int32"),
}
self.outputs = {'Out': self.out}
self.attrs = {
......@@ -332,15 +322,14 @@ class TestSliceOp_decs_dim_starts_OneTensor(OpTest):
self.check_output_with_place(self.place)
def test_check_grad_normal(self):
self.check_grad_with_place(self.place, ['Input'],
'Out',
max_relative_error=0.006)
self.check_grad_with_place(
self.place, ['Input'], 'Out', max_relative_error=0.006
)
# Situation 4: starts(tensor), ends(tensor)
# without attr(decrease)
class TestSliceOp_starts_OneTensor_ends_OneTensor(OpTest):
def setUp(self):
self.op_type = "slice"
self.__class__.use_mlu = True
......@@ -350,14 +339,14 @@ class TestSliceOp_starts_OneTensor_ends_OneTensor(OpTest):
self.inputs = {
'Input': self.input,
"StartsTensor": np.array(self.starts, dtype="int64"),
"EndsTensor": np.array(self.ends, dtype="int32")
"EndsTensor": np.array(self.ends, dtype="int32"),
}
self.outputs = {'Out': self.out}
self.attrs = {
'axes': self.axes,
#'starts': self.starts,
#'ends': self.ends_infer,
'infer_flags': self.infer_flags
'infer_flags': self.infer_flags,
}
def config(self):
......@@ -372,15 +361,14 @@ class TestSliceOp_starts_OneTensor_ends_OneTensor(OpTest):
self.check_output_with_place(self.place)
def test_check_grad_normal(self):
self.check_grad_with_place(self.place, ['Input'],
'Out',
max_relative_error=0.006)
self.check_grad_with_place(
self.place, ['Input'], 'Out', max_relative_error=0.006
)
# Situation 5: starts(tensor), ends(tensor)
# with attr(decrease)
class TestSliceOp_decs_dim_starts_and_ends_OneTensor(OpTest):
def setUp(self):
self.op_type = "slice"
self.__class__.use_mlu = True
......@@ -389,7 +377,7 @@ class TestSliceOp_decs_dim_starts_and_ends_OneTensor(OpTest):
self.inputs = {
'Input': self.input,
"StartsTensor": np.array(self.starts, dtype="int32"),
"EndsTensor": np.array(self.ends, dtype="int32")
"EndsTensor": np.array(self.ends, dtype="int32"),
}
self.outputs = {'Out': self.out}
self.attrs = {
......@@ -413,15 +401,14 @@ class TestSliceOp_decs_dim_starts_and_ends_OneTensor(OpTest):
self.check_output_with_place(self.place)
def test_check_grad_normal(self):
self.check_grad_with_place(self.place, ['Input'],
'Out',
max_relative_error=0.006)
self.check_grad_with_place(
self.place, ['Input'], 'Out', max_relative_error=0.006
)
# Situation 6: starts(tensor), ends(list, have tensor)
# without attr(decrease)
class TestSliceOp_starts_OneTensor_ends_ListTensor(OpTest):
def setUp(self):
self.op_type = "slice"
self.__class__.use_mlu = True
......@@ -430,20 +417,21 @@ class TestSliceOp_starts_OneTensor_ends_ListTensor(OpTest):
ends_tensor = []
for index, ele in enumerate(self.ends):
ends_tensor.append(("y" + str(index), np.ones(
(1)).astype('int32') * ele))
ends_tensor.append(
("y" + str(index), np.ones((1)).astype('int32') * ele)
)
self.inputs = {
'Input': self.input,
"StartsTensor": np.array(self.starts, dtype="int32"),
'EndsTensorList': ends_tensor
'EndsTensorList': ends_tensor,
}
self.outputs = {'Out': self.out}
self.attrs = {
'axes': self.axes,
#'starts': self.starts,
'ends': self.ends_infer,
'infer_flags': self.infer_flags
'infer_flags': self.infer_flags,
}
def config(self):
......@@ -460,14 +448,13 @@ class TestSliceOp_starts_OneTensor_ends_ListTensor(OpTest):
self.check_output_with_place(self.place)
def test_check_grad_normal(self):
self.check_grad_with_place(self.place, ['Input'],
'Out',
max_relative_error=0.006)
self.check_grad_with_place(
self.place, ['Input'], 'Out', max_relative_error=0.006
)
# Test float16
class TestFP16(OpTest):
def setUp(self):
self.op_type = "slice"
self.__class__.use_mlu = True
......@@ -479,7 +466,7 @@ class TestFP16(OpTest):
'axes': self.axes,
'starts': self.starts,
'ends': self.ends,
'infer_flags': self.infer_flags
'infer_flags': self.infer_flags,
}
def config(self):
......@@ -495,13 +482,12 @@ class TestFP16(OpTest):
self.check_output_with_place(self.place, atol=1e-5)
def test_check_grad_normal(self):
self.check_grad_with_place(self.place, ['Input'],
'Out',
max_relative_error=0.006)
self.check_grad_with_place(
self.place, ['Input'], 'Out', max_relative_error=0.006
)
class TestFP16_2(OpTest):
def setUp(self):
self.op_type = "slice"
self.__class__.use_mlu = True
......@@ -513,7 +499,7 @@ class TestFP16_2(OpTest):
'axes': self.axes,
'starts': self.starts,
'ends': self.ends,
'infer_flags': self.infer_flags
'infer_flags': self.infer_flags,
}
def config(self):
......@@ -529,24 +515,28 @@ class TestFP16_2(OpTest):
self.check_output_with_place(self.place, atol=1e-5)
def test_check_grad_normal(self):
self.check_grad_with_place(self.place, ['Input'],
'Out',
max_relative_error=0.006,
numeric_grad_delta=0.5)
self.check_grad_with_place(
self.place,
['Input'],
'Out',
max_relative_error=0.006,
numeric_grad_delta=0.5,
)
class TestSliceApiWithTensor(unittest.TestCase):
def test_starts_ends_is_tensor(self):
with paddle.fluid.dygraph.guard():
a = paddle.rand(shape=[4, 5, 6], dtype='float32')
axes = [0, 1, 2]
starts = [-3, 0, 2]
ends = [3, 2, 4]
a_1 = paddle.slice(a,
axes=axes,
starts=paddle.to_tensor(starts, dtype='int32'),
ends=paddle.to_tensor(ends, dtype='int32'))
a_1 = paddle.slice(
a,
axes=axes,
starts=paddle.to_tensor(starts, dtype='int32'),
ends=paddle.to_tensor(ends, dtype='int32'),
)
a_2 = paddle.slice(a, axes=axes, starts=starts, ends=ends)
np.testing.assert_allclose(a_1.numpy(), a_2.numpy())
......@@ -569,24 +559,22 @@ class TestSliceApiWithTensor(unittest.TestCase):
class TestImperativeVarBaseGetItem(unittest.TestCase):
def test_getitem_with_long(self):
with fluid.dygraph.guard():
data = np.random.random((2, 80, 16128)).astype('float32')
var = fluid.dygraph.to_variable(data)
sliced = var[:, 10:, :var.shape[1]] # var.shape[1] is 80L here
sliced = var[:, 10:, : var.shape[1]] # var.shape[1] is 80L here
self.assertEqual(sliced.shape, [2, 70, 80])
sliced = var[:, var.shape[0]:, var.shape[0]:var.shape[1]]
sliced = var[:, var.shape[0] :, var.shape[0] : var.shape[1]]
self.assertEqual(sliced.shape, [2, 78, 78])
def test_getitem_with_float(self):
def test_float_in_slice_item():
with fluid.dygraph.guard():
data = np.random.random((2, 80, 16128)).astype('float32')
var = fluid.dygraph.to_variable(data)
sliced = var[:, 1.1:, :var.shape[1]]
sliced = var[:, 1.1:, : var.shape[1]]
self.assertRaises(Exception, test_float_in_slice_item)
......@@ -600,15 +588,6 @@ class TestImperativeVarBaseGetItem(unittest.TestCase):
class TestInferShape(unittest.TestCase):
def test(self):
x = paddle.ones(shape=[3, 4, 5])
x.desc.set_shape([3, -1, 5])
self.assertEqual(x.shape, (3, -1, 5))
out0 = paddle.slice(x, axes=[1], starts=[0], ends=[3])
self.assertEqual(out0.shape, (3, 3, 5))
def test_axis_less_than_zero(self):
# Using paddle.disable_static will make other unittests fail.
......@@ -616,13 +595,18 @@ class TestInferShape(unittest.TestCase):
x_arr = np.arange(0, 24, dtype=np.float32).reshape([2, 3, 4])
x = paddle.to_tensor(x_arr)
pp_slice = paddle.slice(x, [
100,
], [0], [1])
pp_slice = paddle.slice(
x,
[
100,
],
[0],
[1],
)
np_slice = x_arr[:, :, 0:1]
np.testing.assert_allclose(pp_slice, np_slice)
pp_slice = paddle.slice(x, (-100, ), [0], [1])
pp_slice = paddle.slice(x, (-100,), [0], [1])
np_slice = x_arr[0:1]
np.testing.assert_allclose(pp_slice, np_slice)
......@@ -630,9 +614,11 @@ class TestInferShape(unittest.TestCase):
x = paddle.to_tensor(np.reshape(x_arr, (0, 0, 0)))
starts = paddle.to_tensor(
np.reshape(np.array([], dtype=np.int32), (0, )))
np.reshape(np.array([], dtype=np.int32), (0,))
)
ends = paddle.to_tensor(
np.reshape(np.array([], dtype=np.int32), (0, )))
np.reshape(np.array([], dtype=np.int32), (0,))
)
with self.assertRaises(ValueError):
paddle.slice(x, [-1000000], starts, ends)
......
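# Editorial sketch (not part of the commit): a numpy analogue of the
# "with attr(decrease)" cases above. The concrete starts/ends/axes below are
# illustrative, not the hidden configs: after slicing, every axis listed in
# decrease_axis has extent 1 and is squeezed out of the output.
def _decrease_axis_sketch():
    import numpy as np
    x = np.random.random((3, 4, 5, 6)).astype("float32")
    out = x[1:2, 0:3, 2:4]         # axes=[0, 1, 2], starts=[1, 0, 2], ends=[2, 3, 4]
    out = out.squeeze(axis=0)      # decrease_axis=[0] drops the length-1 dim
    assert out.shape == (3, 2, 6)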
......@@ -30,7 +30,6 @@ SEED = 2021
class TestSoftmaxWithCrossEntropyOp(OpTest):
def set_mlu(self):
self.__class__.use_mlu = True
......@@ -53,8 +52,10 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
self.initParams()
logits = getattr(
self, "logits",
np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype))
self,
"logits",
np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype),
)
softmax = np.apply_along_axis(stable_softmax, self.axis, logits)
if self.soft_label:
......@@ -65,8 +66,9 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
self.shape[self.axis] = 1
labels = np.random.randint(0, axis_dim, self.shape, dtype="int64")
loss = cross_entropy(softmax, labels, self.soft_label, self.axis,
self.ignore_index)
loss = cross_entropy(
softmax, labels, self.soft_label, self.axis, self.ignore_index
)
one_hot_label = np.eye(axis_dim)[labels.reshape(-1)]
......@@ -74,7 +76,7 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
self.outputs = {
"Backprop": (softmax - one_hot_label).astype(self.dtype),
"Softmax": softmax.astype(self.dtype),
"Loss": loss.astype(self.dtype)
"Loss": loss.astype(self.dtype),
}
self.attrs = {
"numeric_stable_mode": self.numeric_stable_mode,
......@@ -92,14 +94,16 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
if self.dtype == np.float16:
return
# fp32 has low precision, cpu and mlu both need to relax the max_relative_error if using fp32
self.check_grad_with_place(self.place, ['Logits'],
'Loss',
numeric_grad_delta=0.001,
max_relative_error=0.5)
self.check_grad_with_place(
self.place,
['Logits'],
'Loss',
numeric_grad_delta=0.001,
max_relative_error=0.5,
)
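# Editorial sketch (not part of the commit): stable_softmax is imported from
# the shared test utilities; the conventional max-shifted form is sketched
# below, though the exact upstream implementation may differ slightly.
def _stable_softmax_sketch(x):
    import numpy as np
    shifted = x - np.max(x)        # shifting by the max keeps exp() finite
    exps = np.exp(shifted)
    return exps / np.sum(exps)     # applied per 1-d slice via apply_along_axis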
class TestPowNet(unittest.TestCase):
def _test(self, run_mlu=True):
main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
......@@ -114,9 +118,9 @@ class TestPowNet(unittest.TestCase):
with paddle.static.program_guard(main_prog, startup_prog):
a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
label = paddle.static.data(name="label",
shape=[32, 1],
dtype='int64')
label = paddle.static.data(
name="label", shape=[32, 1], dtype='int64'
)
sum = paddle.add(a, b)
z = paddle.pow(sum, 2.0)
......@@ -140,16 +144,17 @@ class TestPowNet(unittest.TestCase):
print("Start run on {}".format(place))
for epoch in range(100):
pred_res, loss_res = exe.run(main_prog,
feed={
"a": a_np,
"b": b_np,
"label": label_np
},
fetch_list=[prediction, loss])
pred_res, loss_res = exe.run(
main_prog,
feed={"a": a_np, "b": b_np, "label": label_np},
fetch_list=[prediction, loss],
)
if epoch % 10 == 0:
print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
epoch, pred_res[0], loss_res))
print(
"Epoch {} | Prediction[0]: {}, Loss: {}".format(
epoch, pred_res[0], loss_res
)
)
return pred_res, loss_res
......@@ -157,7 +162,7 @@ class TestPowNet(unittest.TestCase):
cpu_pred, cpu_loss = self._test(False)
mlu_pred, mlu_loss = self._test(True)
np.testing.assert_allclose(mlu_pred, cpu_pred, rtol=1e-5)
np.testing.assert_allclose(mlu_pred, cpu_pred, rtol=2e-5)
np.testing.assert_allclose(mlu_loss, cpu_loss)
......
......@@ -44,17 +44,19 @@ SEED = 10
class TestSyncBatchNormRunnerBase(object):
def get_model(self,
main,
startup,
place,
layout,
seed,
sync_bn=False,
only_forward=False):
def get_model(
self,
main,
startup,
place,
layout,
seed,
sync_bn=False,
only_forward=False,
):
raise NotImplementedError(
"get model should be implemented by child class.")
"get model should be implemented by child class."
)
def wait_server_ready(self, endpoints):
assert not isinstance(endpoints, string_types)
......@@ -63,13 +65,15 @@ class TestSyncBatchNormRunnerBase(object):
not_ready_endpoints = []
for ep in endpoints:
ip_port = ep.split(":")
with closing(socket.socket(socket.AF_INET,
socket.SOCK_STREAM)) as sock:
with closing(
socket.socket(socket.AF_INET, socket.SOCK_STREAM)
) as sock:
sock.settimeout(2)
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
if hasattr(socket, 'SO_REUSEPORT'):
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT,
1)
sock.setsockopt(
socket.SOL_SOCKET, socket.SO_REUSEPORT, 1
)
result = sock.connect_ex((ip_port[0], int(ip_port[1])))
if result != 0:
......@@ -77,39 +81,47 @@ class TestSyncBatchNormRunnerBase(object):
not_ready_endpoints.append(ep)
if not all_ok:
sys.stderr.write("server not ready, wait 3 sec to retry...\n")
sys.stderr.write("not ready endpoints:" +
str(not_ready_endpoints) + "\n")
sys.stderr.write(
"not ready endpoints:" + str(not_ready_endpoints) + "\n"
)
sys.stderr.flush()
time.sleep(3)
else:
break
def initCommunicator(self, program, rank, nranks, wait_port,
current_endpoint, endpoints):
def initCommunicator(
self, program, rank, nranks, wait_port, current_endpoint, endpoints
):
other_endpoints = endpoints[:]
other_endpoints.remove(current_endpoint)
if rank == 0 and wait_port:
self.wait_server_ready(other_endpoints)
block = program.global_block()
cncl_id_var = block.create_var(name=nameGen.generate('cncl_id'),
persistable=True,
type=core.VarDesc.VarType.RAW)
block.append_op(type='c_gen_cncl_id',
inputs={},
outputs={'Out': cncl_id_var},
attrs={
'rank': rank,
'endpoint': current_endpoint,
'other_endpoints': other_endpoints
})
block.append_op(type='c_comm_init',
inputs={'X': cncl_id_var},
outputs={},
attrs={
'nranks': nranks,
'rank': rank,
'ring_id': self.global_ring_id
})
cncl_id_var = block.create_var(
name=nameGen.generate('cncl_id'),
persistable=True,
type=core.VarDesc.VarType.RAW,
)
block.append_op(
type='c_gen_cncl_id',
inputs={},
outputs={'Out': cncl_id_var},
attrs={
'rank': rank,
'endpoint': current_endpoint,
'other_endpoints': other_endpoints,
},
)
block.append_op(
type='c_comm_init',
inputs={'X': cncl_id_var},
outputs={},
attrs={
'nranks': nranks,
'rank': rank,
'ring_id': self.global_ring_id,
},
)
def run_trainer(self, args):
device_id = int(os.getenv("FLAGS_selected_mlus", "0"))
......@@ -127,8 +139,8 @@ class TestSyncBatchNormRunnerBase(object):
self._compare(args, place, layout, True)
# Test FP16 - @TODO
self.dtype = np.float16
self.atol = 1e-2
self.bn_dtype = np.float16
self.atol = 3e-3
# Test training
for place in places:
......@@ -142,24 +154,30 @@ class TestSyncBatchNormRunnerBase(object):
sys.stdout.buffer.write(
pickle.dumps(
'training, inference, fp32, fp16, NCHW, NHWC all passed'))
'training, inference, fp32, fp16, NCHW, NHWC all passed'
)
)
def _compare(self, args, place, layout, only_forward):
scope = core.Scope()
np.random.seed(SEED)
data = np.random.random(size=self.dshape).astype(self.dtype) * 4. - 2
data = np.random.random(size=self.dshape).astype(self.dtype) * 4.0 - 2
sys.stderr.write("data: " + str(data) + "\n")
data = create_or_get_tensor(scope, "input",
OpTest.np_dtype_to_fluid_dtype(data), place)
data = create_or_get_tensor(
scope, "input", OpTest.np_dtype_to_fluid_dtype(data), place
)
bn_fetches = self._cal_single_card(args, data, place, layout,
only_forward)
bn_fetches = self._cal_single_card(
args, data, place, layout, only_forward
)
fetch_names, sync_bn_fetches = self._cal_multiple_cards(
args, data, place, layout, only_forward)
args, data, place, layout, only_forward
)
sys.stderr.write("len(sync_bn_fetches): " + str(len(sync_bn_fetches)) +
"\n")
sys.stderr.write(
"len(sync_bn_fetches): " + str(len(sync_bn_fetches)) + "\n"
)
for i in six.moves.xrange(0, len(sync_bn_fetches)):
sys.stderr.write("i: " + str(i) + "\n")
sys.stderr.write("fetch_names[i]): " + fetch_names[i] + "\n")
......@@ -167,13 +185,14 @@ class TestSyncBatchNormRunnerBase(object):
bn_val = bn_fetches[i]
sync_bn_val = sync_bn_fetches[i]
if sync_bn_val.shape != bn_val.shape:
sync_bn_val = sync_bn_val[:bn_val.shape[0]]
sync_bn_val = sync_bn_val[: bn_val.shape[0]]
# i = 0
if fetch_names[i] == 'reduce_sum_0.tmp_0':
# sys.stderr.write("skip reduce_sum_0.tmp_0 (Out of reduce_sum op)" + "\n")
sys.stderr.write("reduce_sum_0.tmp_0 (Out of reduce_sum op)" +
"\n")
sys.stderr.write(
"reduce_sum_0.tmp_0 (Out of reduce_sum op)" + "\n"
)
sys.stderr.write("bn_val: " + str(bn_val) + "\n")
sys.stderr.write("sync_bn_val: " + str(sync_bn_val) + "\n")
......@@ -201,7 +220,8 @@ class TestSyncBatchNormRunnerBase(object):
if fetch_names[i] == 'batch_norm_0.tmp_2':
# sys.stderr.write("skip batch_norm_0.tmp_2 (ReserveSpace of batch_norm)" + "\n")
sys.stderr.write(
"batch_norm_0.tmp_2 (ReserveSpace of batch_norm)" + "\n")
"batch_norm_0.tmp_2 (ReserveSpace of batch_norm)" + "\n"
)
sys.stderr.write("bn_val: " + str(bn_val) + "\n")
sys.stderr.write("sync_bn_val: " + str(sync_bn_val) + "\n")
......@@ -234,8 +254,9 @@ class TestSyncBatchNormRunnerBase(object):
# i = 8
if fetch_names[i] == 'batch_norm_0.tmp_1':
sys.stderr.write("skip batch_norm_0.tmp_1 (SavedVariance)" +
"\n")
sys.stderr.write(
"skip batch_norm_0.tmp_1 (SavedVariance)" + "\n"
)
sys.stderr.write("bn_val: " + str(bn_val) + "\n")
sys.stderr.write("sync_bn_val: " + str(sync_bn_val) + "\n")
......@@ -281,10 +302,16 @@ class TestSyncBatchNormRunnerBase(object):
if fetch_names[i] == 'conv2d_0.tmp_0@GRAD':
atol = 1e-2
assert np.allclose(
bn_val, sync_bn_val, atol=atol), "Output (" + fetch_names[
i] + ") has diff. \n" + "\nBN " + str(
bn_val) + "\n" + "Sync BN " + str(sync_bn_val)
assert np.allclose(bn_val, sync_bn_val, atol=atol), (
"Output ("
+ fetch_names[i]
+ ") has diff. \n"
+ "\nBN "
+ str(bn_val)
+ "\n"
+ "Sync BN "
+ str(sync_bn_val)
)
def _cal_single_card(self, args, data, place, layout, only_forward):
# Single-MLU, N = 32 per MLU
......@@ -294,23 +321,31 @@ class TestSyncBatchNormRunnerBase(object):
startup_prog.global_seed(SEED)
paddle.seed(SEED)
outs = self.get_model(train_prog, startup_prog, place, layout, SEED,
False, only_forward)
outs = self.get_model(
train_prog, startup_prog, place, layout, SEED, False, only_forward
)
exe = fluid.Executor(place)
exe.run(startup_prog)
fetch_names = [v.name for v in outs] + [
'bn_moving_mean', 'bn_moving_variance', 'bn_scale', 'bn_bias'
'bn_moving_mean',
'bn_moving_variance',
'bn_scale',
'bn_bias',
]
if not only_forward:
others = [
'batch_norm_0.tmp_0', 'batch_norm_0.tmp_1', 'bn_scale@GRAD',
'bn_bias@GRAD', 'batch_norm_0.tmp_3@GRAD', 'conv2d_0.tmp_0@GRAD'
'batch_norm_0.tmp_0',
'batch_norm_0.tmp_1',
'bn_scale@GRAD',
'bn_bias@GRAD',
'batch_norm_0.tmp_3@GRAD',
'conv2d_0.tmp_0@GRAD',
]
fetch_names += others
bn_fetches = exe.run(program=train_prog,
feed={'input': data},
fetch_list=fetch_names)
bn_fetches = exe.run(
program=train_prog, feed={'input': data}, fetch_list=fetch_names
)
return bn_fetches
......@@ -331,8 +366,9 @@ class TestSyncBatchNormRunnerBase(object):
current_endpoint = args["currentendpoint"]
nranks = 2
self.initCommunicator(startup_prog, rank, nranks, True,
current_endpoint, endpoints)
self.initCommunicator(
startup_prog, rank, nranks, True, current_endpoint, endpoints
)
# sys.stderr.write("after init, startup_prog: " +
# startup_prog.to_string(True) + "\n")
train_prog.global_seed(SEED)
......@@ -342,8 +378,9 @@ class TestSyncBatchNormRunnerBase(object):
paddle.seed(SEED)
self.rank = rank
outs = self.get_model(train_prog, startup_prog, place, layout, SEED,
True, only_forward)
outs = self.get_model(
train_prog, startup_prog, place, layout, SEED, True, only_forward
)
# sys.stderr.write("after get_model, train_prog: " +
# train_prog.to_string(True) + "\n")
# sys.stderr.write("after get_model, startup_prog: " +
......@@ -366,17 +403,24 @@ class TestSyncBatchNormRunnerBase(object):
exe = fluid.Executor(place)
exe.run(startup_prog)
fetch_names = [v.name for v in outs] + [
'bn_moving_mean', 'bn_moving_variance', 'bn_scale', 'bn_bias'
'bn_moving_mean',
'bn_moving_variance',
'bn_scale',
'bn_bias',
]
if not only_forward:
others = [
'batch_norm_0.tmp_0', 'batch_norm_0.tmp_1', 'bn_scale@GRAD',
'bn_bias@GRAD', 'batch_norm_0.tmp_3@GRAD', 'conv2d_0.tmp_0@GRAD'
'batch_norm_0.tmp_0',
'batch_norm_0.tmp_1',
'bn_scale@GRAD',
'bn_bias@GRAD',
'batch_norm_0.tmp_3@GRAD',
'conv2d_0.tmp_0@GRAD',
]
fetch_names += others
sync_bn_fetches = exe.run(program=train_prog,
feed={'input': data},
fetch_list=fetch_names)
sync_bn_fetches = exe.run(
program=train_prog, feed={'input': data}, fetch_list=fetch_names
)
return fetch_names, sync_bn_fetches
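# Editorial sketch (not part of the commit): the invariant this comparison
# leans on -- per-card batch statistics pooled across cards reproduce the
# single-card full-batch statistics, which is why SyncBatchNorm on two MLUs
# should match plain BatchNorm on one.
def _pooled_stats_sketch():
    import numpy as np
    full = np.random.random((32, 16)).astype("float64")
    half0, half1 = full[:16], full[16:]
    pooled_mean = (half0.mean(axis=0) + half1.mean(axis=0)) / 2
    pooled_sq = (np.mean(half0**2, axis=0) + np.mean(half1**2, axis=0)) / 2
    pooled_var = pooled_sq - pooled_mean**2   # E[x^2] - E[x]^2
    np.testing.assert_allclose(pooled_mean, full.mean(axis=0), rtol=1e-12)
    np.testing.assert_allclose(pooled_var, full.var(axis=0), rtol=1e-9)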
......@@ -399,19 +443,20 @@ from contextlib import closing
class TestDistBase(unittest.TestCase):
def setUp(self):
self._port_set = set()
self._trainers = 2
self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
self._find_free_port(), self._find_free_port())
self._find_free_port(),
self._find_free_port(),
)
self._python_interp = sys.executable
def _find_free_port(self):
def __free_port():
with closing(socket.socket(socket.AF_INET,
socket.SOCK_STREAM)) as s:
with closing(
socket.socket(socket.AF_INET, socket.SOCK_STREAM)
) as s:
s.bind(('', 0))
return s.getsockname()[1]
......@@ -440,7 +485,7 @@ class TestDistBase(unittest.TestCase):
"PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints,
"PADDLE_CURRENT_ENDPOINT": w1_ep,
}
#update environment
# update environment
env0.update(envs)
env1.update(envs)
......@@ -451,15 +496,19 @@ class TestDistBase(unittest.TestCase):
tr1_pipe = open("/tmp/tr1_err_%d.log" % os.getpid(), "w")
print("tr0_cmd: {}, env: {}\n".format(tr0_cmd, env0))
print("tr1_cmd: {}, env: {}\n".format(tr1_cmd, env1))
tr0_proc = subprocess.Popen(tr0_cmd.strip().split(),
stdout=subprocess.PIPE,
stderr=tr0_pipe,
env=env0)
tr1_proc = subprocess.Popen(tr0_cmd.strip().split(),
stdout=subprocess.PIPE,
stderr=tr1_pipe,
env=env1)
tr0_proc = subprocess.Popen(
tr0_cmd.strip().split(),
stdout=subprocess.PIPE,
stderr=tr0_pipe,
env=env0,
)
        tr1_proc = subprocess.Popen(
            tr1_cmd.strip().split(),  # launch trainer 1 with its own command, not tr0's
            stdout=subprocess.PIPE,
            stderr=tr1_pipe,
            env=env1,
        )
tr0_out, tr0_err = tr0_proc.communicate()
tr1_out, tr1_err = tr1_proc.communicate()
......@@ -473,14 +522,16 @@ class TestDistBase(unittest.TestCase):
sys.stderr.write('trainer 0 stderr file: %s\n' % f.read())
with open("/tmp/tr1_err_%d.log" % os.getpid(), "r") as f:
sys.stderr.write('trainer 1 stderr file: %s\n' % f.read())
return pickle.loads(tr0_out), pickle.loads(
tr1_out), tr0_proc.pid, tr1_proc.pid
def check_with_place(self,
model_file,
col_type,
check_error_log=False,
need_envs={}):
return (
pickle.loads(tr0_out),
pickle.loads(tr1_out),
tr0_proc.pid,
tr1_proc.pid,
)
def check_with_place(
self, model_file, col_type, check_error_log=False, need_envs={}
):
required_envs = {
"FLAGS_fraction_of_gpu_memory_to_use": "0.15",
"FLAGS_eager_delete_tensor_gb": "0.0",
......@@ -491,7 +542,7 @@ class TestDistBase(unittest.TestCase):
"FLAGS_call_stack_level": "2",
"GLOG_v": "3",
"PADDLE_WITH_GLOO": '0',
"BACKEND": "cncl"
"BACKEND": "cncl",
}
required_envs.update(need_envs)
if check_error_log:
......@@ -499,8 +550,11 @@ class TestDistBase(unittest.TestCase):
required_envs["GLOG_logtostderr"] = "1"
required_envs["GLOO_LOG_LEVEL"] = "TRACE"
tr0_out, tr1_out, pid0, pid1 = self._run_cluster(
model_file, required_envs)
model_file, required_envs
)
self.assertEqual(
tr0_out, 'training, inference, fp32, fp16, NCHW, NHWC all passed')
tr0_out, 'training, inference, fp32, fp16, NCHW, NHWC all passed'
)
self.assertEqual(
tr1_out, 'training, inference, fp32, fp16, NCHW, NHWC all passed')
tr1_out, 'training, inference, fp32, fp16, NCHW, NHWC all passed'
)
......@@ -29,14 +29,17 @@ paddle.enable_static()
class TestSyncBatchNormOp(TestDistBase):
def _setup_config(self):
pass
def test_identity(self, col_type="identity"):
self.check_with_place("sync_batch_norm_op_mlu.py",
col_type,
check_error_log=True)
envs = {"CNCL_MEM_POOL_MULTI_CLIQUE_ENABLE": "1"}
self.check_with_place(
"sync_batch_norm_op_mlu.py",
col_type,
check_error_log=True,
need_envs=envs,
)
if __name__ == '__main__':
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
import sys
sys.path.append("..")
import unittest
import numpy as np
from op_test import OpTest
import paddle
from paddle.fluid import core
import paddle.fluid as fluid
from paddle.fluid.op import Operator
from paddle.fluid.executor import Executor
from paddle.fluid.framework import _test_eager_guard
paddle.enable_static()
def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))
def YoloBox(x, img_size, attrs):
(n, c, h, w) = x.shape
anchors = attrs['anchors']
an_num = int((len(anchors) // 2))
class_num = attrs['class_num']
conf_thresh = attrs['conf_thresh']
downsample = attrs['downsample_ratio']
clip_bbox = attrs['clip_bbox']
scale_x_y = attrs['scale_x_y']
iou_aware = attrs['iou_aware']
iou_aware_factor = attrs['iou_aware_factor']
bias_x_y = (-0.5) * (scale_x_y - 1.0)
input_h = downsample * h
input_w = downsample * w
if iou_aware:
ioup = x[:, :an_num, :, :]
ioup = np.expand_dims(ioup, axis=(-1))
x = x[:, an_num:, :, :]
x = x.reshape((n, an_num, (5 + class_num), h, w)).transpose((0, 1, 3, 4, 2))
pred_box = x[:, :, :, :, :4].copy()
grid_x = np.tile(np.arange(w).reshape((1, w)), (h, 1))
grid_y = np.tile(np.arange(h).reshape((h, 1)), (1, w))
pred_box[:, :, :, :, 0] = (
(grid_x + (sigmoid(pred_box[:, :, :, :, 0]) * scale_x_y)) + bias_x_y
) / w
pred_box[:, :, :, :, 1] = (
(grid_y + (sigmoid(pred_box[:, :, :, :, 1]) * scale_x_y)) + bias_x_y
) / h
anchors = [
(anchors[i], anchors[(i + 1)]) for i in range(0, len(anchors), 2)
]
anchors_s = np.array(
[((an_w / input_w), (an_h / input_h)) for (an_w, an_h) in anchors]
)
anchor_w = anchors_s[:, 0:1].reshape((1, an_num, 1, 1))
anchor_h = anchors_s[:, 1:2].reshape((1, an_num, 1, 1))
pred_box[:, :, :, :, 2] = np.exp(pred_box[:, :, :, :, 2]) * anchor_w
pred_box[:, :, :, :, 3] = np.exp(pred_box[:, :, :, :, 3]) * anchor_h
if iou_aware:
pred_conf = (sigmoid(x[:, :, :, :, 4:5]) ** (1 - iou_aware_factor)) * (
sigmoid(ioup) ** iou_aware_factor
)
else:
pred_conf = sigmoid(x[:, :, :, :, 4:5])
pred_conf[(pred_conf < conf_thresh)] = 0.0
pred_score = sigmoid(x[:, :, :, :, 5:]) * pred_conf
pred_box = pred_box * (pred_conf > 0.0).astype('float32')
pred_box = pred_box.reshape((n, (-1), 4))
(pred_box[:, :, :2], pred_box[:, :, 2:4]) = (
(pred_box[:, :, :2] - (pred_box[:, :, 2:4] / 2.0)),
(pred_box[:, :, :2] + (pred_box[:, :, 2:4] / 2.0)),
)
pred_box[:, :, 0] = pred_box[:, :, 0] * img_size[:, 1][:, np.newaxis]
pred_box[:, :, 1] = pred_box[:, :, 1] * img_size[:, 0][:, np.newaxis]
pred_box[:, :, 2] = pred_box[:, :, 2] * img_size[:, 1][:, np.newaxis]
pred_box[:, :, 3] = pred_box[:, :, 3] * img_size[:, 0][:, np.newaxis]
if clip_bbox:
for i in range(len(pred_box)):
pred_box[i, :, 0] = np.clip(pred_box[i, :, 0], 0, np.inf)
pred_box[i, :, 1] = np.clip(pred_box[i, :, 1], 0, np.inf)
pred_box[i, :, 2] = np.clip(
pred_box[i, :, 2], (-np.inf), (img_size[(i, 1)] - 1)
)
pred_box[i, :, 3] = np.clip(
pred_box[i, :, 3], (-np.inf), (img_size[(i, 0)] - 1)
)
return (pred_box, pred_score.reshape((n, (-1), class_num)))
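# Editorial sketch (not part of the commit): exercising the reference YoloBox
# above with toy shapes mirroring TestYoloBoxDygraph -- 2 anchors and 2
# classes on an 8x8 grid, so C = an_num * (5 + class_num) = 14 and each image
# yields an_num * 8 * 8 = 128 boxes.
def _yolo_box_shape_sketch():
    attrs = {
        'anchors': [10, 13, 16, 30], 'class_num': 2, 'conf_thresh': 0.01,
        'downsample_ratio': 8, 'clip_bbox': True, 'scale_x_y': 1.0,
        'iou_aware': False, 'iou_aware_factor': 0.5,
    }
    x = np.random.random((2, 14, 8, 8)).astype('float32')
    img_size = np.ones((2, 2)).astype('int32')
    boxes, scores = YoloBox(x, img_size, attrs)
    assert boxes.shape == (2, 128, 4) and scores.shape == (2, 128, 2)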
class TestYoloBoxOp(OpTest):
def setUp(self):
self.initTestCase()
self.op_type = 'yolo_box'
self.place = paddle.device.MLUPlace(0)
self.__class__.use_mlu = True
self.__class__.no_need_check_grad = True
self.python_api = paddle.vision.ops.yolo_box
x = np.random.random(self.x_shape).astype('float32')
img_size = np.random.randint(10, 20, self.imgsize_shape).astype('int32')
self.attrs = {
'anchors': self.anchors,
'class_num': self.class_num,
'conf_thresh': self.conf_thresh,
'downsample_ratio': self.downsample,
'clip_bbox': self.clip_bbox,
'scale_x_y': self.scale_x_y,
'iou_aware': self.iou_aware,
'iou_aware_factor': self.iou_aware_factor,
}
self.inputs = {'X': x, 'ImgSize': img_size}
(boxes, scores) = YoloBox(x, img_size, self.attrs)
self.outputs = {'Boxes': boxes, 'Scores': scores}
def test_check_output(self):
self.check_output_with_place(self.place, check_eager=False, atol=1e-5)
def initTestCase(self):
self.anchors = [10, 13, 16, 30, 33, 23]
an_num = int((len(self.anchors) // 2))
self.batch_size = 32
self.class_num = 2
self.conf_thresh = 0.5
self.downsample = 32
self.clip_bbox = True
self.x_shape = (
self.batch_size,
(an_num * (5 + self.class_num)),
13,
13,
)
self.imgsize_shape = (self.batch_size, 2)
self.scale_x_y = 1.0
self.iou_aware = False
self.iou_aware_factor = 0.5
class TestYoloBoxOpNoClipBbox(TestYoloBoxOp):
def initTestCase(self):
self.anchors = [10, 13, 16, 30, 33, 23]
an_num = int((len(self.anchors) // 2))
self.batch_size = 32
self.class_num = 2
self.conf_thresh = 0.5
self.downsample = 32
self.clip_bbox = False
self.x_shape = (
self.batch_size,
(an_num * (5 + self.class_num)),
13,
13,
)
self.imgsize_shape = (self.batch_size, 2)
self.scale_x_y = 1.0
self.iou_aware = False
self.iou_aware_factor = 0.5
class TestYoloBoxOpScaleXY(TestYoloBoxOp):
def initTestCase(self):
self.anchors = [10, 13, 16, 30, 33, 23]
an_num = int((len(self.anchors) // 2))
self.batch_size = 32
self.class_num = 2
self.conf_thresh = 0.5
self.downsample = 32
self.clip_bbox = True
self.x_shape = (
self.batch_size,
(an_num * (5 + self.class_num)),
13,
13,
)
self.imgsize_shape = (self.batch_size, 2)
self.scale_x_y = 1.2
self.iou_aware = False
self.iou_aware_factor = 0.5
class TestYoloBoxOpIoUAware(TestYoloBoxOp):
def initTestCase(self):
self.anchors = [10, 13, 16, 30, 33, 23]
an_num = int((len(self.anchors) // 2))
self.batch_size = 32
self.class_num = 2
self.conf_thresh = 0.5
self.downsample = 32
self.clip_bbox = True
self.x_shape = (
self.batch_size,
(an_num * (6 + self.class_num)),
13,
13,
)
self.imgsize_shape = (self.batch_size, 2)
self.scale_x_y = 1.0
self.iou_aware = True
self.iou_aware_factor = 0.5
class TestYoloBoxDygraph(unittest.TestCase):
def test_dygraph(self):
paddle.disable_static()
img_size = np.ones((2, 2)).astype('int32')
img_size = paddle.to_tensor(img_size)
x1 = np.random.random([2, 14, 8, 8]).astype('float32')
x1 = paddle.to_tensor(x1)
(boxes, scores) = paddle.vision.ops.yolo_box(
x1,
img_size=img_size,
anchors=[10, 13, 16, 30],
class_num=2,
conf_thresh=0.01,
downsample_ratio=8,
clip_bbox=True,
scale_x_y=1.0,
)
assert (boxes is not None) and (scores is not None)
x2 = np.random.random([2, 16, 8, 8]).astype('float32')
x2 = paddle.to_tensor(x2)
(boxes, scores) = paddle.vision.ops.yolo_box(
x2,
img_size=img_size,
anchors=[10, 13, 16, 30],
class_num=2,
conf_thresh=0.01,
downsample_ratio=8,
clip_bbox=True,
scale_x_y=1.0,
iou_aware=True,
iou_aware_factor=0.5,
)
paddle.enable_static()
class TestYoloBoxStatic(unittest.TestCase):
def test_static(self):
x1 = paddle.static.data('x1', [2, 14, 8, 8], 'float32')
img_size = paddle.static.data('img_size', [2, 2], 'int32')
(boxes, scores) = paddle.vision.ops.yolo_box(
x1,
img_size=img_size,
anchors=[10, 13, 16, 30],
class_num=2,
conf_thresh=0.01,
downsample_ratio=8,
clip_bbox=True,
scale_x_y=1.0,
)
assert (boxes is not None) and (scores is not None)
x2 = paddle.static.data('x2', [2, 16, 8, 8], 'float32')
(boxes, scores) = paddle.vision.ops.yolo_box(
x2,
img_size=img_size,
anchors=[10, 13, 16, 30],
class_num=2,
conf_thresh=0.01,
downsample_ratio=8,
clip_bbox=True,
scale_x_y=1.0,
iou_aware=True,
iou_aware_factor=0.5,
)
assert (boxes is not None) and (scores is not None)
class TestYoloBoxOpHW(TestYoloBoxOp):
def initTestCase(self):
self.anchors = [10, 13, 16, 30, 33, 23]
an_num = int((len(self.anchors) // 2))
self.batch_size = 32
self.class_num = 2
self.conf_thresh = 0.5
self.downsample = 32
self.clip_bbox = False
self.x_shape = (self.batch_size, (an_num * (5 + self.class_num)), 13, 9)
self.imgsize_shape = (self.batch_size, 2)
self.scale_x_y = 1.0
self.iou_aware = False
self.iou_aware_factor = 0.5
if __name__ == '__main__':
paddle.enable_static()
unittest.main()