Commit dbe08e9b authored by yuguo960516yuguo

2.4.2

parent b5499578
......@@ -107,15 +107,21 @@ class FusedGemmEpilogueKernel : public framework::OpKernel<T> {
sizeof(bias_data)));
if (enable_auxiliary && activation != "none") {
-size_t reserve_space_size = 0;
// Note (Ming Huang): The initialization of ReserveSpace happens in
// dev_ctx.Alloc. Therefore, we set the real data type up here.
if (activation == "relu") {
-// Count in bits.
-reserve_space_size = phi::product(out->dims()) / 8;
paddle::experimental::DataType rs_type =
paddle::experimental::DataType::BOOL;
size_t reserve_space_size =
phi::product(reserve_space->dims()) * SizeOf(rs_type);
dev_ctx.Alloc(reserve_space, rs_type, reserve_space_size);
} else {
-reserve_space_size = phi::product(out->dims()) * sizeof(T);
size_t reserve_space_size =
phi::product(reserve_space->dims()) * sizeof(T);
dev_ctx.Alloc<T>(reserve_space, reserve_space_size);
}
-dev_ctx.Alloc(reserve_space, out->type(), reserve_space_size);
-void* aux_data = reinterpret_cast<void*>(reserve_space->data<T>());
void* aux_data = reserve_space->data();
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cublasLtMatmulDescSetAttribute(
......@@ -185,7 +191,6 @@ class FusedGemmEpilogueKernel : public framework::OpKernel<T> {
stream,
workspace->ptr(),
workspace_size);
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cublasLtMatmul(lt_handle,
operation_desc,
......@@ -478,7 +483,7 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel<T> {
sizeof(epiloque_func_for_dx)));
if (activation_grad != "none") {
-auto* aux_data = reserve_space->data<T>();
auto* aux_data = reserve_space->data();
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cublasLtMatmulDescSetAttribute(
dx_operation_desc,
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
using Tensor = phi::DenseTensor;
template <typename T>
class HuberLossMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = GetDevCtxFromCTX(ctx);
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* residual = ctx.Output<Tensor>("Residual");
auto* out = ctx.Output<Tensor>("Out");
auto delta = ctx.Attr<float>("delta");
auto place = ctx.GetPlace();
// compute y-x
cnnlDataType_t data_type = ToCnnlDataType<T>();
residual->mutable_data<T>(x->dims(), place);
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlOpTensorDesc sub_op_desc(
CNNL_OP_TENSOR_SUB, data_type, CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
sub_op_desc.get(),
x_desc.get(),
GetBasePtr(y),
x_desc.get(),
GetBasePtr(x),
x_desc.get(),
GetBasePtr(residual),
data_type);
// compute smoothl1loss
out->mutable_data<T>(x->dims(), place);
cnnlSmoothL1LossAlgorithm_t smoothl1_algo =
CNNL_SMOOTHL1LOSS_REDUCTION_NONE; // defines whether to do reduction
// here
MLUCnnl::SmoothL1LossForward(ctx,
x_desc.get(),
GetBasePtr(x),
x_desc.get(), /* target has same shape as x */
GetBasePtr(y),
static_cast<float>(delta),
smoothl1_algo,
x_desc.get(), /* out has same shape as x */
GetBasePtr(out));
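// Note: Huber(r; delta) = delta * SmoothL1(r; beta = delta), so the SmoothL1
// result computed above is rescaled by delta below.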
// compute multiply by delta
Tensor scale_tensor, bias_tensor;
scale_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
bias_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
FillMLUTensorWithHostValue(ctx, static_cast<T>(delta), &scale_tensor);
FillMLUTensorWithHostValue(ctx, static_cast<T>(0.f), &bias_tensor);
const int axis = std::max(out->dims().size() - 1, 0);
MLUCnnlTensorDesc scale_desc(scale_tensor);
MLUCnnlTensorDesc bias_desc(bias_tensor);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::Scale(ctx,
axis,
out_desc.get(),
GetBasePtr(out),
scale_desc.get(),
GetBasePtr(&scale_tensor),
bias_desc.get(),
GetBasePtr(&bias_tensor),
out_desc.get(),
GetBasePtr(out));
}
};
template <typename T>
class HuberLossGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = GetDevCtxFromCTX(ctx);
auto* residual = ctx.Input<Tensor>("Residual");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
auto delta = ctx.Attr<float>("delta");
auto place = ctx.GetPlace();
Tensor t_grad_rd;
t_grad_rd =
ctx.AllocateTmpTensor<T, MLUDeviceContext>(residual->dims(), dev_ctx);
MLUCnnlTensorDesc t_grad_rd_desc(t_grad_rd);
if (dx || dy) {
Tensor t_zero;
t_zero =
ctx.AllocateTmpTensor<T, MLUDeviceContext>(residual->dims(), dev_ctx);
FillMLUTensorWithHostValue(ctx, static_cast<T>(0.f), &t_zero);
MLUCnnlTensorDesc residual_desc(*residual);
MLUCnnlTensorDesc dout_desc(*dout);
cnnlSmoothL1LossAlgorithm_t smoothl1_algo =
CNNL_SMOOTHL1LOSS_REDUCTION_NONE; // defines whether to do reduction
// here
MLUCnnl::SmoothL1LossBackward(ctx,
residual_desc.get(),
GetBasePtr(residual),
residual_desc.get(),
GetBasePtr(&t_zero),
dout_desc.get(),
GetBasePtr(dout),
static_cast<float>(delta),
smoothl1_algo,
t_grad_rd_desc.get(),
GetBasePtr(&t_grad_rd));
}
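// t_grad_rd now holds dout * d(SmoothL1)/d(residual) with residual = y - x,
// so dx is scaled by -delta and dy by +delta below; the delta factor again
// comes from Huber = delta * SmoothL1.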
// compute multiply by delta
Tensor scale_tensor, bias_tensor;
scale_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
bias_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
FillMLUTensorWithHostValue(ctx, static_cast<T>(0.f), &bias_tensor);
const int axis = std::max(t_grad_rd.dims().size() - 1, 0);
MLUCnnlTensorDesc scale_desc(scale_tensor);
MLUCnnlTensorDesc bias_desc(bias_tensor);
if (dx) {
dx->mutable_data<T>(place);
FillMLUTensorWithHostValue(ctx, static_cast<T>(-delta), &scale_tensor);
MLUCnnlTensorDesc out_desc(*dx);
MLUCnnl::Scale(ctx,
axis,
t_grad_rd_desc.get(),
GetBasePtr(&t_grad_rd),
scale_desc.get(),
GetBasePtr(&scale_tensor),
bias_desc.get(),
GetBasePtr(&bias_tensor),
out_desc.get(),
GetBasePtr(dx));
}
if (dy) {
dy->mutable_data<T>(place);
FillMLUTensorWithHostValue(ctx, static_cast<T>(delta), &scale_tensor);
MLUCnnlTensorDesc out_desc(*dy);
MLUCnnl::Scale(ctx,
axis,
t_grad_rd_desc.get(),
GetBasePtr(&t_grad_rd),
scale_desc.get(),
GetBasePtr(&scale_tensor),
bias_desc.get(),
GetBasePtr(&bias_tensor),
out_desc.get(),
GetBasePtr(dy));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(huber_loss,
ops::HuberLossMLUKernel<float>,
ops::HuberLossMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(huber_loss_grad,
ops::HuberLossGradMLUKernel<float>,
ops::HuberLossGradMLUKernel<plat::float16>);
......@@ -39,14 +39,23 @@ cc_test(
SRCS test.cc
DEPS jit_kernel_helper)
if(NOT WIN32)
-cc_binary(
-jit_kernel_benchmark
-SRCS
-benchmark.cc
-DEPS
-jit_kernel_helper
-device_tracer
-tensor)
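# The guard below skips the benchmark when a pre-12.0 CUDA toolkit is paired
# with a host C++ compiler newer than 12.0, a combination that presumably
# fails to build it.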
set(cuda_less12_and_gcc_greater12 false)
if(DEFINED CMAKE_CUDA_COMPILER_VERSION)
if(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0
AND ${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 12.0)
set(cuda_less12_and_gcc_greater12 true)
endif()
endif()
if(NOT cuda_less12_and_gcc_greater12)
cc_binary(
jit_kernel_benchmark
SRCS
benchmark.cc
DEPS
jit_kernel_helper
device_tracer
tensor)
endif()
endif()
if(WITH_TESTING AND TEST jit_kernel_test)
set_tests_properties(jit_kernel_test PROPERTIES TIMEOUT 120)
......
......@@ -214,10 +214,7 @@ class MatMulMKLDNNHandler
}
astream.wait();
-auto format =
-MKLDNNFormatForSize(out->dims().size(), dnnl::memory::format_tag::nchw);
-out->set_format(format);
-out->set_layout(DataLayout::kMKLDNN);
out->set_mem_desc(dst_memory_p->get_desc().reshape(out->dims()));
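// The output now carries its full oneDNN memory descriptor via set_mem_desc()
// instead of the old set_layout()/set_format() pair; the same substitution is
// applied throughout this file.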
}
std::shared_ptr<dnnl::memory> AcquireDstMemory(
......@@ -651,10 +648,18 @@ void ExecuteMatMulV2(const ExecutionContext &ctx,
auto &astream = MKLDNNDeviceContext::tls().get_stream();
matmul_p->execute(astream, matmul_args);
astream.wait();
-auto format =
-MKLDNNFormatForSize(out->dims().size(), dnnl::memory::format_tag::nchw);
-out->set_format(format);
-out->set_layout(DataLayout::kMKLDNN);
// TODO(jczaja): Explain why the int8 format of dst is ABCD and does not
// need a permute
if (IsOutputFused(ctx) && !IsInt8<T_out>()) {
auto axis = ctx.Attr<std::vector<int>>("fused_transpose_Out");
auto permuted_md = dst_memory_p->get_desc().permute_axes(axis);
out->set_mem_desc(
permuted_md.reshape(phi::vectorize<int64_t>(out->dims())));
} else {
out->set_mem_desc(
dst_memory_p->get_desc().reshape(phi::vectorize<int64_t>(out->dims())));
}
}
template <typename T>
......@@ -836,8 +841,7 @@ class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel<T> {
reduction_p->execute(astream, reduction_args);
astream.wait();
-dx->set_format(paddle::platform::GetMKLDNNFormat(
-dst_memory_p->get_desc().reshape(squeezed_dims)));
dx->set_mem_desc(dst_memory_p->get_desc().reshape(squeezed_dims));
}
std::vector<int64_t> ExtendDimsWithOnes(const std::vector<int64_t> &dims,
......@@ -1119,9 +1123,8 @@ void MatMulGradMKLDNNKernel<T>::ExecuteMatMulGrad(
matmul_p->execute(astream, matmul_args);
astream.wait();
-out->set_layout(framework::DataLayout::kMKLDNN);
-out->set_format(platform::GetMKLDNNFormat(
-dst_memory_p->get_desc().reshape(vectorize<int64_t>(out->dims()))));
out->set_mem_desc(
dst_memory_p->get_desc().reshape(vectorize<int64_t>(out->dims())));
}
template <typename T>
......@@ -1184,13 +1187,13 @@ void MatMulGradMKLDNNKernel<T>::RunKernel(const ExecutionContext &ctx) const {
if (dx) {
if (dx_dims != x.dims()) {
dx->Resize(dx_dims);
-dx->set_format(x.format());
dx->set_mem_desc(x.mem_desc());
}
}
if (dy) {
if (dy_dims != y.dims()) {
dy->Resize(dy_dims);
-dy->set_format(y.format());
dy->set_mem_desc(y.mem_desc());
}
}
}
......
......@@ -221,7 +221,7 @@ class MulPrimitiveFactory {
to_void_cast<T>(x_tmp.data<T>()));
x_tmp.Resize(data->dims());
-x_tmp.set_format(platform::GetMKLDNNFormat(dst_mdesc));
x_tmp.set_mem_desc(dst_mdesc);
data_matrix = framework::ReshapeToMatrix(x_tmp, num_col_dims);
} else {
data_matrix = framework::ReshapeToMatrix(*data, num_col_dims);
......@@ -235,11 +235,7 @@ class MulPrimitiveFactory {
const Tensor *in) {
x_input_->set_data_handle(to_void_cast<XT>(in->data<XT>()));
output_->set_data_handle(out->mutable_data<OT>(ctx.GetPlace()));
-if (out->format() == MKLDNNMemoryFormat::undef) {
-auto output_format = platform::GetMKLDNNFormat(*output_);
-out->set_format((MKLDNNMemoryFormat)output_format);
-}
out->set_mem_desc(output_->get_desc());
}
template <typename T>
......@@ -272,7 +268,7 @@ class MulPrimitiveFactory {
auto buffer_size = dst_desc.get_size();
OT *output_data = output->mutable_data<OT>(ctx.GetPlace(), buffer_size);
-output->set_format(paddle::platform::GetMKLDNNFormat(dst_desc));
output->set_mem_desc(dst_desc);
return memory(dst_desc, engine_, to_void_cast<OT>(output_data));
}
......@@ -392,9 +388,10 @@ class MulMKLDNNINT8Kernel : public framework::OpKernel<XT> {
if (out_dims.size() != 2) {
out->Resize(out_dims);
}
-out->set_layout(DataLayout::kMKLDNN);
-out->set_format(platform::MKLDNNFormatForSize(out_dims.size(),
-MKLDNNMemoryFormat::nchw));
auto in_md = dnnl::memory::desc(*dnnl_primitive_desc_query_md(
mul.get_primitive_desc(), dnnl_query_dst_md, 0));
out->set_mem_desc(in_md.reshape(phi::vectorize<int64_t>(out->dims())));
}
};
......@@ -442,10 +439,11 @@ class MulMKLDNNKernel : public framework::OpKernel<XT> {
matmul_p->execute(astream, matmul_args);
astream.wait();
-out->set_layout(framework::DataLayout::kMKLDNN);
-// plain output formats are enforced inside handler
-out->set_format(platform::MKLDNNFormatForSize(
-out->dims().size(), dnnl::memory::format_tag::nchw));
// This kernel flattens the dims, so the unflattened version has to be set on
// out. The reshape requires a plain layout, but MatmulV2MKLDNNHandler
// enforces one, so this works.
out->set_mem_desc(
dst_memory_p->get_desc().reshape(phi::vectorize<int64_t>(out->dims())));
}
private:
......
......@@ -24,7 +24,8 @@
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/sum_op.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/mkldnn_reuse.h"
namespace phi {
......@@ -37,6 +38,9 @@ namespace operators {
using paddle::platform::MKLDNNDeviceContext;
using phi::CPUContext;
using platform::to_void_cast;
using Tensor = framework::Tensor;
using SelectedRows = phi::SelectedRows;
using LoDTensor = framework::LoDTensor;
template <typename T>
class SumMKLDNNHandler
......
......@@ -256,6 +256,186 @@ MLUCnnlTensorDesc::~MLUCnnlTensorDesc() {
}
}
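// Pool of mluOp tensor descriptors: Pop() reuses a recycled descriptor (or
// creates one on demand) and Recycle() resets it and returns it to a
// lock-free queue, so descriptors are not repeatedly created and destroyed.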
class MLUOpTensorDescPool {
public:
mluOpTensorDescriptor_t Pop() {
mluOpTensorDescriptor_t raw_desc;
if (q_.try_dequeue(raw_desc)) {
return raw_desc;
} else {
mluOpCreateTensorDescriptor(&raw_desc);
return raw_desc;
}
}
void Recycle(mluOpTensorDescriptor_t desc) {
mluOpResetTensorDescriptor(desc);
q_.enqueue(desc);
}
~MLUOpTensorDescPool() {
auto size = q_.size_approx();
if (size > 0) {
std::vector<mluOpTensorDescriptor_t> vec(size);
q_.try_dequeue_bulk(vec.data(), size);
for (auto desc : vec) {
mluOpDestroyTensorDescriptor(desc);
}
}
}
private:
moodycamel::ConcurrentQueue<mluOpTensorDescriptor_t> q_;
};
static MLUOpTensorDescPool g_mluop_tensor_desc_pool;
MLUOpTensorDesc& MLUOpTensorDesc::operator=(MLUOpTensorDesc&& rhs) {
if (raw_tensor_desc) {
g_mluop_tensor_desc_pool.Recycle(raw_tensor_desc);
}
raw_tensor_desc = rhs.raw_tensor_desc;
rhs.raw_tensor_desc = nullptr;
return *this;
}
MLUOpTensorDesc::MLUOpTensorDesc(const int tensor_dim,
const int dim_sizes[],
const mluOpDataType_t tensor_dtype) {
raw_tensor_desc = g_mluop_tensor_desc_pool.Pop();
PADDLE_ENFORCE_MLU_SUCCESS(mluOpSetTensorDescriptor(raw_tensor_desc,
MLUOP_LAYOUT_ARRAY,
tensor_dtype,
tensor_dim,
dim_sizes));
}
MLUOpTensorDesc::MLUOpTensorDesc(const int tensor_dim,
const int dim_sizes[],
const mluOpDataType_t tensor_dtype,
const mluOpTensorLayout_t layout) {
raw_tensor_desc = g_mluop_tensor_desc_pool.Pop();
PADDLE_ENFORCE_MLU_SUCCESS(mluOpSetTensorDescriptor(
raw_tensor_desc, layout, tensor_dtype, tensor_dim, dim_sizes));
}
MLUOpTensorDesc::MLUOpTensorDesc(const int tensor_dim,
const int dim_sizes[],
const mluOpDataType_t tensor_dtype,
int position)
: MLUOpTensorDesc(tensor_dim, dim_sizes, tensor_dtype) {
PADDLE_ENFORCE_MLU_SUCCESS(
mluOpSetTensorDescriptorPosition(raw_tensor_desc, position));
}
MLUOpTensorDesc::MLUOpTensorDesc(const int tensor_dim,
const int64_t dim_sizes[],
const mluOpDataType_t tensor_dtype) {
std::vector<int> dim_sizes_int32(tensor_dim);
std::vector<int64_t>::const_iterator int64_cbegin(dim_sizes);
std::vector<int64_t>::const_iterator int64_cend(dim_sizes + tensor_dim);
std::transform(int64_cbegin,
int64_cend,
dim_sizes_int32.begin(),
&CheckedNarrowing<int64_t, int>);
raw_tensor_desc = g_mluop_tensor_desc_pool.Pop();
PADDLE_ENFORCE_MLU_SUCCESS(mluOpSetTensorDescriptor(raw_tensor_desc,
MLUOP_LAYOUT_ARRAY,
tensor_dtype,
tensor_dim,
dim_sizes_int32.data()));
}
MLUOpTensorDesc::MLUOpTensorDesc(const int tensor_dim,
const int64_t dim_sizes[],
const mluOpDataType_t tensor_dtype,
const mluOpTensorLayout_t layout) {
std::vector<int> dim_sizes_int32(tensor_dim);
std::vector<int64_t>::const_iterator int64_cbegin(dim_sizes);
std::vector<int64_t>::const_iterator int64_cend(dim_sizes + tensor_dim);
std::transform(int64_cbegin,
int64_cend,
dim_sizes_int32.begin(),
&CheckedNarrowing<int64_t, int>);
raw_tensor_desc = g_mluop_tensor_desc_pool.Pop();
PADDLE_ENFORCE_MLU_SUCCESS(mluOpSetTensorDescriptor(raw_tensor_desc,
layout,
tensor_dtype,
tensor_dim,
dim_sizes_int32.data()));
}
MLUOpTensorDesc::MLUOpTensorDesc(const int tensor_dim,
const int64_t dim_sizes[],
const mluOpDataType_t tensor_dtype,
int position) {
std::vector<int> dim_sizes_int32(tensor_dim);
std::vector<int64_t>::const_iterator int64_cbegin(dim_sizes);
std::vector<int64_t>::const_iterator int64_cend(dim_sizes + tensor_dim);
std::transform(int64_cbegin,
int64_cend,
dim_sizes_int32.begin(),
&CheckedNarrowing<int64_t, int>);
raw_tensor_desc = g_mluop_tensor_desc_pool.Pop();
PADDLE_ENFORCE_MLU_SUCCESS(mluOpSetTensorDescriptor(raw_tensor_desc,
MLUOP_LAYOUT_ARRAY,
tensor_dtype,
tensor_dim,
dim_sizes_int32.data()));
PADDLE_ENFORCE_MLU_SUCCESS(
mluOpSetTensorDescriptorPosition(raw_tensor_desc, position));
}
MLUOpTensorDesc::MLUOpTensorDesc(const Tensor& tensor,
const mluOpTensorLayout_t layout,
const mluOpDataType_t tensor_dtype) {
auto dims = phi::vectorize<int>(tensor.dims());
int tensor_dim = dims.size();
raw_tensor_desc = g_mluop_tensor_desc_pool.Pop();
if (tensor_dim == 0) {
int scalar_dims[1] = {1};
PADDLE_ENFORCE_MLU_SUCCESS(mluOpSetTensorDescriptor(
raw_tensor_desc, layout, tensor_dtype, 1, scalar_dims));
} else {
std::vector<int> tensor_dim_sizes_int(dims.begin(), dims.end());
PADDLE_ENFORCE_MLU_SUCCESS(
mluOpSetTensorDescriptor(raw_tensor_desc,
layout,
tensor_dtype,
tensor_dim,
tensor_dim_sizes_int.data()));
}
}
MLUOpTensorDesc::MLUOpTensorDesc(const Tensor& tensor)
: MLUOpTensorDesc(
tensor, MLUOP_LAYOUT_ARRAY, ToMluOpDataType(tensor.dtype())) {}
MLUOpTensorDesc::MLUOpTensorDesc(const Tensor& tensor,
mluOpTensorLayout_t layout,
const mluOpDataType_t tensor_dtype,
int position)
: MLUOpTensorDesc(tensor, layout, tensor_dtype) {
PADDLE_ENFORCE_MLU_SUCCESS(
mluOpSetTensorDescriptorPosition(raw_tensor_desc, position));
}
MLUOpTensorDesc::MLUOpTensorDesc(const Tensor& tensor,
mluOpTensorLayout_t layout,
const mluOpDataType_t tensor_dtype,
int position,
float scale)
: MLUOpTensorDesc(tensor, layout, tensor_dtype) {
PADDLE_ENFORCE_MLU_SUCCESS(mluOpSetTensorDescriptorPositionAndScale(
raw_tensor_desc, position, scale));
}
MLUOpTensorDesc::~MLUOpTensorDesc() {
if (raw_tensor_desc) {
g_mluop_tensor_desc_pool.Recycle(raw_tensor_desc);
}
}
MLUCnnlActivationDesc::MLUCnnlActivationDesc(
const cnnlActivationMode_t act_mode, const float ceof) {
PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateActivationDescriptor(&active_desc_));
......@@ -1563,17 +1743,35 @@ MLURNNDesc::~MLURNNDesc() {
void* indices_out) {
cnnlHandle_t handle = GetHandleFromCTX(ctx);
-PADDLE_ENFORCE_MLU_SUCCESS(cnnlTopKTensor(handle,
-input_desc,
-input,
-k,
-dim,
-largest,
-sorted,
-values_output_desc,
-values_out,
-indices_output_desc,
-indices_out));
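// cnnlTopKTensor_v3 needs an explicit workspace: query its size first,
// allocate a temporary int8 tensor of that size, and hand both to the call.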
size_t workspace_size;
PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetTopKTensorWorkspaceSize(handle,
input_desc,
k,
dim,
largest,
values_output_desc,
indices_output_desc,
&workspace_size));
auto& dev_ctx = GetDevCtxFromCTX(ctx);
Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
{static_cast<int64_t>(workspace_size)}, dev_ctx);
void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
PADDLE_ENFORCE_MLU_SUCCESS(cnnlTopKTensor_v3(handle,
input_desc,
input,
k,
dim,
largest,
sorted,
false /*lower_index_first*/,
workspace_ptr,
workspace_size,
values_output_desc,
values_out,
indices_output_desc,
indices_out));
}
/* static */ void MLUCnnl::StridedSlice(
......@@ -4527,6 +4725,78 @@ MLURNNDesc::~MLURNNDesc() {
output));
}
/* static */ void MLUCnnl::SmoothL1LossForward(
const ExecutionContext& ctx,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t t_desc,
const void* target,
const float beta,
const cnnlSmoothL1LossAlgorithm_t algorithm,
const cnnlTensorDescriptor_t y_desc,
void* y) {
cnnlHandle_t handle = GetHandleFromCTX(ctx);
size_t workspace_size;
PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetSmoothL1LossForwardWorkspaceSize(
handle, x_desc, algorithm, &workspace_size));
auto& dev_ctx = GetDevCtxFromCTX(ctx);
Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
{static_cast<int64_t>(workspace_size)}, dev_ctx);
void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
PADDLE_ENFORCE_MLU_SUCCESS(cnnlSmoothL1LossForward_v2(handle,
x_desc,
x,
t_desc,
target,
beta,
algorithm,
workspace_ptr,
workspace_size,
y_desc,
y));
}
/* static */ void MLUCnnl::SmoothL1LossBackward(
const ExecutionContext& ctx,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t target_desc,
const void* target,
const cnnlTensorDescriptor_t dy_desc,
const void* dy,
const float beta,
const cnnlSmoothL1LossAlgorithm_t algorithm,
const cnnlTensorDescriptor_t dx_desc,
void* dx) {
cnnlHandle_t handle = GetHandleFromCTX(ctx);
size_t workspace_size;
PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetSmoothL1LossBackwardWorkspaceSize(
handle, x_desc, algorithm, &workspace_size));
auto& dev_ctx = GetDevCtxFromCTX(ctx);
Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
{static_cast<int64_t>(workspace_size)}, dev_ctx);
void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
PADDLE_ENFORCE_MLU_SUCCESS(cnnlSmoothL1LossBackward_v2(handle,
x_desc,
x,
target_desc,
target,
dy_desc,
dy,
beta,
algorithm,
workspace_ptr,
workspace_size,
dx_desc,
dx));
}
/* static */ void MLUCnnl::EmbeddingForward(
const ExecutionContext& ctx,
const int padding_idx,
......@@ -5148,5 +5418,94 @@ MLURNNDesc::~MLURNNDesc() {
diff_x));
}
/* static */ void MLUOP::OpYoloBox(const ExecutionContext& ctx,
const mluOpTensorDescriptor_t x_desc,
const void* x,
const mluOpTensorDescriptor_t img_size_desc,
const void* img_size,
const mluOpTensorDescriptor_t anchors_desc,
const void* anchors,
const int class_num,
const float conf_thresh,
const int downsample_ratio,
const bool clip_bbox,
const float scale,
const bool iou_aware,
const float iou_aware_factor,
const mluOpTensorDescriptor_t boxes_desc,
void* boxes,
const mluOpTensorDescriptor_t scores_desc,
void* scores) {
mluOpHandle_t handle = GetMLUOpHandleFromCTX(ctx);
PADDLE_ENFORCE_MLU_SUCCESS(mluOpYoloBox(handle,
x_desc,
x,
img_size_desc,
img_size,
anchors_desc,
anchors,
class_num,
conf_thresh,
downsample_ratio,
clip_bbox,
scale,
iou_aware,
iou_aware_factor,
boxes_desc,
boxes,
scores_desc,
scores));
}
/* static */ void MLUOP::OpPriorBox(
const ExecutionContext& ctx,
const mluOpTensorDescriptor_t min_sizes_desc,
const void* min_sizes,
const mluOpTensorDescriptor_t aspect_ratios_desc,
const void* aspect_ratios,
const mluOpTensorDescriptor_t variances_desc,
const void* variances,
const mluOpTensorDescriptor_t max_sizes_desc,
const void* max_sizes,
const int height,
const int width,
const int im_height,
const int im_width,
const float step_h,
const float step_w,
const float offset,
const bool clip,
const bool min_max_aspect_ratios_order,
const mluOpTensorDescriptor_t output_desc,
void* output,
const mluOpTensorDescriptor_t var_desc,
void* var) {
mluOpHandle_t handle = GetMLUOpHandleFromCTX(ctx);
PADDLE_ENFORCE_MLU_SUCCESS(mluOpPriorBox(handle,
min_sizes_desc,
min_sizes,
aspect_ratios_desc,
aspect_ratios,
variances_desc,
variances,
max_sizes_desc,
max_sizes,
height,
width,
im_height,
im_width,
step_h,
step_w,
offset,
clip,
min_max_aspect_ratios_order,
output_desc,
output,
var_desc,
var));
}
} // namespace operators
} // namespace paddle
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <cn_api.h>
#include <cnnl.h>
#include <concurrentqueue.h>
#include <mlu_op.h>
#include <string>
#include <vector>
......@@ -138,6 +139,54 @@ inline cnnlDataType_t ToCnnlDataType() {
return ToCnnlDataType(type);
}
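// Note: dtypes without an mluOp counterpart silently fall back to
// MLUOP_DTYPE_FLOAT (the default branch below does not report an error).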
inline mluOpDataType_t ToMluOpDataType(
const paddle::experimental::DataType& dtype) {
mluOpDataType_t type = MLUOP_DTYPE_FLOAT;
switch (dtype) {
case DataType::FLOAT16:
type = MLUOP_DTYPE_HALF;
break;
case DataType::FLOAT32:
type = MLUOP_DTYPE_FLOAT;
break;
case DataType::FLOAT64:
type = MLUOP_DTYPE_DOUBLE;
break;
case DataType::INT8:
type = MLUOP_DTYPE_INT8;
break;
case DataType::INT16:
type = MLUOP_DTYPE_INT16;
break;
case DataType::INT32:
type = MLUOP_DTYPE_INT32;
break;
case DataType::INT64:
type = MLUOP_DTYPE_INT64;
break;
case DataType::BOOL:
type = MLUOP_DTYPE_BOOL;
break;
case DataType::UINT8:
type = MLUOP_DTYPE_UINT8;
break;
default:
break;
}
return type;
}
inline mluOpDataType_t ToMluOpDataType(
const paddle::framework::proto::VarType::Type& type) {
return ToMluOpDataType(framework::TransToPhiDataType(type));
}
template <typename T>
inline mluOpDataType_t ToMluOpDataType() {
auto type = framework::ToDataType(std::type_index(typeid(T)));
return ToMluOpDataType(type);
}
// Converts (via narrowing) a type T value to a type U, and checks that the
// value has no value change due to the conversion.
template <typename WideT, typename NarrowT>
......@@ -152,6 +201,10 @@ inline static cnnlHandle_t GetHandleFromCTX(const ExecutionContext& ctx) {
return ctx.template device_context<MLUDeviceContext>().cnnl_handle();
}
inline static mluOpHandle_t GetMLUOpHandleFromCTX(const ExecutionContext& ctx) {
return ctx.template device_context<MLUDeviceContext>().mluOp_handle();
}
inline static const MLUDeviceContext& GetDevCtxFromCTX(
const ExecutionContext& ctx) {
return ctx.template device_context<MLUDeviceContext>();
......@@ -281,6 +334,74 @@ class MLUCnnlTensorDesc {
cnnlTensorDescriptor_t raw_tensor_desc = nullptr;
};
class MLUOpTensorDesc {
public:
MLUOpTensorDesc() {}
// SE_DISALLOW_COPY_AND_ASSIGN
MLUOpTensorDesc(const MLUOpTensorDesc& desc) = delete;
MLUOpTensorDesc& operator=(const MLUOpTensorDesc&) = delete;
MLUOpTensorDesc(MLUOpTensorDesc&& rhs)
: raw_tensor_desc(rhs.raw_tensor_desc) {
rhs.raw_tensor_desc = nullptr;
}
MLUOpTensorDesc& operator=(MLUOpTensorDesc&& rhs);
MLUOpTensorDesc(const int tensor_dim,
const int dim_sizes[],
const mluOpDataType_t tensor_dtype);
MLUOpTensorDesc(const int tensor_dim,
const int dim_sizes[],
const mluOpDataType_t tensor_dtype,
const mluOpTensorLayout_t layout);
MLUOpTensorDesc(const int tensor_dim,
const int dim_sizes[],
const mluOpDataType_t tensor_dtype,
int position);
MLUOpTensorDesc(const int tensor_dim,
const int64_t dim_sizes[],
const mluOpDataType_t tensor_dtype);
MLUOpTensorDesc(const int tensor_dim,
const int64_t dim_sizes[],
const mluOpDataType_t tensor_dtype,
const mluOpTensorLayout_t layout);
MLUOpTensorDesc(const int tensor_dim,
const int64_t dim_sizes[],
const mluOpDataType_t tensor_dtype,
int position);
MLUOpTensorDesc(const Tensor& tensor,
const mluOpTensorLayout_t layout,
const mluOpDataType_t tensor_dtype);
explicit MLUOpTensorDesc(const Tensor& tensor);
MLUOpTensorDesc(const Tensor& tensor,
mluOpTensorLayout_t layout,
const mluOpDataType_t tensor_dtype,
int position);
MLUOpTensorDesc(const Tensor& tensor,
mluOpTensorLayout_t layout,
const mluOpDataType_t tensor_dtype,
int position,
float scale);
~MLUOpTensorDesc();
const mluOpTensorDescriptor_t get() const { return raw_tensor_desc; }
private:
mluOpTensorDescriptor_t raw_tensor_desc = nullptr;
};
class MLUCnnlActivationDesc {
public:
MLUCnnlActivationDesc(const MLUCnnlActivationDesc& desc) = delete;
......@@ -1921,6 +2042,28 @@ class MLUCnnl {
const cnnlTensorDescriptor_t output_desc,
void* output);
static void SmoothL1LossForward(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t t_desc,
const void* target,
const float beta,
const cnnlSmoothL1LossAlgorithm_t algorithm,
const cnnlTensorDescriptor_t y_desc,
void* y);
static void SmoothL1LossBackward(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t target_desc,
const void* target,
const cnnlTensorDescriptor_t dy_desc,
const void* dy,
const float beta,
const cnnlSmoothL1LossAlgorithm_t algorithm,
const cnnlTensorDescriptor_t dx_desc,
void* dx);
static void EmbeddingForward(const ExecutionContext& ctx,
const int padding_idx,
const cnnlTensorDescriptor_t weight_desc,
......@@ -2149,6 +2292,50 @@ class MLUCnnl {
void* diff_x);
};
class MLUOP {
public:
static void OpYoloBox(const ExecutionContext& ctx,
const mluOpTensorDescriptor_t x_desc,
const void* x,
const mluOpTensorDescriptor_t img_size_desc,
const void* img_size,
const mluOpTensorDescriptor_t anchors_desc,
const void* anchors,
const int class_num,
const float conf_thresh,
const int downsample_ratio,
const bool clip_bbox,
const float scale,
const bool iou_aware,
const float iou_aware_factor,
const mluOpTensorDescriptor_t boxes_desc,
void* boxes,
const mluOpTensorDescriptor_t scores_desc,
void* scores);
static void OpPriorBox(const ExecutionContext& ctx,
const mluOpTensorDescriptor_t min_sizes_desc,
const void* min_sizes,
const mluOpTensorDescriptor_t aspect_ratios_desc,
const void* aspect_ratios,
const mluOpTensorDescriptor_t variances_desc,
const void* variances,
const mluOpTensorDescriptor_t max_sizes_desc,
const void* max_sizes,
const int height,
const int width,
const int im_height,
const int im_width,
const float step_h,
const float step_w,
const float offset,
const bool clip,
const bool min_max_aspect_ratios_order,
const mluOpTensorDescriptor_t output_desc,
void* output,
const mluOpTensorDescriptor_t var_desc,
void* var);
};
const std::map<const std::string, std::pair<std::vector<int>, std::vector<int>>>
TransPermMap = {
// trans_mode, (forward_perm, backward_perm)
......
......@@ -97,4 +97,6 @@ class OneHotV2MLUKernel : public framework::OpKernel<T> {
namespace ops = paddle::operators;
namespace plat = paddle::platform;
-REGISTER_OP_MLU_KERNEL(one_hot_v2, ops::OneHotV2MLUKernel<int32_t>);
REGISTER_OP_MLU_KERNEL(one_hot_v2,
ops::OneHotV2MLUKernel<int32_t>,
ops::OneHotV2MLUKernel<int64_t>);
......@@ -291,11 +291,38 @@ class AdamWMLUKernel : public AdamMLUKernel<T> {
skip_update = skip_update_vec[0];
}
bool with_decay = ctx.Attr<bool>("with_decay");
const bool multi_precision = ctx.Attr<bool>("multi_precision");
auto* param_out = ctx.Output<LoDTensor>("ParamOut");
auto* master_param_out = ctx.Output<LoDTensor>("MasterParamOut");
const auto* master_param = ctx.Input<LoDTensor>("MasterParam");
VLOG(3) << "Skip update: " << skip_update << ", With decay: " << with_decay;
if (!skip_update && with_decay) {
if (ctx.HasInput("MasterParam")) {
PADDLE_THROW(platform::errors::Unimplemented(
"Master Param is not supported on MLU"));
auto* param = ctx.Input<LoDTensor>("Param");
MLUCnnlTensorDesc param_desc(*param);
if (multi_precision) {
VLOG(3) << "[adamw] multi_precision, cast masterparam to param.";
bool has_master =
ctx.HasInput("MasterParam") && ctx.HasOutput("MasterParamOut");
PADDLE_ENFORCE_EQ(
has_master,
true,
platform::errors::InvalidArgument(
"The Input(MasterParam) and Output(MasterParamOut) "
"should not be null when "
"the attr `multi_precision` is true"));
// cast masterparam (fp32) to param (fp16), then paramout (fp16) to
// masterparamout (fp32)
MLUCnnlTensorDesc master_param_desc(*master_param);
cnnlCastDataType_t cast_type = GetCastDataType(
framework::TransToProtoVarType(master_param->dtype()),
framework::TransToProtoVarType(param->dtype()));
MLUCnnl::Cast(ctx,
cast_type,
master_param_desc.get(),
GetBasePtr(master_param),
param_desc.get(),
const_cast<void*>(GetBasePtr(param)));
} else {
const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(),
......@@ -305,13 +332,12 @@ class AdamWMLUKernel : public AdamMLUKernel<T> {
"but the received is %s",
ctx.InputNames("Param").front(),
framework::ToTypeName(param_var->Type())));
-auto* param = ctx.Input<LoDTensor>("Param");
auto* lr = ctx.Input<LoDTensor>("LearningRate");
float coeff = ctx.Attr<float>("coeff");
// update param with decay coeff: mul(-1 * lr, coeff * param) + param
MLUCnnlTensorDesc lr_desc(*lr);
-MLUCnnlTensorDesc param_desc(*param);
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
......@@ -330,9 +356,244 @@ class AdamWMLUKernel : public AdamMLUKernel<T> {
}
}
AdamMLUKernel<T>::Compute(ctx);
if (multi_precision) {
VLOG(3) << "[adamw] multi_precision, cast paramout to masterparamout.";
// cast paramout to masterparamout
master_param_out->mutable_data<float>(ctx.GetPlace());
cnnlCastDataType_t cast_type = GetCastDataType(
framework::TransToProtoVarType(param_out->dtype()),
framework::TransToProtoVarType(master_param_out->dtype()));
MLUCnnlTensorDesc param_out_desc(*param_out);
MLUCnnlTensorDesc master_param_out_desc(*master_param_out);
MLUCnnl::Cast(ctx,
cast_type,
param_out_desc.get(),
GetBasePtr(param_out),
master_param_out_desc.get(),
GetBasePtr(master_param_out));
}
}
};
template <typename T>
class MergedAdamMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
// Get inputs and outputs
auto params = ctx.MultiInput<framework::Tensor>("Param");
auto grads = ctx.MultiInput<framework::Tensor>("Grad");
auto lrs = ctx.MultiInput<framework::Tensor>("LearningRate");
auto mom1s = ctx.MultiInput<framework::Tensor>("Moment1");
auto mom2s = ctx.MultiInput<framework::Tensor>("Moment2");
auto beta1_pows = ctx.MultiInput<framework::Tensor>("Beta1Pow");
auto beta2_pows = ctx.MultiInput<framework::Tensor>("Beta2Pow");
auto master_params = ctx.MultiInput<framework::Tensor>("MasterParam");
auto param_outs = ctx.MultiOutput<framework::Tensor>("ParamOut");
auto mom1_outs = ctx.MultiOutput<framework::Tensor>("Moment1Out");
auto mom2_outs = ctx.MultiOutput<framework::Tensor>("Moment2Out");
auto beta1_pow_outs = ctx.MultiOutput<framework::Tensor>("Beta1PowOut");
auto beta2_pow_outs = ctx.MultiOutput<framework::Tensor>("Beta2PowOut");
// Check validation of inputs and outputs
size_t param_num = params.size();
PADDLE_ENFORCE_EQ(param_num,
param_outs.size(),
platform::errors::InvalidArgument(
"The size of Output(ParamOut) must be equal to "
"Input(Param), but got the size of Output(ParamOut) "
"is %d, the size of Input(Param) is %d.",
param_outs.size(),
param_num));
bool skip_update = false;
if (ctx.HasInput("SkipUpdate")) {
auto* skip_update_tensor = ctx.Input<framework::Tensor>("SkipUpdate");
PADDLE_ENFORCE_EQ(skip_update_tensor->numel(),
1,
platform::errors::InvalidArgument(
"Input(SkipUpdate) size must be 1, but get %d",
skip_update_tensor->numel()));
std::vector<bool> skip_update_vec;
paddle::framework::TensorToVector(
*skip_update_tensor, ctx.device_context(), &skip_update_vec);
ctx.device_context().Wait();
skip_update = skip_update_vec[0];
}
// skip_update=true, just copy input to output, and TensorCopy will call
// mutable_data
if (skip_update) {
VLOG(4) << "MergedAdam skip update";
for (size_t i = 0; i < param_num; ++i) {
framework::TensorCopy(
*params[i],
ctx.GetPlace(),
ctx.template device_context<platform::MLUDeviceContext>(),
param_outs[i]);
framework::TensorCopy(
*mom1s[i],
ctx.GetPlace(),
ctx.template device_context<platform::MLUDeviceContext>(),
mom1_outs[i]);
framework::TensorCopy(
*mom2s[i],
ctx.GetPlace(),
ctx.template device_context<platform::MLUDeviceContext>(),
mom2_outs[i]);
framework::TensorCopy(
*beta1_pows[i],
beta1_pows[i]->place(),
ctx.template device_context<platform::MLUDeviceContext>(),
beta1_pow_outs[i]);
framework::TensorCopy(
*beta2_pows[i],
beta2_pows[i]->place(),
ctx.template device_context<platform::MLUDeviceContext>(),
beta2_pow_outs[i]);
}
return;
}
bool use_global_beta_pow = ctx.Attr<bool>("use_global_beta_pow");
VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;
// Get beta1, beta2 and epsilon from attribute.
const Tensor* beta1_tensor = nullptr;
const Tensor* beta2_tensor = nullptr;
const Tensor* epsilon_tensor = nullptr;
Tensor beta1_tmp(experimental::DataType::FLOAT32);
Tensor beta2_tmp(experimental::DataType::FLOAT32);
Tensor epsilon_tmp(experimental::DataType::FLOAT32);
T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
beta1_tmp.mutable_data<T>({1}, ctx.GetPlace());
beta2_tmp.mutable_data<T>({1}, ctx.GetPlace());
epsilon_tmp.mutable_data<T>({1}, ctx.GetPlace());
MLUCnnlTensorDesc beta1_tmp_desc(beta1_tmp);
MLUCnnlTensorDesc beta2_tmp_desc(beta2_tmp);
MLUCnnlTensorDesc epsilon_tmp_desc(epsilon_tmp);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&beta1,
beta1_tmp_desc.get(),
GetBasePtr(&beta1_tmp));
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&beta2,
beta2_tmp_desc.get(),
GetBasePtr(&beta2_tmp));
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&epsilon,
epsilon_tmp_desc.get(),
GetBasePtr(&epsilon_tmp));
beta1_tensor = &beta1_tmp;
beta2_tensor = &beta2_tmp;
epsilon_tensor = &epsilon_tmp;
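// beta1/beta2/epsilon come in as host-side attributes; they are materialized
// into 1-element device tensors above because ApplyAdam below consumes
// device pointers.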
// Loop to compute
for (size_t i = 0; i < param_num; ++i) {
VLOG(4) << "[MergedAdam] loop: " << i;
param_outs[i]->ShareDataWith(*params[i]);
mom1_outs[i]->ShareDataWith(*mom1s[i]);
mom2_outs[i]->ShareDataWith(*mom2s[i]);
LoDTensor beta1_pow_tmp;
LoDTensor beta2_pow_tmp;
if (beta1_pows[i]->place() == platform::CPUPlace()) {
T beta1 = *beta1_pows[i]->data<T>();
beta1_pow_tmp.mutable_data<T>({1}, ctx.GetPlace());
MLUCnnlTensorDesc beta1_pow_tmp_desc(beta1_pow_tmp);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&beta1,
beta1_pow_tmp_desc.get(),
GetBasePtr(&beta1_pow_tmp));
beta1_pows[i] = &beta1_pow_tmp;
}
if (beta2_pows[i]->place() == platform::CPUPlace()) {
T beta2 = *beta2_pows[i]->data<T>();
beta2_pow_tmp.mutable_data<T>({1}, ctx.GetPlace());
MLUCnnlTensorDesc beta2_pow_tmp_desc(beta2_pow_tmp);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&beta2,
beta2_pow_tmp_desc.get(),
GetBasePtr(&beta2_pow_tmp));
beta2_pows[i] = &beta2_pow_tmp;
}
VLOG(3) << "beta1_pow.numel() : " << beta1_pows[i]->numel()
<< "beta2_pow.numel() : " << beta2_pows[i]->numel();
VLOG(3) << "param.numel(): " << params[i]->numel();
PADDLE_ENFORCE_EQ(beta1_pow_outs[i]->numel(),
1,
platform::errors::InvalidArgument(
"beta1 pow output size should be 1, but received "
"value is:%d.",
beta1_pow_outs[i]->numel()));
PADDLE_ENFORCE_EQ(beta2_pow_outs[i]->numel(),
1,
platform::errors::InvalidArgument(
"beta2 pow output size should be 1, but received "
"value is:%d.",
beta2_pow_outs[i]->numel()));
MLUCnnlTensorDesc param_desc(*params[i]);
MLUCnnlTensorDesc mom1_desc(*mom1s[i]);
MLUCnnlTensorDesc mom2_desc(*mom2s[i]);
MLUCnnlTensorDesc grad_desc(*grads[i]);
MLUCnnl::ApplyAdam(ctx,
param_desc.get(),
GetBasePtr(param_outs[i]),
mom1_desc.get(),
GetBasePtr(mom1_outs[i]),
mom2_desc.get(),
GetBasePtr(mom2_outs[i]),
grad_desc.get(),
GetBasePtr(grads[i]),
GetBasePtr(lrs[i]),
GetBasePtr(beta1_tensor),
GetBasePtr(beta2_tensor),
GetBasePtr(beta1_pows[i]),
GetBasePtr(beta2_pows[i]),
GetBasePtr(epsilon_tensor),
/*use_nesterov*/ false);
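// Unless use_global_beta_pow is set, update beta{1,2}_pow_out =
// beta{1,2}_pow * beta{1,2} on the device; beta1_desc is reused for the
// beta2 update since both are 1-element tensors of the same type.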
if (!use_global_beta_pow) {
beta1_pow_outs[i]->mutable_data<T>(ctx.GetPlace());
beta2_pow_outs[i]->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc beta1_desc(*beta1_tensor);
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
beta1_desc.get(),
GetBasePtr(beta1_pows[i]),
beta1_desc.get(),
GetBasePtr(beta1_tensor),
beta1_desc.get(),
GetBasePtr(beta1_pow_outs[i]),
ToCnnlDataType<T>());
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
beta1_desc.get(),
GetBasePtr(beta2_pows[i]),
beta1_desc.get(),
GetBasePtr(beta2_tensor),
beta1_desc.get(),
GetBasePtr(beta2_pow_outs[i]),
ToCnnlDataType<T>());
}
}
}
};
} // namespace operators
} // namespace paddle
......@@ -346,3 +607,7 @@ REGISTER_OP_MLU_KERNEL(adam,
REGISTER_OP_MLU_KERNEL(adamw,
ops::AdamWMLUKernel<float>,
ops::AdamWMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(merged_adam,
ops::MergedAdamMLUKernel<float>,
ops::MergedAdamMLUKernel<plat::float16>);
......@@ -141,10 +141,9 @@ class MLUPoolOpKernel : public framework::OpKernel<T> {
handle, pool_mode, out_w, out_h, &extra_input_size);
if (extra_input_size > 0) {
-phi::CPUContext cpu_ctx;
-framework::Tensor extra_host_tensor =
-ctx.AllocateTmpTensor<int8_t, phi::CPUContext>(
-{static_cast<int64_t>(extra_input_size)}, cpu_ctx);
framework::Tensor extra_host_tensor;
extra_host_tensor.mutable_data<int8_t>(
{static_cast<int64_t>(extra_input_size)}, platform::CPUPlace());
cnnlInitPoolingExtraInput(handle,
pool_desc.get(),
trans_in_x_desc.get(),
......
......@@ -92,6 +92,112 @@ class ReduceMaxMLUKernel : public framework::OpKernel<T> {
}
};
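// Gradient of reduce_max: broadcast Out and dOut back to X's shape, build a
// mask where X equals the broadcast maximum, and route dOut through only
// those positions (zero elsewhere). Ties all receive the full gradient.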
template <typename T>
class ReduceMaxGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x = context.Input<Tensor>("X");
auto* out = context.Input<Tensor>("Out");
auto* out_grad = context.Input<Tensor>(framework::GradVarName("Out"));
auto reduce_dims = context.Attr<std::vector<int>>("dim");
bool reduce_all = context.Attr<bool>("reduce_all");
int in_dtype = context.Attr<int>("in_dtype");
PADDLE_ENFORCE_EQ(
in_dtype == -1,
true,
platform::errors::InvalidArgument(
"MLU only support in_dtype == -1 in reduce_max_grad op."));
auto* x_grad = context.Output<Tensor>(framework::GradVarName("X"));
x_grad->mutable_data<T>(context.GetPlace());
auto place = context.GetPlace();
// broadcast
auto x_dims_vec = phi::vectorize(x->dims());
if (reduce_all) {
reduce_dims.clear();
for (size_t d = 0; d < x_dims_vec.size(); ++d) {
reduce_dims.push_back(static_cast<int>(d));
}
}
Tensor tmp_out, tmp_out_grad;
auto tmp_out_dims_vec = x_dims_vec;
for (auto d : reduce_dims) {
if (d < 0) {
d += x_dims_vec.size();
}
tmp_out_dims_vec[d] = 1;
}
tmp_out.ShareDataWith(*out);
tmp_out.Resize(phi::make_ddim(tmp_out_dims_vec));
tmp_out_grad.ShareDataWith(*out_grad);
tmp_out_grad.Resize(phi::make_ddim(tmp_out_dims_vec));
Tensor transformed_out(x->type());
transformed_out.Resize(phi::make_ddim(x_dims_vec));
transformed_out.mutable_data<T>(place);
MLUCnnlTensorDesc tmp_out_desc(tmp_out);
MLUCnnlTensorDesc transformed_out_desc(transformed_out);
MLUCnnl::BroadcastTo(context,
tmp_out_desc.get(),
GetBasePtr(&tmp_out),
transformed_out_desc.get(),
GetBasePtr(&transformed_out));
Tensor transformed_out_grad(x->type());
transformed_out_grad.Resize(phi::make_ddim(x_dims_vec));
transformed_out_grad.mutable_data<T>(place);
MLUCnnlTensorDesc tmp_out_grad_desc(tmp_out_grad);
MLUCnnlTensorDesc transformed_out_grad_desc(transformed_out_grad);
MLUCnnl::BroadcastTo(context,
tmp_out_grad_desc.get(),
GetBasePtr(&tmp_out_grad),
transformed_out_grad_desc.get(),
GetBasePtr(&transformed_out_grad));
// compare
Tensor equal_cond;
equal_cond.mutable_data<bool>(x_grad->dims(), place);
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc equal_cond_desc(equal_cond);
MLUCnnl::Logic(context,
CNNL_LOGIC_OP_EQ,
x_desc.get(),
GetBasePtr(x),
transformed_out_desc.get(),
GetBasePtr(&transformed_out),
equal_cond_desc.get(),
GetBasePtr(&equal_cond));
// select
Tensor t_zero;
t_zero.mutable_data<T>(x_grad->dims(), place);
FillMLUTensorWithHostValue<T>(context, static_cast<T>(0), &t_zero);
t_zero.Resize(x_grad->dims());
MLUCnnlTensorDesc t_zero_desc(t_zero);
MLUCnnlTensorDesc x_grad_desc(*x_grad);
MLUCnnl::Select(context,
equal_cond_desc.get(),
GetBasePtr(&equal_cond),
transformed_out_grad_desc.get(),
GetBasePtr(&transformed_out_grad),
t_zero_desc.get(),
GetBasePtr(&t_zero),
x_grad_desc.get(),
GetBasePtr(x_grad));
}
};
} // namespace operators
} // namespace paddle
......@@ -102,3 +208,7 @@ REGISTER_OP_MLU_KERNEL(reduce_max,
ops::ReduceMaxMLUKernel<float>,
ops::ReduceMaxMLUKernel<plat::float16>,
ops::ReduceMaxMLUKernel<int>);
REGISTER_OP_MLU_KERNEL(reduce_max_grad,
ops::ReduceMaxGradMLUKernel<float>,
ops::ReduceMaxGradMLUKernel<plat::float16>,
ops::ReduceMaxGradMLUKernel<int>);
......@@ -93,7 +93,8 @@ class SelectOutputInferShape : public framework::InferShapeBase {
void operator()(framework::InferShapeContext *context) const override {
OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "SelectOutput");
OP_INOUT_CHECK(context->HasInput("Mask"), "Input", "Mask", "SelectOutput");
OP_INOUT_CHECK(context->HasOutputs("Out"), "Output", "Out", "SelectOutput");
OP_INOUT_CHECK(
context->HasOutputs("Out", true), "Output", "Out", "SelectOutput");
}
};
......
......@@ -19,6 +19,11 @@ limitations under the License. */
namespace paddle {
namespace operators {
using Tensor = phi::DenseTensor;
using Variable = framework::Variable;
using LoDTensorArray = framework::LoDTensorArray;
using DDim = framework::DDim;
static void ProcessStridedSliceParams(
const std::vector<int>& axes,
const DDim& input_dims,
......
......@@ -9,15 +9,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/sum_op.h"
#include <algorithm>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/var_type_inference.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/multiary.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
......@@ -32,94 +34,6 @@ class SumOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
-void InferShape(framework::InferShapeContext* ctx) const override {
-OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "sum");
-OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "sum");
-if (ctx->IsRuntime() && ctx->GetOutputsVarType("Out")[0] ==
-framework::proto::VarType::LOD_TENSOR_ARRAY) {
-return; // skip runtime infershape when is tensor array;
-}
-auto x_var_types = ctx->GetInputsVarType("X");
-auto x_dims = ctx->GetInputsDim("X");
-auto N = x_dims.size();
-PADDLE_ENFORCE_GT(
-N,
-0,
-platform::errors::InvalidArgument(
-"The input tensor X's dimensions of SumOp "
-"should be larger than 0. But received X's dimensions %d, "
-"X's shape = [%s].",
-N,
-&x_dims));
-if (N == 1) {
-VLOG(3) << "Warning: SumOp have only one input, may waste memory";
-}
-framework::DDim in_dim({0});
-for (size_t i = 0; i < x_dims.size(); ++i) {
-auto& x_dim = x_dims[i];
-// x_dim.size() == 1 means the real dim of selected rows is [0]
-if (x_var_types[i] == framework::proto::VarType::SELECTED_ROWS &&
-x_dim.size() == 1) {
-continue;
-}
-if (phi::product(x_dim) == 0) {
-continue;
-}
-if (phi::product(in_dim) == 0) {
-in_dim = x_dim;
-} else {
-if (ctx->IsRuntime()) {
-PADDLE_ENFORCE_EQ(in_dim,
-x_dim,
-platform::errors::InvalidArgument(
-"The input tensor X of SumOp must"
-" have same shape. But received X[0]'s shape = "
-"[%s], X[%d]'s shape = [%s].",
-in_dim,
-i,
-x_dim));
-} else {
-PADDLE_ENFORCE_EQ(
-in_dim.size(),
-x_dim.size(),
-platform::errors::InvalidArgument(
-"The input tensor X of SumOp must have same "
-"dimensions. But received X[0]'s dimensions = %d, X[0]'s "
-"shape = "
-"[%s], X[%d]'s dimensions = %d, X[%d]'s shape = [%s].",
-in_dim.size(),
-in_dim,
-i,
-x_dim.size(),
-i,
-x_dim));
-// if in_dim or x_dim has -1, not check equal
-for (int j = 0; j < x_dim.size(); ++j) {
-if (x_dim[j] == -1 || in_dim[j] == -1) {
-continue;
-}
-PADDLE_ENFORCE_EQ(
-in_dim[j],
-x_dim[j],
-platform::errors::InvalidArgument(
-"The input tensor X of SumOp must have same shape "
-"if not -1."
-"But received X[0]'s shape = [%s], X[%d]'s shape = [%s].",
-in_dim,
-i,
-x_dim));
-}
-}
-}
-}
-ctx->SetOutputDim("Out", in_dim);
-ctx->ShareLoD("X", /*->*/ "Out");
-}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
......@@ -350,18 +264,16 @@ DECLARE_INPLACE_OP_INFERER(SumInplaceInferer, {"X", "Out"});
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(sum,
AddNInferShapeFunctor,
PD_INFER_META(phi::AddNTensorArrayInferMeta));
REGISTER_OPERATOR(sum,
ops::SumOp,
ops::SumOpMaker,
ops::SumGradDescMaker,
ops::SumGradOpBaseMaker,
ops::SumOpVarTypeInference,
-ops::SumInplaceInferer);
-REGISTER_OP_CPU_KERNEL(
-sum,
-ops::SumKernel<phi::CPUContext, float>,
-ops::SumKernel<phi::CPUContext, double>,
-ops::SumKernel<phi::CPUContext, int>,
-ops::SumKernel<phi::CPUContext, paddle::platform::bfloat16>,
-ops::SumKernel<phi::CPUContext, int64_t>);
ops::SumInplaceInferer,
AddNInferShapeFunctor);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <paddle/fluid/platform/device_context.h>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/operators/sum_op.h"
#include "paddle/fluid/platform/float16.h"
namespace plat = paddle::platform;
namespace paddle {
namespace operators {
#define CEIL_DIV(x, y) (((x) + (y)-1) / (y))
using LoDTensor = framework::LoDTensor;
template <class T>
__global__ void Sum2CUDAKernel(const T *in_0,
const T *in_1,
T *out,
int64_t N) {
int id = blockIdx.x * blockDim.x + threadIdx.x;
while (id < N) {
out[id] = in_0[id] + in_1[id];
id += blockDim.x * gridDim.x;
}
}
template <class T>
__global__ void SumArrayCUDAKernel(
T **in, T *out, int64_t N, size_t in_size, bool read_dst) {
int id = blockIdx.x * blockDim.x + threadIdx.x;
while (id < N) {
T total(read_dst ? out[id] : static_cast<T>(0));
for (int i = 0; i < in_size; ++i) {
const T *tmp = in[i];
if (tmp) {
total += tmp[id];
}
}
out[id] = total;
id += blockDim.x * gridDim.x;
}
}
template <class T>
__global__ void SumSelectedRowsCUDAKernel(T **sr_in_out,
int64_t N,
size_t rows) {
int id = blockIdx.x * blockDim.x + threadIdx.x;
while (id < N) {
for (int i = 0; i < 2 * rows; i += 2) {
const T *tmp = sr_in_out[i];
T *tmp_out = sr_in_out[i + 1];
if (tmp && tmp_out) {
tmp_out[id] += tmp[id];
}
}
id += blockDim.x * gridDim.x;
}
}
template <class T>
void SumToLoDTensor(const framework::ExecutionContext &context) {
auto in_vars = context.MultiInputVar("X");
const size_t in_num = in_vars.size();
constexpr size_t theory_sm_threads = 1024;
auto &dev_ctx = context.template device_context<phi::GPUContext>();
auto stream = dev_ctx.stream();
auto max_threads = dev_ctx.GetMaxPhysicalThreadCount();
auto sm_count = max_threads / theory_sm_threads;
size_t tile_size = 0;
dim3 grids;
dim3 blocks;
auto ComputeKernelParameter = [&](size_t length) {
if (length >= max_threads)
tile_size = 1024;
else if (length < max_threads && length > sm_count * 128)
tile_size = 512;
else if (length <= sm_count * 128)
tile_size = 256;
grids = dim3(CEIL_DIV(length, tile_size), 1, 1);
blocks = dim3(tile_size, 1, 1);
};
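// Heuristic: choose 1024/512/256 threads per block depending on how the
// element count compares with the device's thread capacity, then launch
// ceil(length / tile_size) blocks.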
auto *out = context.Output<LoDTensor>("Out");
bool in_place = in_vars[0] == context.OutputVar("Out");
if (!in_place) {
auto *out_ptr = out->mutable_data<T>(context.GetPlace());
if (in_num >= 1 && in_vars[0]->IsType<framework::LoDTensor>()) {
auto &in_0_tensor = in_vars[0]->Get<framework::LoDTensor>();
if (in_0_tensor.numel() > 0) {
in_place = (in_0_tensor.data<T>() == out_ptr);
}
}
}
// Sum of two tensors
if (in_num == 2 && in_vars[0]->IsType<framework::LoDTensor>() &&
in_vars[1]->IsType<framework::LoDTensor>()) {
auto &in_0 = in_vars[0]->Get<framework::LoDTensor>();
auto &in_1 = in_vars[1]->Get<framework::LoDTensor>();
int64_t length_0 = in_0.numel();
int64_t length_1 = in_1.numel();
if (length_0 && length_1 && in_0.IsInitialized() && in_1.IsInitialized()) {
auto result = EigenVector<T>::Flatten(*out);
auto &place = *dev_ctx.eigen_device();
auto in_0_e = EigenVector<T>::Flatten(in_0);
auto in_1_e = EigenVector<T>::Flatten(in_1);
result.device(place) = in_0_e + in_1_e;
} else if (length_0 && in_0.IsInitialized()) {
auto result = EigenVector<T>::Flatten(*out);
auto &place = *dev_ctx.eigen_device();
result.device(place) = EigenVector<T>::Flatten(in_0);
} else if (length_1 && in_1.IsInitialized()) {
auto result = EigenVector<T>::Flatten(*out);
auto &place = *dev_ctx.eigen_device();
result.device(place) = EigenVector<T>::Flatten(in_1);
}
return;
}
int start = in_place ? 1 : 0;
if (!in_place) {
phi::funcs::SetConstant<phi::GPUContext, T> constant_functor;
constant_functor(context.template device_context<phi::GPUContext>(),
out,
static_cast<T>(0));
}
std::vector<const T *> in_data;
std::vector<int> selectrow_index;
int64_t lod_length = 0;
bool dst_write = false;
for (int i = start; i < in_num; ++i) {
if (in_vars[i]->IsType<framework::LoDTensor>()) {
auto &in_i = in_vars[i]->Get<framework::LoDTensor>();
lod_length = in_i.numel();
if (lod_length && in_i.IsInitialized()) {
in_data.emplace_back(in_i.data<T>());
}
} else if (in_vars[i]->IsType<phi::SelectedRows>()) {
selectrow_index.push_back(i);
}
}
// compute select rows separately.
if (!selectrow_index.empty()) {
std::vector<const T *> sr_in_out_data;
size_t rows = 0;
int64_t length = 0;
for (auto index : selectrow_index) {
auto &sr = in_vars[index]->Get<phi::SelectedRows>();
auto &sr_value = sr.value();
auto &sr_rows = sr.rows();
auto row_numel = sr_value.numel() / sr_rows.size();
auto out_dims = out->dims();
PADDLE_ENFORCE_EQ(sr.height(),
out_dims[0],
platform::errors::InvalidArgument(
"The table height of input must be same as output, "
"but received input height is %d"
", output height is %d",
sr.height(),
out_dims[0]));
PADDLE_ENFORCE_EQ(row_numel,
out->numel() / sr.height(),
platform::errors::InvalidArgument(
"The table width of input must be same as output, "
"but received input width is %d"
", output width is %d",
row_numel,
out->numel() / sr.height()));
auto *sr_data = sr_value.data<T>();
auto *sr_out_data = out->data<T>();
rows += sr_rows.size();
length = row_numel;
for (size_t i = 0; i < sr_rows.size(); ++i) {
sr_in_out_data.emplace_back(&sr_data[i * row_numel]);
sr_in_out_data.emplace_back(&sr_out_data[sr_rows[i] * row_numel]);
}
}
if (!sr_in_out_data.empty()) {
auto tmp_sr_in_out_array = memory::Alloc(
dev_ctx.GetPlace(),
sr_in_out_data.size() * sizeof(T *),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
memory::Copy(dev_ctx.GetPlace(),
tmp_sr_in_out_array->ptr(),
platform::CPUPlace(),
reinterpret_cast<void *>(sr_in_out_data.data()),
sr_in_out_data.size() * sizeof(T *),
dev_ctx.stream());
T **sr_in_out_array_data =
reinterpret_cast<T **>(tmp_sr_in_out_array->ptr());
ComputeKernelParameter(length);
SumSelectedRowsCUDAKernel<T>
<<<grids, blocks, 0, stream>>>(sr_in_out_array_data, length, rows);
dst_write = true;
}
}
// if in_data is not empty, merge into one kernel call.
if (!in_data.empty()) {
auto tmp_in_array = memory::Alloc(
dev_ctx.GetPlace(),
in_data.size() * sizeof(T *),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
memory::Copy(dev_ctx.GetPlace(),
tmp_in_array->ptr(),
platform::CPUPlace(),
reinterpret_cast<void *>(in_data.data()),
in_data.size() * sizeof(T *),
dev_ctx.stream());
T **in_array_data = reinterpret_cast<T **>(tmp_in_array->ptr());
ComputeKernelParameter(lod_length);
SumArrayCUDAKernel<T><<<grids, blocks, 0, stream>>>(in_array_data,
out->data<T>(),
lod_length,
in_data.size(),
dst_write | in_place);
}
}
template <typename T>
class SumKernel<phi::GPUContext, T> : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto out_var = context.OutputVar("Out");
if (out_var->IsType<framework::LoDTensor>()) {
SumToLoDTensor<T>(context);
} else if (out_var->IsType<phi::SelectedRows>()) {
SelectedRowsCompute<phi::GPUContext, T>(context);
} else if (out_var->IsType<framework::LoDTensorArray>()) {
LodTensorArrayCompute<phi::GPUContext, T>(context);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Expected type of Output(out) must be Tensor, SelectedRows or "
"LodTensorArray. But got "
"unsupport type: %s.",
framework::ToTypeName(out_var->Type())));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(sum,
ops::SumKernel<phi::GPUContext, float>,
ops::SumKernel<phi::GPUContext, double>,
ops::SumKernel<phi::GPUContext, int>,
ops::SumKernel<phi::GPUContext, int64_t>,
ops::SumKernel<phi::GPUContext, plat::float16>,
ops::SumKernel<phi::GPUContext, plat::bfloat16>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using SelectedRows = phi::SelectedRows;
using LoDTensor = framework::LoDTensor;
template <typename T,
int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
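// Merges all SelectedRows inputs into Output(Out) with MergeAdd. When summing
// in place, the first input is first copied into a temporary so its rows are
// not overwritten while they are still being read.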
template <typename DeviceContext, typename T>
void SelectedRowsCompute(const framework::ExecutionContext &context) {
auto in_vars = context.MultiInputVar("X");
auto out_var = context.OutputVar("Out");
bool in_place = out_var == in_vars[0];
if (in_place && in_vars.size() < 2) {
return;
}
std::vector<const phi::SelectedRows *> inputs;
SelectedRows temp_in0;
if (in_place) {
auto &in0 = in_vars[0]->Get<phi::SelectedRows>();
temp_in0.set_height(in0.height());
temp_in0.set_rows(in0.rows());
framework::TensorCopy(in0.value(),
in0.place(),
context.device_context(),
temp_in0.mutable_value());
inputs.push_back(&temp_in0);
for (size_t i = 1; i < in_vars.size(); ++i) {
auto &in = in_vars[i]->Get<phi::SelectedRows>();
if (in.rows().size() > 0) {
inputs.push_back(&in);
}
}
} else {
for (auto &in_var : in_vars) {
auto &in = in_var->Get<phi::SelectedRows>();
if (in.rows().size() > 0) {
inputs.push_back(&in_var->Get<phi::SelectedRows>());
}
}
}
auto *out = context.Output<phi::SelectedRows>("Out");
out->mutable_rows()->clear();
bool has_data = false;
for (auto &in : inputs) {
if (in->rows().size() > 0) {
has_data = true;
break;
}
}
if (has_data) {
math::scatter::MergeAdd<DeviceContext, T> merge_add;
merge_add(context.template device_context<DeviceContext>(), inputs, out);
out->SyncIndex();
} else {
    // No data: just set an empty output tensor.
out->mutable_value()->mutable_data<T>(phi::make_ddim({0}),
context.GetPlace());
}
}
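// Element-wise sum over LoDTensorArray inputs: an input slot is copied into an
// empty output slot, otherwise it is accumulated into the existing slot with
// Eigen after checking that the LoD information matches.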
template <typename DeviceContext, typename T>
void LodTensorArrayCompute(const framework::ExecutionContext &context) {
auto in_vars = context.MultiInputVar("X");
auto out_var = context.OutputVar("Out");
bool in_place = out_var == in_vars[0];
auto &out_array = *out_var->GetMutable<framework::LoDTensorArray>();
for (size_t i = in_place ? 1 : 0; i < in_vars.size(); ++i) {
PADDLE_ENFORCE_EQ(in_vars[i]->IsType<framework::LoDTensorArray>(),
true,
platform::errors::InvalidArgument(
"Only support all inputs are TensorArray, "
"but inputs[%d] is not TensorArray.",
i));
auto &in_array = in_vars[i]->Get<framework::LoDTensorArray>();
for (size_t i = 0; i < in_array.size(); ++i) {
if (in_array[i].IsInitialized() && (in_array[i].numel() != 0)) {
if (i >= out_array.size()) {
out_array.resize(i + 1);
}
if (!out_array[i].IsInitialized() || (out_array[i].numel() == 0)) {
framework::TensorCopy(in_array[i],
in_array[i].place(),
context.device_context(),
&out_array[i]);
out_array[i].set_lod(in_array[i].lod());
} else {
PADDLE_ENFORCE_EQ(
out_array[i].lod(),
in_array[i].lod(),
platform::errors::InvalidArgument(
"The lod message between inputs[%d] and"
" outputs[%d] must be same, but now is not same.",
i,
i));
auto in = EigenVector<T>::Flatten(in_array[i]);
auto result = EigenVector<T>::Flatten(out_array[i]);
result.device(*context.template device_context<DeviceContext>()
.eigen_device()) = result + in;
}
}
}
}
}
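// Generic SumKernel: dispatches on the output variable type; for dense
// outputs it accumulates LoDTensor inputs with Eigen and adds SelectedRows
// inputs via SelectedRowsAddToTensor.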
template <typename DeviceContext, typename T>
class SumKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
VLOG(10) << "start sum kernel";
auto in_vars = context.MultiInputVar("X");
size_t in_num = in_vars.size();
auto out_var = context.OutputVar("Out");
bool in_place = out_var == in_vars[0];
if (out_var->IsType<framework::LoDTensor>()) {
auto *out = out_var->GetMutable<framework::LoDTensor>();
auto *out_ptr = out->mutable_data<T>(context.GetPlace());
if (in_num >= 1 && in_vars[0]->IsType<framework::LoDTensor>() &&
in_vars[0]->Get<framework::LoDTensor>().IsInitialized()) {
auto &in_0_tensor = in_vars[0]->Get<framework::LoDTensor>();
if (in_0_tensor.numel() > 0) {
in_place = (in_0_tensor.data<T>() == out_ptr);
}
}
auto result = EigenVector<T>::Flatten(*out);
auto &place =
*context.template device_context<DeviceContext>().eigen_device();
int start = in_place ? 1 : 0;
if (!in_place) {
if ((in_num >= 2) && in_vars[0]->IsType<framework::LoDTensor>() &&
in_vars[1]->IsType<framework::LoDTensor>() &&
in_vars[0]->Get<framework::LoDTensor>().IsInitialized() &&
in_vars[1]->Get<framework::LoDTensor>().IsInitialized()) {
auto &in_0 = in_vars[0]->Get<framework::LoDTensor>();
auto &in_1 = in_vars[1]->Get<framework::LoDTensor>();
if (in_0.numel() && in_1.numel()) {
auto in_0_e = EigenVector<T>::Flatten(in_0);
auto in_1_e = EigenVector<T>::Flatten(in_1);
result.device(place) = in_0_e + in_1_e;
start = 2;
}
}
if (start != 2) {
VLOG(10) << "Fill with constant = 0 in sum kernel.";
phi::funcs::SetConstant<DeviceContext, T> constant_functor;
constant_functor(context.template device_context<DeviceContext>(),
out,
static_cast<T>(0));
}
}
math::SelectedRowsAddToTensor<DeviceContext, T> functor;
// If in_place, just skip the first tensor
for (size_t i = start; i < in_num; i++) {
if (in_vars[i]->IsType<framework::LoDTensor>()) {
auto &in_t = in_vars[i]->Get<framework::LoDTensor>();
if (!in_t.IsInitialized() || in_t.numel() == 0) {
continue;
}
auto in = EigenVector<T>::Flatten(in_t);
result.device(place) = result + in;
} else if (in_vars[i]->IsType<phi::SelectedRows>()) {
auto &in_t = in_vars[i]->Get<phi::SelectedRows>();
functor(context.template device_context<DeviceContext>(), in_t, out);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Expected type of Input(X) of %d-th must be Tensor, "
"SelectedRows. But got "
"unsupport type: %s.",
framework::ToTypeName(in_vars[i]->Type())));
}
}
} else if (out_var->IsType<phi::SelectedRows>()) {
SelectedRowsCompute<DeviceContext, T>(context);
} else if (out_var->IsType<framework::LoDTensorArray>()) {
LodTensorArrayCompute<DeviceContext, T>(context);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Expected type of Output(out) must be Tensor, SelectedRows, "
"LoDTensorArray. But got "
"unsupport type: %s.",
framework::ToTypeName(out_var->Type())));
}
VLOG(10) << "end sum kernel";
}
};
} // namespace operators
} // namespace paddle
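The dense branch above has a fast path that fuses the first two initialized inputs into one Eigen expression (setting start = 2) and then folds the remaining inputs in one at a time. The standalone sketch below illustrates that accumulation pattern with plain Eigen tensors; the names are illustrative and it does not use Paddle's EigenVector or DeviceContext helpers.

// eigen_sum_sketch.cc -- illustrative only.
#include <unsupported/Eigen/CXX11/Tensor>

#include <iostream>
#include <vector>

int main() {
  const int n = 8;
  // Three rank-1 inputs, all filled with 1.0f.
  std::vector<Eigen::Tensor<float, 1>> inputs(3, Eigen::Tensor<float, 1>(n));
  for (auto &t : inputs) t.setConstant(1.0f);

  Eigen::Tensor<float, 1> result(n);
  // Fuse the first two inputs into one expression, mirroring the start = 2
  // fast path in SumKernel ...
  result = inputs[0] + inputs[1];
  // ... then fold in the remaining inputs one by one, as the main loop does.
  for (size_t i = 2; i < inputs.size(); ++i) {
    result = result + inputs[i];
  }

  std::cout << result(0) << std::endl;  // prints 3
  return 0;
}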
......@@ -12,13 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/sum_op.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using SelectedRows = phi::SelectedRows;
using LoDTensor = framework::LoDTensor;
template <typename DeviceContext, typename T>
class SumMLUKernel : public framework::OpKernel<T> {
......
......@@ -16,13 +16,16 @@ limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/operators/sum_op.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using SelectedRows = phi::SelectedRows;
using LoDTensor = framework::LoDTensor;
template <typename DeviceContext, typename T>
class SumNPUKernel : public framework::OpKernel<T> {
......
......@@ -13,14 +13,16 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/operators/sum_op.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device/xpu/xpu_header.h"
namespace paddle {
namespace operators {
using framework::Tensor;
using SelectedRows = phi::SelectedRows;
using LoDTensor = framework::LoDTensor;
template <typename DeviceContext, typename T>
class SumXPUKernel : public framework::OpKernel<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
......