Commit dbe08e9b authored by yuguo960516yuguo

2.4.2

parent b5499578
@@ -115,7 +115,6 @@ class PD_INFER_DECL PaddlePassBuilder {
  /// \cond Protected
  std::vector<std::string> analysis_passes_{
      {"ir_graph_build_pass",
-      "ir_graph_clean_pass",
       "ir_analysis_pass",
       "ir_params_sync_among_devices_pass",
       "adjust_cudnn_workspace_size_pass",
......
@@ -294,15 +294,6 @@ class TensorRTEngine {
  nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); }
  nvinfer1::IExecutionContext* context() {
-#ifndef PADDLE_WITH_TESTING
-    PADDLE_ENFORCE_GT(
-        predictor_id_per_thread,
-        -1,
-        platform::errors::InvalidArgument(
-            "thread local var predictor_id_per_thread must be "
-            "initialized to >= 0, but now predictor_id_per_thread = %d",
-            predictor_id_per_thread));
-#endif
    std::unique_lock<std::mutex> lock(mutex_);
    if (infer_context_.find(predictor_id_per_thread) == infer_context_.end()) {
      PADDLE_ENFORCE_NOT_NULL(
@@ -329,15 +320,6 @@ class TensorRTEngine {
  int GetProfileIndex() {
    if (max_profile_num_ > 1) {
-#ifndef PADDLE_WITH_TESTING
-      PADDLE_ENFORCE_GT(
-          predictor_id_per_thread,
-          -1,
-          platform::errors::InvalidArgument(
-              "thread local var predictor_id_per_thread must be "
-              "initialized to >= 0, but now predictor_id_per_thread = %d",
-              predictor_id_per_thread));
-#endif
      std::unique_lock<std::mutex> lock(mutex_);
      return profile_index_[predictor_id_per_thread];
    } else {
@@ -356,15 +338,6 @@ class TensorRTEngine {
        infer_engine_,
        platform::errors::InvalidArgument(
            "You should build engine first and then set the context."));
-#ifndef PADDLE_WITH_TESTING
-    PADDLE_ENFORCE_GT(
-        predictor_id_per_thread,
-        -1,
-        platform::errors::InvalidArgument(
-            "thread local var predictor_id_per_thread must be "
-            "initialized to >= 0, but now predictor_id_per_thread = %d",
-            predictor_id_per_thread));
-#endif
    std::unique_lock<std::mutex> lock(mutex_);
    infer_context_[predictor_id_per_thread].reset(nullptr);
    infer_context_.erase(predictor_id_per_thread);
......
@@ -639,8 +639,12 @@ struct SimpleOpTypeSetTeller : public Teller {
      int axis = desc.HasAttr("axis")
                     ? PADDLE_GET_CONST(int64_t, desc.GetAttr("axis"))
                     : -1;
-      bool flatten = PADDLE_GET_CONST(bool, desc.GetAttr("flatten"));
-      int dtype = PADDLE_GET_CONST(int, desc.GetAttr("dtype"));
+      bool flatten = desc.HasAttr("flatten")
+                         ? PADDLE_GET_CONST(bool, desc.GetAttr("flatten"))
+                         : false;
+      int dtype = desc.HasAttr("dtype")
+                      ? PADDLE_GET_CONST(int, desc.GetAttr("dtype"))
+                      : 3;
      if (axis == 0 || flatten || dtype != 2) return false;
    }
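A note on the magic numbers above: the dtype attribute is encoded with framework::proto::VarType, where, to the best of my recollection, 2 denotes INT32 and 3 denotes INT64. Under that reading, the teller only offers arg_max/arg_min to TensorRT when the output dtype is INT32, and a missing dtype attribute now defaults to INT64 and is therefore rejected.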
@@ -1708,8 +1712,10 @@ struct SimpleOpTypeSetTeller : public Teller {
        return false;
      }
    } else {
-#if !IS_TRT_VERSION_GE(8000)
-      VLOG(3) << "The version of TRT must be greater than 8000";
+#if (IS_TRT_VERSION_GE(8000) && IS_TRT_VERSION_LT(8100)) || \
+    (IS_TRT_VERSION_LT(7200))
+      VLOG(3) << "There are some bugs in v8.0.* and the versions lower than "
+                 "v7.2 are not supported";
      return false;
#endif
    }
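As a reading aid for the thresholds above: IS_TRT_VERSION_GE / IS_TRT_VERSION_LT compare an integer derived from the TensorRT version macros, so 8000, 8100 and 7200 stand for TensorRT 8.0, 8.1 and 7.2, and the new condition skips TensorRT 8.0.x as well as anything older than 7.2. A rough sketch of how such guards are typically defined (recalled from Paddle's TensorRT helper header; the exact form may differ):

// Hypothetical reconstruction of the version-guard macros.
#define TRT_VERSION                                     \
  (NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 + \
   NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD)
#define IS_TRT_VERSION_GE(version) ((TRT_VERSION) >= (version))
#define IS_TRT_VERSION_LT(version) ((TRT_VERSION) < (version))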
......
@@ -104,6 +104,7 @@ bool PluginArgumentMappingContext::IsSelectedRowsInput(
    const std::string& name) const {
  return false;
}
bool PluginArgumentMappingContext::IsSparseCooTensorInput(
    const std::string& name) const {
  return false;
@@ -112,6 +113,11 @@ bool PluginArgumentMappingContext::IsSparseCsrTensorInput(
    const std::string& name) const {
  return false;
}
+bool PluginArgumentMappingContext::IsSelectedRowsInputs(
+    const std::string& name) const {
+  return false;
+}
bool PluginArgumentMappingContext::IsDenseTensorVectorInput(
    const std::string& name) const {
  return false;
......
@@ -50,6 +50,8 @@ class PluginArgumentMappingContext : public ::phi::ArgumentMappingContext {
  bool IsSparseCsrTensorInput(const std::string& name) const override;
+  bool IsSelectedRowsInputs(const std::string& name) const override;
  bool IsDenseTensorVectorInput(const std::string& name) const override;
  bool IsDenseTensorOutput(const std::string& name) const override;
......
@@ -416,6 +416,9 @@ download_result(${ERNIE_INSTALL_DIR} "Ernie_result.txt.tar.gz"
if(WITH_GPU)
  inference_analysis_api_test(test_analyzer_ernie ${ERNIE_INSTALL_DIR}
                              analyzer_ernie_tester.cc)
+  inference_analysis_api_test(gpu_ernie_half_test ${ERNIE_INSTALL_DIR}
+                              gpu_ernie_half_test.cc)
+  set_tests_properties(gpu_ernie_half_test PROPERTIES TIMEOUT 60)
endif()
inference_analysis_api_int8_test(test_analyzer_ernie_int8 ${ERNIE_INSTALL_DIR}
                                 analyzer_ernie_int8_tester.cc)
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
using paddle::PaddleTensor;
template <typename T>
void GetValueFromStream(std::stringstream *ss, T *t) {
(*ss) >> (*t);
}
template <>
void GetValueFromStream<std::string>(std::stringstream *ss, std::string *t) {
*t = ss->str();
}
// Split string to vector
template <typename T>
void Split(const std::string &line, char sep, std::vector<T> *v) {
std::stringstream ss;
T t;
for (auto c : line) {
if (c != sep) {
ss << c;
} else {
GetValueFromStream<T>(&ss, &t);
v->push_back(std::move(t));
ss.str({});
ss.clear();
}
}
if (!ss.str().empty()) {
GetValueFromStream<T>(&ss, &t);
v->push_back(std::move(t));
ss.str({});
ss.clear();
}
}
// Parse tensor from string
template <typename T>
bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) {
std::vector<std::string> data;
Split(field, ':', &data);
if (data.size() < 2) return false;
std::string shape_str = data[0];
std::vector<int> shape;
Split(shape_str, ' ', &shape);
std::string mat_str = data[1];
std::vector<T> mat;
Split(mat_str, ' ', &mat);
tensor->shape = shape;
auto size =
std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()) *
sizeof(T);
tensor->data.Resize(size);
std::copy(mat.begin(), mat.end(), static_cast<T *>(tensor->data.data()));
tensor->dtype = GetPaddleDType<T>();
return true;
}
// Parse input tensors from string
bool ParseLine(const std::string &line,
std::vector<paddle::PaddleTensor> *tensors) {
std::vector<std::string> fields;
Split(line, ';', &fields);
tensors->clear();
tensors->reserve(4);
int i = 0;
auto input_name = FLAGS_ernie_large ? "eval_placeholder_" : "placeholder_";
for (; i < 3; i++) {
paddle::PaddleTensor temp;
ParseTensor<int64_t>(fields[i], &temp);
temp.name = input_name + std::to_string(i);
tensors->push_back(temp);
}
// input_mask
paddle::PaddleTensor input_mask;
ParseTensor<float>(fields[i], &input_mask);
input_mask.name = input_name + std::to_string(i);
tensors->push_back(input_mask);
return true;
}
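For reference, the parsing above implies that each input line holds one sample as ';'-separated fields, where every field is a space-separated shape, a ':', and the space-separated flattened values; the first three fields become int64 id tensors and the fourth the float input_mask. A hypothetical toy line (made-up values, real samples are much longer) would look like:

1 3:101 2040 102;1 3:0 0 0;1 3:0 1 2;1 3 1:1.0 1.0 1.0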
bool LoadInputData(std::vector<std::vector<paddle::PaddleTensor>> *inputs,
int batch_size = 1) {
if (FLAGS_infer_data.empty()) {
LOG(ERROR) << "please set input data path";
return false;
}
std::ifstream fin(FLAGS_infer_data);
std::string line;
int sample = 0;
  // The unit-test dataset has only 10 samples; each sample has 5 feeds.
while (std::getline(fin, line)) {
std::vector<paddle::PaddleTensor> feed_data;
ParseLine(line, &feed_data);
inputs->push_back(std::move(feed_data));
sample++;
if (!FLAGS_test_all_data && sample == batch_size) break;
}
LOG(INFO) << "number of samples: " << sample;
return true;
}
// Compare results
TEST(Ernie_gpu_fp16_no_ir, compare_results) {
AnalysisConfig config;
config.SetModel(FLAGS_infer_model);
config.EnableUseGpu(512, 0, paddle_infer::PrecisionType::kHalf);
config.SwitchIrOptim(false);
auto predictor = CreatePaddlePredictor(config);
std::vector<std::vector<PaddleTensor>> input_slots_all;
LoadInputData(&input_slots_all);
std::ifstream fin(FLAGS_refer_result);
std::string line;
std::vector<float> ref;
while (std::getline(fin, line)) {
Split(line, ' ', &ref);
}
std::vector<PaddleTensor> outputs;
for (size_t i = 0; i < input_slots_all.size(); i++) {
outputs.clear();
predictor->Run(input_slots_all[i], &outputs);
auto output = outputs.front();
size_t outputs_size = 1;
for (auto dim : output.shape) {
outputs_size *= dim;
}
float *result = reinterpret_cast<float *>(output.data.data());
for (size_t j = 0; j < outputs_size; ++j) {
EXPECT_NEAR(ref[i * outputs_size + j], result[j], 8e-3);
}
}
}
// Compare results
TEST(Ernie_gpu_fp16_with_ir, compare_results) {
AnalysisConfig config;
config.SetModel(FLAGS_infer_model);
config.EnableUseGpu(512, 0, paddle_infer::PrecisionType::kHalf);
config.SwitchIrOptim(true);
// There is a problem with the model itself, which has nothing to do with
// constant_folding_pass.
config.pass_builder()->DeletePass("constant_folding_pass");
auto predictor = CreatePaddlePredictor(config);
std::vector<std::vector<PaddleTensor>> input_slots_all;
LoadInputData(&input_slots_all);
std::ifstream fin(FLAGS_refer_result);
std::string line;
std::vector<float> ref;
while (std::getline(fin, line)) {
Split(line, ' ', &ref);
}
std::vector<PaddleTensor> outputs;
for (size_t i = 0; i < input_slots_all.size(); i++) {
outputs.clear();
predictor->Run(input_slots_all[i], &outputs);
auto output = outputs.front();
size_t outputs_size = 1;
for (auto dim : output.shape) {
outputs_size *= dim;
}
float *result = reinterpret_cast<float *>(output.data.data());
for (size_t j = 0; j < outputs_size; ++j) {
EXPECT_NEAR(ref[i * outputs_size + j], result[j], 2e-2);
}
}
}
// Compare results
TEST(Ernie_gpu_bf16_no_ir, compare_results) {
AnalysisConfig config;
config.SetModel(FLAGS_infer_model);
config.EnableUseGpu(512, 0, paddle_infer::PrecisionType::kBf16);
config.SwitchIrOptim(false);
auto predictor = CreatePaddlePredictor(config);
std::vector<std::vector<PaddleTensor>> input_slots_all;
LoadInputData(&input_slots_all);
std::ifstream fin(FLAGS_refer_result);
std::string line;
std::vector<float> ref;
while (std::getline(fin, line)) {
Split(line, ' ', &ref);
}
std::vector<PaddleTensor> outputs;
for (size_t i = 0; i < input_slots_all.size(); i++) {
outputs.clear();
predictor->Run(input_slots_all[i], &outputs);
auto output = outputs.front();
size_t outputs_size = 1;
for (auto dim : output.shape) {
outputs_size *= dim;
}
float *result = reinterpret_cast<float *>(output.data.data());
for (size_t j = 0; j < outputs_size; ++j) {
EXPECT_NEAR(ref[i * outputs_size + j], result[j], 1e-2);
}
}
}
// Compare results
TEST(Ernie_gpu_bf16_with_ir, compare_results) {
AnalysisConfig config;
config.SetModel(FLAGS_infer_model);
config.EnableUseGpu(512, 0, paddle_infer::PrecisionType::kBf16);
config.SwitchIrOptim(true);
// There is a problem with the model itself, which has nothing to do with
// constant_folding_pass.
config.pass_builder()->DeletePass("constant_folding_pass");
auto predictor = CreatePaddlePredictor(config);
std::vector<std::vector<PaddleTensor>> input_slots_all;
LoadInputData(&input_slots_all);
std::ifstream fin(FLAGS_refer_result);
std::string line;
std::vector<float> ref;
while (std::getline(fin, line)) {
Split(line, ' ', &ref);
}
std::vector<PaddleTensor> outputs;
for (size_t i = 0; i < input_slots_all.size(); i++) {
outputs.clear();
predictor->Run(input_slots_all[i], &outputs);
auto output = outputs.front();
size_t outputs_size = 1;
for (auto dim : output.shape) {
outputs_size *= dim;
}
float *result = reinterpret_cast<float *>(output.data.data());
for (size_t j = 0; j < outputs_size; ++j) {
EXPECT_NEAR(ref[i * outputs_size + j], result[j], 5e-3);
}
}
}
} // namespace inference
} // namespace paddle
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -12,15 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include <cuda_runtime.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
-#include <cstring>
-#include <numeric>
#include "gflags/gflags.h"
-#include "paddle/fluid/inference/tests/api/trt_test_helper.h"
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle_infer {
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/barrier_op.h"
#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
#endif
namespace paddle {
namespace operators {
template <typename T>
class BarrierOpMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_CNCL)
auto in = ctx.Input<phi::DenseTensor>("X");
auto out = ctx.Output<phi::DenseTensor>("Out");
auto place = ctx.GetPlace();
cnclDataType_t dtype =
platform::ToCNCLDataType(framework::TransToProtoVarType(in->dtype()));
int64_t numel = in->numel();
const void* sendbuff = in->data();
void* recvbuff = out->mutable_data<T>(place);
int rid = ctx.Attr<int>("ring_id");
auto cncl_comm = platform::CNCLCommContext::Instance().Get(rid, place);
auto* comm = cncl_comm->comm();
auto comm_stream = cncl_comm->stream();
auto& dev_ctx =
ctx.template device_context<paddle::platform::MLUDeviceContext>();
cnclReduceOp_t cncl_red_type = cnclSum;
dev_ctx.Wait();
PADDLE_ENFORCE_MLU_SUCCESS(cnclAllReduce(
sendbuff, recvbuff, numel, dtype, cncl_red_type, comm, comm_stream));
PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(comm_stream));
#else
PADDLE_THROW(platform::errors::Unavailable(
"PaddlePaddle should compile with CNCL."));
#endif
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(barrier, ops::BarrierOpMLUKernel<int>);
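For orientation: the MLU barrier above is implemented as a small cnclAllReduce on the input tensor followed by cnrtQueueSync on the communicator stream, so each rank blocks until every peer in the ring has entered the op; no separate barrier primitive is used.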
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allgather_op.h"
+#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/collective_helper.h"
@@ -27,15 +28,14 @@ template <typename T>
class CAllGatherOpMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto place = ctx.GetPlace();
-    auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
#if defined(PADDLE_WITH_CNCL)
-    auto x = ctx.Input<framework::Tensor>("X");
-    auto out = ctx.Output<framework::Tensor>("Out");
-    cnclDataType_t dtype =
-        platform::ToCNCLDataType(framework::TransToProtoVarType(x->dtype()));
+    auto x = ctx.Input<phi::DenseTensor>("X");
+    auto out = ctx.Output<phi::DenseTensor>("Out");
    int nranks = ctx.Attr<int>("nranks");
    int rid = ctx.Attr<int>("ring_id");
+    auto place = ctx.GetPlace();
    auto comm = platform::CNCLCommContext::Instance().Get(rid, place);
    PADDLE_ENFORCE_EQ(
        nranks,
@@ -48,19 +48,56 @@ class CAllGatherOpMLUKernel : public framework::OpKernel<T> {
    out->mutable_data<T>(out_dims, place);
    uint32_t send_numel = x->numel();
-    void* send_buff = reinterpret_cast<void*>(const_cast<T*>(x->data<T>()));
-    void* recv_buff = reinterpret_cast<void*>(out->data<T>());
+    void* send_buff;
+    void* recv_buff;
+    phi::DenseTensor in_tensor, out_tensor;
+    if (framework::TransToProtoVarType(x->dtype()) ==
+        framework::proto::VarType::INT64) {
+      // cast from int64 to int32 since cncl do not support int64
+      in_tensor.mutable_data<int32_t>(x->dims(), place);
+      out_tensor.mutable_data<int32_t>(out->dims(), place);
+      MLUCnnlTensorDesc x_int64_desc(*x);
+      MLUCnnlTensorDesc x_int32_desc(in_tensor);
+      cnnlCastDataType_t cast_type = GetCastDataType(VT::INT64, VT::INT32);
+      MLUCnnl::Cast(ctx,
+                    cast_type,
+                    x_int64_desc.get(),
+                    GetBasePtr(x),
+                    x_int32_desc.get(),
+                    GetBasePtr(&in_tensor));
+      send_buff = reinterpret_cast<void*>(in_tensor.data<int32_t>());
+      recv_buff = reinterpret_cast<void*>(out_tensor.data<int32_t>());
+    } else {
+      in_tensor.ShareDataWith(*x);
+      out_tensor.ShareDataWith(*out);
+      send_buff = reinterpret_cast<void*>(in_tensor.data<T>());
+      recv_buff = reinterpret_cast<void*>(out_tensor.data<T>());
+    }
    mluStream stream = nullptr;
    if (ctx.Attr<bool>("use_calc_stream")) {
+      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
      stream = static_cast<platform::MLUDeviceContext*>(dev_ctx)->stream();
    } else {
      stream = comm->stream();
    }
+    cnclDataType_t dtype = platform::ToCNCLDataType(
+        framework::TransToProtoVarType(in_tensor.dtype()));
    PADDLE_ENFORCE_MLU_SUCCESS(cnclAllGather(
        send_buff, recv_buff, send_numel, dtype, comm->comm(), stream));
+    if (framework::TransToProtoVarType(x->dtype()) ==
+        framework::proto::VarType::INT64) {
+      // cast back from int64 out_tensor to out
+      MLUCnnlTensorDesc out_int64_desc(*out);
+      MLUCnnlTensorDesc out_int32_desc(out_tensor);
+      cnnlCastDataType_t cast_type = GetCastDataType(VT::INT32, VT::INT64);
+      MLUCnnl::Cast(ctx,
+                    cast_type,
+                    out_int32_desc.get(),
+                    GetBasePtr(&out_tensor),
+                    out_int64_desc.get(),
+                    GetBasePtr(out));
+    }
#else
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "PaddlePaddle should compile with MLU."));
@@ -80,4 +117,5 @@ REGISTER_OP_MLU_KERNEL(c_allgather,
                       ops::CAllGatherOpMLUKernel<int>,
                       ops::CAllGatherOpMLUKernel<int8_t>,
                       ops::CAllGatherOpMLUKernel<int16_t>,
+                       ops::CAllGatherOpMLUKernel<int64_t>,
                       ops::CAllGatherOpMLUKernel<plat::float16>);
@@ -42,19 +42,23 @@ if(WITH_XPU)
  detection_library(iou_similarity_op SRCS iou_similarity_op.cc
                    iou_similarity_op_xpu.cc)
  detection_library(prior_box_op SRCS prior_box_op.cc)
+  detection_library(yolo_box_op SRCS yolo_box_op.cc)
  detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc)
elseif(WITH_MLU)
  detection_library(iou_similarity_op SRCS iou_similarity_op.cc
                    iou_similarity_op_mlu.cc)
-  detection_library(prior_box_op SRCS prior_box_op.cc)
+  detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_mlu.cc)
+  detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op_mlu.cc)
elseif(WITH_ASCEND_CL)
  detection_library(iou_similarity_op SRCS iou_similarity_op.cc
                    iou_similarity_op_npu.cc)
  detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_npu.cc)
+  detection_library(yolo_box_op SRCS yolo_box_op.cc)
else()
  detection_library(iou_similarity_op SRCS iou_similarity_op.cc
                    iou_similarity_op.cu)
  detection_library(prior_box_op SRCS prior_box_op.cc)
+  detection_library(yolo_box_op SRCS yolo_box_op.cc)
  # detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc)
endif()
@@ -73,7 +77,6 @@ detection_library(locality_aware_nms_op SRCS locality_aware_nms_op.cc DEPS gpc)
detection_library(matrix_nms_op SRCS matrix_nms_op.cc DEPS gpc)
detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu)
detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc)
-detection_library(yolo_box_op SRCS yolo_box_op.cc)
detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc
                  box_decoder_and_assign_op.cu)
detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/detection/prior_box_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class PriorBoxMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("Input");
auto* image = ctx.Input<phi::DenseTensor>("Image");
auto* boxes = ctx.Output<phi::DenseTensor>("Boxes");
auto* variances = ctx.Output<phi::DenseTensor>("Variances");
float step_w = ctx.Attr<float>("step_w");
float step_h = ctx.Attr<float>("step_h");
float offset = ctx.Attr<float>("offset");
bool clip = ctx.Attr<bool>("clip");
bool min_max_aspect_ratios_order =
ctx.Attr<bool>("min_max_aspect_ratios_order");
int im_width = image->dims()[3];
int im_height = image->dims()[2];
int width = input->dims()[3];
int height = input->dims()[2];
auto aspect_ratios = ctx.Attr<std::vector<float>>("aspect_ratios");
bool flip = ctx.Attr<bool>("flip");
std::vector<float> new_aspect_ratios;
ExpandAspectRatios(aspect_ratios, flip, &new_aspect_ratios);
auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
phi::DenseTensor ratios;
paddle::framework::TensorFromVector(new_aspect_ratios, dev_ctx, &ratios);
MLUOpTensorDesc new_aspect_ratios_desc(ratios);
auto min_sizes = ctx.Attr<std::vector<float>>("min_sizes");
phi::DenseTensor min;
paddle::framework::TensorFromVector(min_sizes, dev_ctx, &min);
MLUOpTensorDesc min_sizes_desc(min);
auto max_sizes = ctx.Attr<std::vector<float>>("max_sizes");
phi::DenseTensor max;
paddle::framework::TensorFromVector(max_sizes, dev_ctx, &max);
MLUOpTensorDesc max_sizes_desc(max);
auto variances_attr = ctx.Attr<std::vector<float>>("variances");
phi::DenseTensor var_tensor;
paddle::framework::TensorFromVector(variances_attr, dev_ctx, &var_tensor);
MLUOpTensorDesc variances_attr_desc(var_tensor);
auto place = ctx.GetPlace();
boxes->mutable_data<T>(place);
variances->mutable_data<T>(place);
MLUOpTensorDesc var_desc(*variances);
MLUOpTensorDesc output_desc(*boxes);
MLUOP::OpPriorBox(ctx,
min_sizes_desc.get(),
GetBasePtr(&min),
new_aspect_ratios_desc.get(),
GetBasePtr(&ratios),
variances_attr_desc.get(),
GetBasePtr(&var_tensor),
max_sizes_desc.get(),
GetBasePtr(&max),
height,
width,
im_height,
im_width,
step_h,
step_w,
offset,
clip,
min_max_aspect_ratios_order,
output_desc.get(),
GetBasePtr(boxes),
var_desc.get(),
GetBasePtr(variances));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(prior_box, ops::PriorBoxMLUKernel<float>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class YoloBoxMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* img_size = ctx.Input<phi::DenseTensor>("ImgSize");
auto* boxes = ctx.Output<phi::DenseTensor>("Boxes");
auto* scores = ctx.Output<phi::DenseTensor>("Scores");
const std::vector<int> anchors = ctx.Attr<std::vector<int>>("anchors");
auto class_num = ctx.Attr<int>("class_num");
auto conf_thresh = ctx.Attr<float>("conf_thresh");
auto downsample_ratio = ctx.Attr<int>("downsample_ratio");
auto clip_bbox = ctx.Attr<bool>("clip_bbox");
auto scale = ctx.Attr<float>("scale_x_y");
auto iou_aware = ctx.Attr<bool>("iou_aware");
auto iou_aware_factor = ctx.Attr<float>("iou_aware_factor");
int anchor_num = anchors.size() / 2;
int64_t size = anchors.size();
auto dim_x = x->dims();
int n = dim_x[0];
int s = anchor_num;
int h = dim_x[2];
int w = dim_x[3];
// The output of mluOpYoloBox: A 4-D tensor with shape [N, anchor_num, 4,
// H*W], the coordinates of boxes, and a 4-D tensor with shape [N,
// anchor_num, :attr:`class_num`, H*W], the classification scores of boxes.
std::vector<int64_t> boxes_dim_mluops({n, s, 4, h * w});
std::vector<int64_t> scores_dim_mluops({n, s, class_num, h * w});
// In Paddle framework: A 3-D tensor with shape [N, M, 4], the coordinates
// of boxes, and a 3-D tensor with shape [N, M, :attr:`class_num`], the
// classification scores of boxes.
std::vector<int64_t> boxes_out_dim({n, s, h * w, 4});
std::vector<int64_t> scores_out_dim({n, s, h * w, class_num});
auto& dev_ctx = ctx.template device_context<MLUDeviceContext>();
phi::DenseTensor boxes_tensor_mluops =
ctx.AllocateTmpTensor<T, MLUDeviceContext>({n, s, 4, h * w}, dev_ctx);
phi::DenseTensor scores_tensor_mluops =
ctx.AllocateTmpTensor<T, MLUDeviceContext>({n, s, class_num, h * w},
dev_ctx);
MLUOpTensorDesc boxes_trans_desc_mluops(
4, boxes_dim_mluops.data(), ToMluOpDataType<T>());
MLUCnnlTensorDesc boxes_trans_desc_cnnl(
4, boxes_dim_mluops.data(), ToCnnlDataType<T>());
MLUOpTensorDesc scores_trans_desc_mluops(
4, scores_dim_mluops.data(), ToMluOpDataType<T>());
MLUCnnlTensorDesc scores_trans_desc_cnnl(
4, scores_dim_mluops.data(), ToCnnlDataType<T>());
boxes->mutable_data<T>(ctx.GetPlace());
scores->mutable_data<T>(ctx.GetPlace());
FillMLUTensorWithHostValue(ctx, static_cast<T>(0), boxes);
FillMLUTensorWithHostValue(ctx, static_cast<T>(0), scores);
MLUOpTensorDesc x_desc(*x, MLUOP_LAYOUT_ARRAY, ToMluOpDataType<T>());
MLUOpTensorDesc img_size_desc(
*img_size, MLUOP_LAYOUT_ARRAY, ToMluOpDataType<int32_t>());
Tensor anchors_temp(framework::TransToPhiDataType(VT::INT32));
anchors_temp.Resize({size});
paddle::framework::TensorFromVector(
anchors, ctx.device_context(), &anchors_temp);
MLUOpTensorDesc anchors_desc(anchors_temp);
MLUCnnlTensorDesc boxes_desc_cnnl(
4, boxes_out_dim.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc scores_desc_cnnl(
4, scores_out_dim.data(), ToCnnlDataType<T>());
MLUOP::OpYoloBox(ctx,
x_desc.get(),
GetBasePtr(x),
img_size_desc.get(),
GetBasePtr(img_size),
anchors_desc.get(),
GetBasePtr(&anchors_temp),
class_num,
conf_thresh,
downsample_ratio,
clip_bbox,
scale,
iou_aware,
iou_aware_factor,
boxes_trans_desc_mluops.get(),
GetBasePtr(&boxes_tensor_mluops),
scores_trans_desc_mluops.get(),
GetBasePtr(&scores_tensor_mluops));
const std::vector<int> perm = {0, 1, 3, 2};
// transpose the boxes from [N, S, 4, H*W] to [N, S, H*W, 4]
MLUCnnl::Transpose(ctx,
perm,
4,
boxes_trans_desc_cnnl.get(),
GetBasePtr(&boxes_tensor_mluops),
boxes_desc_cnnl.get(),
GetBasePtr(boxes));
// transpose the scores from [N, S, class_num, H*W] to [N, S, H*W,
// class_num]
MLUCnnl::Transpose(ctx,
perm,
4,
scores_trans_desc_cnnl.get(),
GetBasePtr(&scores_tensor_mluops),
scores_desc_cnnl.get(),
GetBasePtr(scores));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(yolo_box, ops::YoloBoxMLUKernel<float>);
@@ -39,8 +39,17 @@ class DropoutMLUKernel : public framework::OpKernel<T> {
    MLUCnnlTensorDesc x_desc(*x);
    MLUCnnlTensorDesc out_desc(*out);
-    if (!is_test) {
-      // exec dropout op for training only.
+    if (is_test && is_upscale) {
+      // dropout op for inference: out = input.
+      framework::TensorCopy(
+          *x,
+          ctx.GetPlace(),
+          ctx.template device_context<platform::MLUDeviceContext>(),
+          out);
+      return;
+    } else if (!is_test) {
+      // dropout op for training: out = input * mask / ( 1.0 - dropout_prob ) or
+      // out = input * mask.
      int seed_data = 0;
      if (seed_tensor) {
        if (platform::is_mlu_place(seed_tensor->place())) {
@@ -79,50 +88,44 @@ class DropoutMLUKernel : public framework::OpKernel<T> {
      const int device_id = ctx.GetPlace().GetDeviceId();
      auto mlu_gen_random = GetMLURandomGenerator(ctx, device_id, seed_data);
-      const float prob = is_upscale ? dropout_prob : 0.0f;
+      // compute out = input * mask / ( 1.0 - dropout_prob )
      MLUCnnl::FusedDropout(ctx,
                            mlu_gen_random->get(),
                            x_desc.get(),
                            GetBasePtr(x),
-                            prob,
+                            dropout_prob,
                            GetBasePtr(&(mlu_gen_random->get_state())),
                            mask_desc.get(),
                            GetBasePtr(mask),
                            out_desc.get(),
                            GetBasePtr(out));
-    } else {
-      // exec dropout op for inference only.
-      if (is_upscale) {
-        framework::TensorCopy(
-            *x,
-            ctx.GetPlace(),
-            ctx.template device_context<platform::MLUDeviceContext>(),
-            out);
-      } else {
-        auto scale = static_cast<T>(1.0f - dropout_prob);
-        Tensor scale_tensor(x->dtype());
-        scale_tensor.mutable_data<T>({1}, ctx.GetPlace());
-        MLUCnnlTensorDesc scale_desc(scale_tensor);
-        MLUCnnl::Fill(ctx,
-                      CNNL_POINTER_MODE_HOST,
-                      &scale,
-                      scale_desc.get(),
-                      GetBasePtr(&scale_tensor));
-        auto data_type = ToCnnlDataType<T>();
-        MLUCnnlOpTensorDesc op_tensor_desc(
-            CNNL_OP_TENSOR_MUL, data_type, CNNL_NOT_PROPAGATE_NAN);
-        MLUCnnl::OpTensor(ctx,
-                          op_tensor_desc.get(),
-                          x_desc.get(),
-                          GetBasePtr(x),
-                          scale_desc.get(),
-                          GetBasePtr(&scale_tensor),
-                          out_desc.get(),
-                          GetBasePtr(out),
-                          data_type);
-      }
-    }
+      if (is_upscale) {
+        return;
+      }
+    }
+    // In downgrade_in_infer mode, need to multiply (1.0f - dropout_prob).
+    Tensor scale_tensor(x->dtype());
+    Tensor bias_tensor(x->dtype());
+    scale_tensor.mutable_data<T>({1}, ctx.GetPlace());
+    bias_tensor.mutable_data<T>({1}, ctx.GetPlace());
+    MLUCnnlTensorDesc scale_desc(scale_tensor);
+    MLUCnnlTensorDesc bias_desc(bias_tensor);
+    FillMLUTensorWithHostValue(
+        ctx, static_cast<T>(1.0f - dropout_prob), &scale_tensor);
+    FillMLUTensorWithHostValue(ctx, static_cast<T>(0.0f), &bias_tensor);
+    MLUCnnl::Scale(ctx,
+                   0,
+                   is_test ? x_desc.get() : out_desc.get(),
+                   is_test ? GetBasePtr(x) : GetBasePtr(out),
+                   scale_desc.get(),
+                   GetBasePtr(&scale_tensor),
+                   bias_desc.get(),
+                   GetBasePtr(&bias_tensor),
+                   out_desc.get(),
+                   GetBasePtr(out));
  }
};
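For readers less familiar with the two dropout modes the rewritten kernel distinguishes, here is a minimal host-side sketch of the reference semantics (my own illustration with made-up names, not Paddle code): upscale_in_train rescales kept values by 1/(1-p) during training and is an identity at inference, while downgrade_in_infer keeps training outputs unscaled and multiplies by (1-p) at inference.

#include <cstddef>
#include <vector>

// Reference dropout semantics (illustrative only).
// p is the dropout probability, mask[i] is 0 or 1.
std::vector<float> DropoutRef(const std::vector<float>& x,
                              const std::vector<int>& mask,
                              float p, bool is_test, bool upscale_in_train) {
  std::vector<float> out(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    if (is_test) {
      // Inference: identity for upscale_in_train, scale by (1 - p) otherwise.
      out[i] = upscale_in_train ? x[i] : x[i] * (1.0f - p);
    } else {
      // Training: drop according to the mask, rescale only in upscale mode.
      float kept = x[i] * mask[i];
      out[i] = upscale_in_train ? kept / (1.0f - p) : kept;
    }
  }
  return out;
}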
......
@@ -67,7 +67,7 @@ if(WITH_GPU OR WITH_ROCM)
    op_library(skip_layernorm_op)
    op_library(yolo_box_head_op)
    op_library(yolo_box_post_op)
-    op_library(fused_embedding_eltwise_layernorm_op)
+    op_library(fused_embedding_eltwise_layernorm_op DEPS bert_encoder_functor)
    op_library(fused_gate_attention_op)
    # fusion_group
    if(NOT APPLE AND NOT WIN32)
......
@@ -45,6 +45,14 @@ struct NormConvolutionArgs {
            int stride,
            int dilation,
            int group) {
+    PADDLE_ENFORCE_LT(
+        ctx.GetComputeCapability(),
+        90,
+        phi::errors::PreconditionNotMet(
+            "Expect compute compatiblity to be less than 90, but got %d. "
+            "CUDNN FusedOps is no longer available on H100 and later "
+            "devices.",
+            ctx.GetComputeCapability()));
    PADDLE_ENFORCE_EQ(
        input_shape.size(),
        4U,
......
@@ -442,7 +442,7 @@ TEST(CudnnNormConvFp16, K1S1) {
  phi::GPUContext *ctx = static_cast<phi::GPUContext *>(
      platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
-  if (ctx->GetComputeCapability() < 70) {
+  if (ctx->GetComputeCapability() < 70 || ctx->GetComputeCapability() >= 90) {
    ASSERT_THROW(test.CheckForward(1e-3, true),
                 paddle::platform::EnforceNotMet);
    ASSERT_THROW(test.CheckBackward(1e-3, true),
@@ -472,7 +472,7 @@ TEST(CudnnNormConvFp16, K3S1) {
  phi::GPUContext *ctx = static_cast<phi::GPUContext *>(
      platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
-  if (ctx->GetComputeCapability() < 70) {
+  if (ctx->GetComputeCapability() < 70 || ctx->GetComputeCapability() >= 90) {
    ASSERT_THROW(test.CheckForward(1e-3, true),
                 paddle::platform::EnforceNotMet);
    ASSERT_THROW(test.CheckBackward(1e-3, true),
@@ -502,7 +502,7 @@ TEST(CudnnNormConvFp16, K1S1O4) {
  phi::GPUContext *ctx = static_cast<phi::GPUContext *>(
      platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
-  if (ctx->GetComputeCapability() < 70) {
+  if (ctx->GetComputeCapability() < 70 || ctx->GetComputeCapability() >= 90) {
    ASSERT_THROW(test.CheckForward(1e-3, true),
                 paddle::platform::EnforceNotMet);
    ASSERT_THROW(test.CheckBackward(1e-3, true),
@@ -532,7 +532,7 @@ TEST(CudnnNormConvFp16, K1S2O4) {
  phi::GPUContext *ctx = static_cast<phi::GPUContext *>(
      platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
-  if (ctx->GetComputeCapability() <= 70) {
+  if (ctx->GetComputeCapability() <= 70 || ctx->GetComputeCapability() >= 90) {
    ASSERT_THROW(test.CheckForward(1e-3, true),
                 paddle::platform::EnforceNotMet);
    ASSERT_THROW(test.CheckBackward(1e-3), paddle::platform::EnforceNotMet);
......
@@ -256,8 +256,10 @@ template <typename T,
          int BlockSizeX,
          int BlockSizeY,
          int VecSize,
-          typename Functor>
-__global__ void FusedDropoutActBiasGrad(Functor act_grad,
+          typename Functor,
+          int THREADS_PER_CTA = BlockSizeX *BlockSizeY>
+__global__ __launch_bounds__(THREADS_PER_CTA) void FusedDropoutActBiasGrad(
+    Functor act_grad,
    const T *dout,
    const MaskType *mask,
    const T *src,
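The added __launch_bounds__(THREADS_PER_CTA) qualifier tells the compiler the maximum block size the kernel will ever be launched with, so it can budget registers for that bound instead of the hardware maximum. A minimal stand-alone illustration of the qualifier (my own example, not taken from the patch):

// Promise the compiler that at most kMaxThreads threads per block are used.
constexpr int kMaxThreads = 256;

__global__ __launch_bounds__(kMaxThreads) void ScaleKernel(float* data,
                                                           float alpha,
                                                           int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= alpha;  // simple element-wise scale
}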
......
@@ -223,13 +223,7 @@ __global__ void InplaceAddReluAddLayerNormKernel(const float16* y_data,
      // For layer_norm, reduce to calculate mean and std
      sum_i += static_cast<float>(tmp_3);
-#if defined(PADDLE_WITH_CUDA) && __CUDA_ARCH__ >= 530
-      square_sum_i += static_cast<float>(__hmul(tmp_3, tmp_3));
-#elif defined(PADDLE_WITH_CUDA)
      square_sum_i += static_cast<float>(tmp_3) * static_cast<float>(tmp_3);
-#else
-      square_sum_i += static_cast<float>(tmp_3 * tmp_3);
-#endif
    }
    auto pair = BlockReduce(temp_storage)
                    .Reduce(PairForLayerNorm<float>(sum_i, square_sum_i),
@@ -282,8 +276,8 @@ __global__ void InplaceAddReluAddLayerNormKernel(const float16* y_data,
      half tmp_0 = __hdiv(__hsub(save_ptr[save_index], mean_i), std_i);
      half tmp_1 = scale ? __hmul(scale[j], tmp_0) : tmp_0;
#else
-      half tmp_0 = static_cast<float>(static_cast<float>(save_ptr[save_index]) +
-                                      static_cast<float>(mean_i) /
+      half tmp_0 = static_cast<half>((static_cast<float>(save_ptr[save_index]) -
+                                      static_cast<float>(mean_i)) /
                                      static_cast<float>(std_i));
      half tmp_1 = scale ? static_cast<half>(static_cast<float>(scale[j]) *
                                             static_cast<float>(tmp_0))
@@ -400,19 +394,16 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel<T> {
    auto* out_data = dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
    auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(dev_ctx);
-    blas.GEMM(false,
-              false,
+    blas.GEMM(CblasNoTrans,
+              CblasNoTrans,
              M,
              N,
              K,
              static_cast<T>(1.0),
              x_data,
-              K,
              w_data,
-              N,
              static_cast<T>(0.0),
-              out_data,
-              N);
+              out_data);
    auto* y = ctx.Input<framework::Tensor>("Y");
    auto* bias_0 = ctx.Input<framework::Tensor>("Bias0");
    auto* bias_1 = ctx.Input<framework::Tensor>("Bias1");
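If I recall phi::funcs::Blas correctly, the CBLAS_TRANSPOSE overload of GEMM derives the leading dimensions itself (lda = K, ldb = N, ldc = N for row-major, non-transposed inputs), which is why the explicit K/N/N arguments could be dropped in the call above; treat this as my reading rather than a documented guarantee.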
......
@@ -139,9 +139,8 @@ class FusedGemmEpilogueOp : public framework::OperatorWithKernel {
    }
    ctx->SetOutputDim("Out", phi::make_ddim(out_dims));
-    // Note (Ming Huang): Reserve space of relu is a bit-mask,
-    // which cannot pass nan_and_inf checking if shape is set.
-    if (activation == "gelu" && ctx->HasOutput("ReserveSpace")) {
+    if (ctx->HasOutput("ReserveSpace")) {
      ctx->SetOutputDim("ReserveSpace", phi::make_ddim(out_dims));
    }
  }
......