#!/bin/bash
################################################################################
# Copyright 2016-2021 by SW Group, Chengdu Hygon IC Design Co., Ltd.
# All rights reserved. See COPYRIGHT for detailed information.
#
# @file set_env.sh
# @brief set env variables for running test.
#
# @author wangmingliang <wangmingliang@hygon.cn>
# @date 2022/03/23
# @history 1.0
################################################################################
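# Usage: source this script before building or running the tests, e.g.
#   source ./set_env.sh
# Note: this assumes the DTK toolkit is installed at /opt/dtk-21.04; adjust
# HYGON_ROCM_INSTALL below if your installation path differs.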
export HYGON_ROCM_INSTALL=/opt/dtk-21.04
# library path
# strip a leading or trailing colon from LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$(echo ${LD_LIBRARY_PATH} | sed 's/:$//; s/^://;')
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/lib:${HYGON_ROCM_INSTALL}/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/hip/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/hipblas/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/hipcub/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/hipfft/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/hiprand/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/hipsolver/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/hipsparse/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/hsa/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/llvm/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/miopen/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/miopengemm/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/oam/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/opencl/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rccl/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocalution/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocblas/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocclr/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocfft/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocm_smi/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocprim/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocprofiler/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocrand/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocsolver/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocsparse/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocthrust/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/roctracer/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export C_INCLUDE_PATH=${HYGON_ROCM_INSTALL}/rocrand/include:$C_INCLUDE_PATH
export CPLUS_INCLUDE_PATH=${HYGON_ROCM_INSTALL}/rocrand/include:$CPLUS_INCLUDE_PATH
export C_INCLUDE_PATH=${HYGON_ROCM_INSTALL}/hiprand/include:$C_INCLUDE_PATH
export CPLUS_INCLUDE_PATH=${HYGON_ROCM_INSTALL}/hiprand/include:$CPLUS_INCLUDE_PATH
# executable path
# strip a leading or trailing colon from PATH
export PATH=$(echo ${PATH} | sed 's/:$//; s/^://;')
export PATH=${HYGON_ROCM_INSTALL}/bin${PATH:+:${PATH}}
export PATH=${HYGON_ROCM_INSTALL}/hip/bin${PATH:+:${PATH}}
export PATH=${HYGON_ROCM_INSTALL}/llvm/bin${PATH:+:${PATH}}
export PATH=${HYGON_ROCM_INSTALL}/llvm/lib/clang/13.0.0/bin${PATH:+:${PATH}}
export PATH=${HYGON_ROCM_INSTALL}/miopen/bin${PATH:+:${PATH}}
export PATH=${HYGON_ROCM_INSTALL}/opencl/bin${PATH:+:${PATH}}
export PATH=${HYGON_ROCM_INSTALL}/rocprofiler/bin${PATH:+:${PATH}}
export PATH=/public/home/zhangqha/bladisc/hmmer/bin${PATH:+:${PATH}}
export PATH=/public/home/zhangqha/bladisc/hh-suite-master/build/bin${PATH:+:${PATH}}
export PATH=/public/home/zhangqha/bladisc/kalign/build/bin${PATH:+:${PATH}}
# component path
export ROCM_PATH=${HYGON_ROCM_INSTALL}
export HSA_PATH=${HYGON_ROCM_INSTALL}/hsa
export HIP_PATH=${HYGON_ROCM_INSTALL}/hip
export HIP_ROCCLR_HOME=${HYGON_ROCM_INSTALL}/hip
export HIP_LIB_PATH=${HYGON_ROCM_INSTALL}/hip/lib
export DEVICE_LIB_PATH=${HYGON_ROCM_INSTALL}/amdgcn/bitcode
export HIP_CLANG_PATH=${HYGON_ROCM_INSTALL}/llvm/bin
export HIP_RUNTIME="rocclr"
export HIP_COMPILER="clang"
#export CPLUS_INCLUDE_PATH=/public/software/apps/DeepLearning/PyTorch_Lib/gflags-2.1.2-build/include:/public/home/zhangqha/openssl_install/include:/public/home/zhangqha/openssl_install/include/openssl:/public/software/apps/DeepLearning/PyTorch_Lib/glog-build/include:${HIP_PATH}/include:$CPLUS_INCLUDE_PATH
#export INCLUDE=/public/software/apps/DeepLearning/PyTorch_Lib/gflags-2.1.2-build/include:/public/home/zhangqha/openssl_install/include:/public/home/zhangqha/openssl_install/include/openssl:/public/software/apps/DeepLearning/PyTorch_Lib/glog-build/include:${HIP_PATH}/include:$INCLUDE
#export C_INCLUDE_PATH=/public/software/apps/DeepLearning/PyTorch_Lib/gflags-2.1.2-build/include:/public/home/zhangqha/openssl_install/include:/public/home/zhangqha/openssl_install/include/openssl:/public/software/apps/DeepLearning/PyTorch_Lib/glog-build/include:${HIP_PATH}/include:$C_INCLUDE_PATH
# preparation
Build fmhalib.so according to the following PR (a minimal call sketch follows the commands below):
https://github.com/sneaxiy/apex/pull/1
# build the fmha op
python setup.py install
# unittest (needs improvement)
python test_fmha.py
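A minimal forward-call sketch, assuming `custom_setup_ops` has been installed by the step above; the shapes are illustrative and the argument order mirrors `test_fmha.py`:
```python
import numpy as np
import paddle
from custom_setup_ops import custom_fmha

# Two sequences of 64 tokens each: cu_seqlen holds cumulative lengths.
cu_seqlen = paddle.to_tensor(np.array([0, 64, 128]), dtype='int32')
# qkv is packed as [total_tokens, 3, num_heads, head_size] in fp16.
qkv = paddle.to_tensor(
    np.random.random((128, 3, 16, 64)).astype(np.float16), stop_gradient=False)
# max_seq_len must stay on the CPU (see the CHECK_INPUT note in the op source).
max_seq_len_host = paddle.to_tensor([512], dtype='int32', place=paddle.CPUPlace())
ctx_out, s_out = custom_fmha(qkv, cu_seqlen, max_seq_len_host, True, 0.1, False)
```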
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "paddle/extension.h"
#define CHECK_INPUT(x) \
PD_CHECK(x.place().GetType() == phi::AllocationType::GPU, \
#x " must be a GPU Tensor.")
std::vector<paddle::Tensor> fmha_cuda_forward(const paddle::Tensor& qkv,
const paddle::Tensor& cu_seqlen,
const paddle::Tensor& host_seqlen,
bool is_test,
float dropout_rate,
bool zero_tensors,
bool use_fmha_mke_opt);
std::vector<paddle::Tensor> fmha_cuda_backward(
const paddle::Tensor& qkv,
const paddle::Tensor& cu_seqlen,
const paddle::Tensor& host_seqlen,
const paddle::Tensor& softmax_input,
const paddle::Tensor& d_ctx_out,
bool is_test,
float dropout_rate,
bool zero_tensors,
bool use_fmha_mke_opt);
/*
*x_shape(fp16) = [total_tokens, 3, num_heads, head_size]
*y_shape(int32) = [batch_size + 1]
*/
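// e.g. with the unittest config (num_heads=16, head_size=64, max_seq_len=512):
// x = [total_tokens, 3, 16, 64], y = [batch_size + 1]
// -> ctx_out = [total_tokens, 16, 64], s_out = [batch_size, 16, 512, 512]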
std::vector<std::vector<int64_t>> FmhaInferShape(
const std::vector<int64_t>& x_shape,
const std::vector<int64_t>& y_shape,
const std::vector<int64_t>& host_y_shape,
const bool& is_test,
const float& dropout_rate,
const bool& zero_tensors,
const bool& use_fmha_mke_opt) {
int total = x_shape[0];
int num_heads = x_shape[2];
int head_size = x_shape[3];
int batch_size = y_shape[0] - 1;
if (x_shape[1] != 3) {
PD_THROW(
"The shape for input QKV should be [total_tokens, 3, num_heas, "
"head_size].");
}
int max_seq_len = 512;
std::vector<int64_t> ctx_out_shape = {total, num_heads, head_size};
std::vector<int64_t> s_out_shape = {
batch_size, num_heads, max_seq_len, max_seq_len};
return {ctx_out_shape, s_out_shape};
}
std::vector<paddle::DataType> FmhaInferDtype(paddle::DataType x_dtype,
paddle::DataType y_dtype,
paddle::DataType host_y_dtype) {
return {x_dtype, x_dtype};
}
std::vector<paddle::Tensor> FmhaCUDAForward(const paddle::Tensor& qkv,
const paddle::Tensor& cu_seqlen,
const paddle::Tensor& host_seqlen,
bool is_test,
float dropout_rate,
bool zero_tensors,
bool use_fmha_mke_opt) {
CHECK_INPUT(qkv);
CHECK_INPUT(cu_seqlen);
// Note: do not use CHECK_INPUT on host_seqlen (max_seq_len_host),
// because that would enforce this input to be a GPU tensor.
return fmha_cuda_forward(qkv,
cu_seqlen,
host_seqlen,
is_test,
dropout_rate,
zero_tensors,
use_fmha_mke_opt);
}
std::vector<paddle::Tensor> FmhaCUDABackward(
const paddle::Tensor& qkv,
const paddle::Tensor& cu_seqlen,
const paddle::Tensor& host_seqlen,
const paddle::Tensor& softmax_input,
const paddle::Tensor& d_ctx_out,
bool is_test,
float dropout_rate,
bool zero_tensors,
bool use_fmha_mke_opt) {
CHECK_INPUT(qkv);
CHECK_INPUT(cu_seqlen);
CHECK_INPUT(softmax_input);
CHECK_INPUT(d_ctx_out);
return fmha_cuda_backward(qkv,
cu_seqlen,
host_seqlen,
softmax_input,
d_ctx_out,
is_test,
dropout_rate,
zero_tensors,
use_fmha_mke_opt);
}
PD_BUILD_OP(custom_fmha)
.Inputs({"QKV", "CuSeqLen", "HostSeqLen"})
.Outputs({"CtxOut", "SOut"})
.Attrs({"is_test: bool",
"dropout_rate: float",
"zero_tensors: bool",
"use_fmha_mke_opt: bool"})
.SetKernelFn(PD_KERNEL(FmhaCUDAForward))
.SetInferShapeFn(PD_INFER_SHAPE(FmhaInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(FmhaInferDtype));
PD_BUILD_GRAD_OP(custom_fmha)
.Inputs({"QKV", "CuSeqLen", "HostSeqLen", "SOut", paddle::Grad("CtxOut")})
.Outputs({paddle::Grad("QKV")})
.Attrs({"is_test: bool",
"dropout_rate: float",
"zero_tensors: bool",
"use_fmha_mke_opt: bool"})
.SetKernelFn(PD_KERNEL(FmhaCUDABackward));
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from custom_setup_ops import custom_fmha
import numpy as np
total = 2
batch_size = 56
num_heads = 16
head_size = 64
is_training = True
max_seq_len = 512
dropout_rate = 0.1
cu_seqlen = np.arange(batch_size + 1)
cu_seqlen = np.cumsum(cu_seqlen)
total = cu_seqlen[-1]
#print("cu_seqlen", cu_seqlen)
#print("cu_seqlen[-1]", cu_seqlen[-1])
cu_seqlen = paddle.to_tensor(cu_seqlen)
cu_seqlen = paddle.cast(cu_seqlen, 'int32')
qkv = np.random.random((total, 3, num_heads, head_size)).astype(np.float16)
#print("qkv:", qkv)
qkv = paddle.to_tensor(qkv, stop_gradient=False)
max_seq_len_host = [max_seq_len]
max_seq_len_host = paddle.to_tensor(
max_seq_len_host, dtype='int32', place=paddle.CPUPlace())
ctx_out, s_out = custom_fmha(qkv, cu_seqlen, max_seq_len_host, is_training,
dropout_rate, False)
print("print ctx_out and s_out: ")
print(ctx_out)
print(s_out)
# backward.
print("print qkv.grad: ")
grad_ctx_dout = np.random.random(
(total, num_heads, head_size)).astype(np.float16)
grad_ctx_dout = paddle.to_tensor(grad_ctx_dout)
paddle.autograd.backward([ctx_out], [grad_ctx_dout], retain_graph=True)
print(qkv.grad)
# Function
Support gemm_nn/nt + bias with float16, float32, and float64 data types (a usage sketch follows the commands below).
# Correctness test
```
python test_fused_dense_op.py
```
# Performance test
```
python test_fused_dense_perf.py
```
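A minimal usage sketch, assuming `custom_setup_ops` has been built and installed; the shapes are illustrative and the call signature mirrors `test_fused_dense_op.py`:
```python
import numpy as np
import paddle
from custom_setup_ops import custom_fused_dense

x = paddle.to_tensor(
    np.random.random((56, 1024)).astype(np.float16), stop_gradient=False)
# transy=True, so the weight is stored as [out_feature, in_feature].
weight = paddle.to_tensor(
    np.random.random((4096, 1024)).astype(np.float16), stop_gradient=False)
bias = paddle.to_tensor(
    np.random.random((4096,)).astype(np.float16), stop_gradient=False)
# out = x @ weight.T + bias, shape [56, 4096]
out = custom_fused_dense(x, weight, bias, False, True)  # transx=False, transy=True
```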
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "paddle/extension.h"
// @x: [x, in_feature] or [xx, xx, in_feature]
// @y: [out_feature, in_feature]
// @out: [x, out_feature] or [xx, xx, out_feature]
// support transx=false, transy=true/false.
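// e.g. x: [56, 1024], y: [4096, 1024] with transy=true -> out: [56, 4096]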
std::vector<std::vector<int64_t>> FusedDenseInferShape(
const std::vector<int64_t>& x_shape,
const std::vector<int64_t>& y_shape,
const std::vector<int64_t>& bias_shape,
const bool& transx,
const bool& transy,
const bool& use_addto) {
int x_size = x_shape.size();
int x_m = 1;
for (int i = 0; i < (x_size - 1); i++) {
x_m *= x_shape[i];
}
int x_k = x_shape[x_size - 1];
int y_k = y_shape[0];
int y_n = y_shape[1];
if (transy) {
y_k = y_shape[1];
y_n = y_shape[0];
}
if (x_k != y_k) {
PD_THROW("The reudce dim of A and B in matmul is not equal.");
}
if (transx) {
PD_THROW("Only support cases: transx is False, transy are True/False.");
}
std::vector<int64_t> out_shape(x_shape);
out_shape[x_size - 1] = y_n;
return {out_shape};
}
std::vector<paddle::DataType> FusedDenseInferDtype(
paddle::DataType x_dtype,
paddle::DataType y_dtype,
paddle::DataType bias_dtype) {
return {x_dtype};
}
PD_BUILD_OP(custom_fused_dense)
.Inputs({"X", "Y", "Bias"})
.Outputs({"Out"})
.Attrs({"transx: bool", "transy: bool", "use_addto: bool"})
.SetInferShapeFn(PD_INFER_SHAPE(FusedDenseInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(FusedDenseInferDtype));
PD_BUILD_GRAD_OP(custom_fused_dense)
.Inputs({"X", "Y", "Bias", paddle::Grad("Out")})
.Outputs({paddle::Grad("X"), paddle::Grad("Y"), paddle::Grad("Bias")})
.Attrs({"transx: bool", "transy: bool"});
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#define CUBLAS_VERSION 13000
/* Includes, cuda */
#include <cublas_v2.h>
#include <cuda_runtime.h>
#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11000
// includes cublaslt
#include <cublasLt.h>
#endif
#include "paddle/extension.h"
#include "paddle/fluid/framework/custom_raw_op_kernel_func.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#define CHECK_CUBLAS_ERR(error_code) \
do { \
if (error_code != CUBLAS_STATUS_SUCCESS) { \
PD_THROW("cublas error code is ", error_code); \
} \
} while (0)
// TODO: the workspace is 4M elements; since it is allocated as a tensor of
// type T, the actual allocation is 4M * sizeof(T) bytes.
constexpr auto kWorkspaceSize = (1 << 22);
// FP64 wrapper around cublasGemmEx.
// TODO(limin): for FP64 compute, alpha and beta should in fact be double.
cublasStatus_t gemm_bias(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const float* alpha,
const double* A,
int lda,
const double* B,
int ldb,
const float* beta,
double* C,
int ldc) {
return cublasGemmEx(handle,
transa,
transb,
m,
n,
k,
alpha,
A,
CUDA_R_64F,
lda,
B,
CUDA_R_64F,
ldb,
beta,
C,
CUDA_R_64F,
ldc,
CUDA_R_64F,
CUBLAS_GEMM_DEFAULT);
}
// FP32 Wrapper around cublas GEMMEx
cublasStatus_t gemm_bias(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const float* alpha,
const float* A,
int lda,
const float* B,
int ldb,
const float* beta,
float* C,
int ldc) {
return cublasGemmEx(handle,
transa,
transb,
m,
n,
k,
alpha,
A,
CUDA_R_32F,
lda,
B,
CUDA_R_32F,
ldb,
beta,
C,
CUDA_R_32F,
ldc,
CUDA_R_32F,
CUBLAS_GEMM_DEFAULT);
}
// FP16 Tensor core wrapper around cublas GEMMEx
cublasStatus_t gemm_bias(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const float* alpha,
const paddle::float16* A,
int lda,
const paddle::float16* B,
int ldb,
const float* beta,
paddle::float16* C,
int ldc) {
return cublasGemmEx(handle,
transa,
transb,
m,
n,
k,
alpha,
A,
CUDA_R_16F,
lda,
B,
CUDA_R_16F,
ldb,
beta,
C,
CUDA_R_16F,
ldc,
CUDA_R_32F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP);
}
#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11600
// float16 and float32
template <typename T>
cublasStatus_t cublaslt_matmul_desc_init(
cublasLtMatmulDescOpaque_t* operationDesc) {
cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
status =
cublasLtMatmulDescInit(operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
return status;
}
// float64
template <>
cublasStatus_t cublaslt_matmul_desc_init<double>(
cublasLtMatmulDescOpaque_t* operationDesc) {
cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
status =
cublasLtMatmulDescInit(operationDesc, CUBLAS_COMPUTE_64F, CUDA_R_64F);
return status;
}
// float16 (primary template; float32/float64 specializations follow)
template <typename T>
cublasStatus_t set_cublaslt_matrix_layout_init(
cublasLtMatrixLayoutOpaque_t* Adesc,
cublasLtMatrixLayoutOpaque_t* Bdesc,
cublasLtMatrixLayoutOpaque_t* Cdesc,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
int lda,
int ldb,
int ldc) {
cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
status = cublasLtMatrixLayoutInit(Adesc,
CUDA_R_16F,
transa == CUBLAS_OP_N ? m : k,
transa == CUBLAS_OP_N ? k : m,
lda);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
status = cublasLtMatrixLayoutInit(Bdesc,
CUDA_R_16F,
transb == CUBLAS_OP_N ? k : n,
transb == CUBLAS_OP_N ? n : k,
ldb);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
status = cublasLtMatrixLayoutInit(Cdesc, CUDA_R_16F, m, n, ldc);
CLEANUP:
return status;
}
template <>
cublasStatus_t set_cublaslt_matrix_layout_init<float>(
cublasLtMatrixLayoutOpaque_t* Adesc,
cublasLtMatrixLayoutOpaque_t* Bdesc,
cublasLtMatrixLayoutOpaque_t* Cdesc,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
int lda,
int ldb,
int ldc) {
cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
status = cublasLtMatrixLayoutInit(Adesc,
CUDA_R_32F,
transa == CUBLAS_OP_N ? m : k,
transa == CUBLAS_OP_N ? k : m,
lda);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
status = cublasLtMatrixLayoutInit(Bdesc,
CUDA_R_32F,
transb == CUBLAS_OP_N ? k : n,
transb == CUBLAS_OP_N ? n : k,
ldb);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
status = cublasLtMatrixLayoutInit(Cdesc, CUDA_R_32F, m, n, ldc);
CLEANUP:
return status;
}
template <>
cublasStatus_t set_cublaslt_matrix_layout_init<double>(
cublasLtMatrixLayoutOpaque_t* Adesc,
cublasLtMatrixLayoutOpaque_t* Bdesc,
cublasLtMatrixLayoutOpaque_t* Cdesc,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
int lda,
int ldb,
int ldc) {
cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
status = cublasLtMatrixLayoutInit(Adesc,
CUDA_R_64F,
transa == CUBLAS_OP_N ? m : k,
transa == CUBLAS_OP_N ? k : m,
lda);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
status = cublasLtMatrixLayoutInit(Bdesc,
CUDA_R_64F,
transb == CUBLAS_OP_N ? k : n,
transb == CUBLAS_OP_N ? n : k,
ldb);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
status = cublasLtMatrixLayoutInit(Cdesc, CUDA_R_64F, m, n, ldc);
CLEANUP:
return status;
}
#endif
#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11000
template <typename T>
int gemm_bias_lt(cublasLtHandle_t ltHandle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const float* alpha, /* host pointer */
const T* A,
int lda,
const T* B,
int ldb,
const float* beta, /* host pointer */
T* C,
int ldc,
void* workspace,
size_t workspaceSize,
cudaStream_t stream,
bool use_bias,
const void* bias) {
cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
cublasLtMatmulDescOpaque_t operationDesc = {};
cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
cublasLtMatmulPreferenceOpaque_t preference = {};
int returnedResults = 0;
cublasLtMatmulHeuristicResult_t heuristicResult = {};
cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DEFAULT;
// Create operation descriptor; see cublasLtMatmulDescAttributes_t
// for details about defaults; here we just set the transforms for
// A and B.
status = cublaslt_matmul_desc_init<T>(&operationDesc);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
status = cublasLtMatmulDescSetAttribute(
&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
status = cublasLtMatmulDescSetAttribute(
&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transa));
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
if (use_bias) {
status = cublasLtMatmulDescSetAttribute(
&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(bias));
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
epilogue = CUBLASLT_EPILOGUE_BIAS;
}
status = cublasLtMatmulDescSetAttribute(&operationDesc,
CUBLASLT_MATMUL_DESC_EPILOGUE,
&epilogue,
sizeof(epilogue));
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
// Create matrix descriptors. Not setting any extra attributes.
status = set_cublaslt_matrix_layout_init<T>(
&Adesc, &Bdesc, &Cdesc, transa, transb, m, n, k, lda, ldb, ldc);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
// Create preference handle; In general, extra attributes can be
// used here to disable tensor ops or to make sure algo selected
// will work with badly aligned A, B, C. However, for simplicity
// here we assume A,B,C are always well aligned (e.g., directly
// come from cudaMalloc)
status = cublasLtMatmulPreferenceInit(&preference);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
status = cublasLtMatmulPreferenceSetAttribute(
&preference,
CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
&workspaceSize,
sizeof(workspaceSize));
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
// We just need the best available heuristic to try and run matmul.
// There is no guarantee that this will work. For example, if A is
// badly aligned, you can request more (e.g. 32) algos and try to
// run them one by one until something works.
status = cublasLtMatmulAlgoGetHeuristic(ltHandle,
&operationDesc,
&Adesc,
&Bdesc,
&Cdesc,
&Cdesc,
&preference,
1,
&heuristicResult,
&returnedResults);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
if (returnedResults == 0) {
status = CUBLAS_STATUS_NOT_SUPPORTED;
goto CLEANUP;
}
status = cublasLtMatmul(ltHandle,
&operationDesc,
alpha,
A,
&Adesc,
B,
&Bdesc,
beta,
C,
&Cdesc,
reinterpret_cast<void*>(C),
&Cdesc,
// &heuristicResult.algo,
NULL,
workspace,
workspaceSize,
stream);
CLEANUP:
// Descriptors are no longer needed as all GPU work was already
// enqueued.
PADDLE_ENFORCE_GPU_SUCCESS(status);
return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}
#endif
template <typename T>
static int linear_bias_cuda_forward_impl(
const paddle::platform::CUDADeviceContext& dev_ctx,
const T* input_data,
const T* weight_data,
const T* bias_data,
bool transx,
bool transy,
int in_features,
int batch_size,
int out_features,
T* output_data,
void* lt_workspace) {
auto handle = dev_ctx.cublas_handle();
auto stream = dev_ctx.stream();
const float alpha = 1.0;
const float beta_zero = 0.0;
const float beta_one = 1.0;
int status = 1;
// nt
cublasOperation_t transpose_x = CUBLAS_OP_T;
cublasOperation_t transpose_y = CUBLAS_OP_N;
if (transy) {
#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11000
status = gemm_bias_lt((cublasLtHandle_t)handle,
transpose_x,
transpose_y,
out_features,
batch_size,
in_features,
&alpha, /* host pointer */
weight_data,
in_features,
input_data,
in_features,
&beta_zero, /* host pointer */
output_data,
out_features,
lt_workspace,
kWorkspaceSize,
stream,
true,
bias_data);
#endif
if (status != 0) {
PD_THROW("cublaslt gemm_bias failed with error code ", status);
#if 0
output.copy_(bias);
status = gemm_bias(
handle,
transpose_x,
transpose_y,
out_features,
batch_size,
in_features,
&alpha,
weight,
in_features,
input_data,
in_features,
&beta_one,
output_data,
out_features);
#endif
}
} else {
// nn
transpose_x = CUBLAS_OP_N;
transpose_y = CUBLAS_OP_N;
#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11000
status = gemm_bias_lt((cublasLtHandle_t)handle,
transpose_x,
transpose_y,
out_features,
batch_size,
in_features,
&alpha, /* host pointer */
weight_data,
out_features,
input_data,
in_features,
&beta_zero, /* host pointer */
output_data,
out_features,
lt_workspace,
kWorkspaceSize,
stream,
true,
bias_data);
#endif
if (status != 0) {
PD_THROW("cublaslt gemm_bias failed with error code ", status);
}
}
return status;
}
template <typename T>
int gemm_bgradb_lt(cublasLtHandle_t ltHandle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const float* alpha, /* host pointer */
const T* A,
int lda,
const T* B,
int ldb,
const float* beta, /* host pointer */
T* C,
int ldc,
void* workspace,
size_t workspaceSize,
cudaStream_t stream,
bool use_bias,
const void* bgrad,
cublasLtEpilogue_t epilogue) {
cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
cublasLtMatmulDescOpaque_t operationDesc = {};
cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
cublasLtMatmulPreferenceOpaque_t preference = {};
int returnedResults = 0;
cublasLtMatmulHeuristicResult_t heuristicResult = {};
// cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DEFAULT;
// Create operation descriptor; see cublasLtMatmulDescAttributes_t
// for details about defaults; here we just set the transforms for
// A and B.
status = cublaslt_matmul_desc_init<T>(&operationDesc);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
status = cublasLtMatmulDescSetAttribute(
&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
status = cublasLtMatmulDescSetAttribute(
&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transa));
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
if (use_bias) {
status = cublasLtMatmulDescSetAttribute(&operationDesc,
CUBLASLT_MATMUL_DESC_BIAS_POINTER,
&bgrad,
sizeof(bgrad));
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
}
status = cublasLtMatmulDescSetAttribute(&operationDesc,
CUBLASLT_MATMUL_DESC_EPILOGUE,
&epilogue,
sizeof(epilogue));
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
// Create matrix descriptors. Not setting any extra attributes.
status = set_cublaslt_matrix_layout_init<T>(
&Adesc, &Bdesc, &Cdesc, transa, transb, m, n, k, lda, ldb, ldc);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
// Create preference handle; In general, extra attributes can be
// used here to disable tensor ops or to make sure algo selected
// will work with badly aligned A, B, C. However, for simplicity
// here we assume A,B,C are always well aligned (e.g., directly
// come from cudaMalloc)
status = cublasLtMatmulPreferenceInit(&preference);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
status = cublasLtMatmulPreferenceSetAttribute(
&preference,
CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
&workspaceSize,
sizeof(workspaceSize));
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
// We just need the best available heuristic to try and run matmul.
// There is no guarantee that this will work. For example, if A is
// badly aligned, you can request more (e.g. 32) algos and try to
// run them one by one until something works.
status = cublasLtMatmulAlgoGetHeuristic(ltHandle,
&operationDesc,
&Adesc,
&Bdesc,
&Cdesc,
&Cdesc,
&preference,
1,
&heuristicResult,
&returnedResults);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
if (returnedResults == 0) {
status = CUBLAS_STATUS_NOT_SUPPORTED;
goto CLEANUP;
}
status = cublasLtMatmul(ltHandle,
&operationDesc,
alpha,
A,
&Adesc,
B,
&Bdesc,
beta,
C,
&Cdesc,
C,
&Cdesc,
// &heuristicResult.algo,
NULL,
workspace,
workspaceSize,
stream);
CLEANUP:
// Descriptors are no longer needed as all GPU work was already
// enqueued.
return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}
template <typename T>
int linear_bias_cuda_backward_impl(
const paddle::platform::CUDADeviceContext& dev_ctx,
const T* input,
const T* weight,
const T* d_output,
bool transx,
bool transy,
bool use_addto,
int in_features,
int batch_size,
int out_features,
T* d_weight,
T* d_bias,
T* d_input,
void* lt_workspace) {
auto handle = dev_ctx.cublas_handle();
auto stream = dev_ctx.stream();
const float alpha = 1.0;
const float beta_zero = 0.0;
const float beta_one = 1.0;
int status = 1;
if (transy) {
#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11600
// cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DEFAULT;
cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_BGRADB;
status = gemm_bgradb_lt((cublasLtHandle_t)handle,
CUBLAS_OP_N,
CUBLAS_OP_T,
in_features,
out_features,
batch_size,
&alpha, /* host pointer */
input,
in_features,
d_output,
out_features,
&beta_zero, /* host pointer */
d_weight,
in_features,
lt_workspace,
kWorkspaceSize,
stream,
true,
static_cast<const void*>(d_bias),
epilogue);
#endif
if (status != 0) {
PD_THROW("cublaslt gemm_bias failed with error code ", status);
#if 0
status = gemm_bias(
handle,
CUBLAS_OP_N,
CUBLAS_OP_T,
in_features,
out_features,
batch_size,
&alpha,
input,
in_features,
d_output,
out_features,
&beta_zero,
d_weight,
in_features);
#endif
}
} else {
#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11600
cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_BGRADA;
status = gemm_bgradb_lt((cublasLtHandle_t)handle,
CUBLAS_OP_N,
CUBLAS_OP_T,
out_features,
in_features,
batch_size,
&alpha, /* host pointer */
d_output,
out_features,
input,
in_features,
&beta_zero, /* host pointer */
d_weight,
out_features,
lt_workspace,
kWorkspaceSize,
stream,
true,
static_cast<const void*>(d_bias),
epilogue);
#endif
if (status != 0) {
PD_THROW("cublaslt gemm_bias failed with error code ", status);
}
}
cublasOperation_t transpose_x = CUBLAS_OP_N;
cublasOperation_t transpose_y = CUBLAS_OP_N;
const float beta_dinput = (use_addto ? beta_one : beta_zero);
if (transy) {
status = gemm_bias(handle,
transpose_x,
transpose_y,
in_features,
batch_size,
out_features,
&alpha,
weight,
in_features,
d_output,
out_features,
&beta_dinput,
d_input,
in_features);
} else {
transpose_x = CUBLAS_OP_T;
transpose_y = CUBLAS_OP_N;
status = gemm_bias(handle,
transpose_x,
transpose_y,
in_features,
batch_size,
out_features,
&alpha,
weight,
out_features,
d_output,
out_features,
&beta_dinput,
d_input,
in_features);
}
return status;
}
__PD_DEFINE_RAW_OP_KERNEL_FUNC(custom_fused_dense, ctx) {
namespace f = paddle::framework;
const auto* x = ctx.Input<f::Tensor>("X");
const auto* y = ctx.Input<f::Tensor>("Y");
const auto* bias = ctx.Input<f::Tensor>("Bias");
auto* out = ctx.Output<f::Tensor>("Out");
bool transx = ctx.Attr<bool>("transx");
bool transy = ctx.Attr<bool>("transy");
auto& dev_ctx = ctx.cuda_device_context();
auto place = dev_ctx.GetPlace();
if (transx) {
PD_THROW("Attr(transx) must be False currently.");
}
const auto& x_dims = x->dims();
int x_m = 1;
for (int i = 0; i < x_dims.size() - 1; i++) {
x_m *= x_dims[i];
}
int x_k = x_dims[x_dims.size() - 1];
const auto& y_dims = y->dims();
int y_k = y_dims[0];
int y_n = y_dims[1];
if (transy) {
y_k = y_dims[1];
y_n = y_dims[0];
}
if (x_k != y_k) {
PD_THROW("The reudce dim of A and B in matmul is not equal.");
}
auto out_dims = x_dims;
out_dims[x_dims.size() - 1] = y_n;
out->Resize(out_dims);
f::Tensor lt_workspace;
lt_workspace.Resize({kWorkspaceSize});
PD_DISPATCH_FLOATING_AND_HALF_TYPES(
x->dtype(), "linear_bias_cuda_forward_impl", ([&] {
linear_bias_cuda_forward_impl<data_t>(
dev_ctx,
x->data<data_t>(),
y->data<data_t>(),
bias->data<data_t>(),
transx,
transy,
x_k,
x_m,
y_n,
out->mutable_data<data_t>(place),
lt_workspace.mutable_data<data_t>(place));
}));
}
__PD_DEFINE_RAW_OP_KERNEL_FUNC(custom_fused_dense_grad, ctx) {
namespace f = paddle::framework;
const auto* x = ctx.Input<f::Tensor>("X");
const auto* y = ctx.Input<f::Tensor>("Y");
const auto* grad_out = ctx.Input<f::Tensor>(f::GradVarName("Out"));
auto* grad_x = ctx.Output<f::Tensor>(f::GradVarName("X"));
auto* grad_y = ctx.Output<f::Tensor>(f::GradVarName("Y"));
auto* grad_bias = ctx.Output<f::Tensor>(f::GradVarName("Bias"));
bool transx = ctx.Attr<bool>("transx");
bool transy = ctx.Attr<bool>("transy");
bool use_addto = ctx.Attr<bool>("use_addto");
auto& dev_ctx = ctx.cuda_device_context();
auto place = dev_ctx.GetPlace();
if (transx) {
PD_THROW("Attr(transx) must be False currently.");
}
const auto& x_dims = x->dims();
int x_m = 1;
for (int i = 0; i < x_dims.size() - 1; i++) {
x_m *= x_dims[i];
}
int x_k = x_dims[x_dims.size() - 1];
const auto& y_dims = y->dims();
int y_k = y_dims[0];
int y_n = y_dims[1];
if (transy) {
y_k = y_dims[1];
y_n = y_dims[0];
}
if (x_k != y_k) {
PD_THROW("The reudce dim of A and B in matmul is not equal.");
}
grad_x->Resize(x_dims);
grad_y->Resize(y_dims);
grad_bias->Resize({y_n});
f::Tensor lt_workspace;
lt_workspace.Resize({kWorkspaceSize});
#if defined(CUDA_VERSION) && CUDA_VERSION < 11000
PD_THROW(
"fused_dense_cuda_backward is not supported on cuda_version < 11000");
#endif
PD_DISPATCH_FLOATING_AND_HALF_TYPES(
x->dtype(), "linear_bias_cuda_backward_impl", ([&] {
linear_bias_cuda_backward_impl<data_t>(
dev_ctx,
x->data<data_t>(),
y->data<data_t>(),
grad_out->data<data_t>(),
transx,
transy,
use_addto,
x_k,
x_m,
y_n,
grad_y->mutable_data<data_t>(place),
grad_bias->mutable_data<data_t>(place),
grad_x->mutable_data<data_t>(place),
lt_workspace.mutable_data<data_t>(place));
}));
}
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from custom_setup_ops import custom_fused_dense
import numpy as np
iters = 1
def test_fused_dense_op(x, weight, bias, transx, transy, grad_out, atol):
# x * weight^t + bias
def run_paddle_mm_bias(x, weight, bias, transx, transy, grad_out):
pp_x = paddle.to_tensor(x, stop_gradient=False)
pp_weight = paddle.to_tensor(weight, stop_gradient=False)
pp_bias = paddle.to_tensor(bias, stop_gradient=False)
pp_out = paddle.matmul(pp_x, pp_weight, transx, transy)
pp_bias_out = paddle.add(pp_out, pp_bias)
pp_grad_out = paddle.to_tensor(grad_out)
paddle.autograd.backward(
[pp_bias_out], [pp_grad_out], retain_graph=True)
return pp_bias_out, pp_x.grad, pp_weight.grad, pp_bias.grad
def run_custom_fuse_dense(x, weight, bias, transx, transy, grad_out):
x_tensor = paddle.to_tensor(x, stop_gradient=False)
weight_tensor = paddle.to_tensor(weight, stop_gradient=False)
bias_tensor = paddle.to_tensor(bias, stop_gradient=False)
out_tensor = custom_fused_dense(x_tensor, weight_tensor, bias_tensor,
transx, transy)
grad_out_tensor = paddle.to_tensor(grad_out)
paddle.autograd.backward(
[out_tensor], [grad_out_tensor], retain_graph=True)
return out_tensor, x_tensor.grad, weight_tensor.grad, bias_tensor.grad
'''
def run_numpy_mm_bias(x, weight, bias, transx, transy):
out = np.matmul(x, weight.transpose(1, 0))
out = out + bias
return out
'''
def run_ref_backward(x, weight, bias, transx, transy, grad_out):
x_tensor = paddle.to_tensor(x, stop_gradient=False)
weight_tensor = paddle.to_tensor(weight, stop_gradient=False)
grad_out_tensor = paddle.to_tensor(grad_out)
# d_weight: x * grad_out^t (nt)
# d_input: weight * grad_out (nn)
if transy:
ref_grad_weight = paddle.matmul(grad_out_tensor, x_tensor, True,
False)
ref_grad_input = paddle.matmul(grad_out_tensor, weight_tensor,
False, False)
else:
ref_grad_weight = paddle.matmul(x_tensor, grad_out_tensor, True,
False)
ref_grad_input = paddle.matmul(grad_out_tensor, weight_tensor,
False, True)
ref_grad_bias = paddle.sum(grad_out_tensor, axis=0)
return ref_grad_input, ref_grad_weight, ref_grad_bias
'''
ref_out = run_numpy_mm_bias(x, weight, bias, transx, transy)
'''
ref_pp_out, ref_pp_x_grad, ref_pp_weight_grad, ref_pp_bias_grad = run_paddle_mm_bias(
x, weight, bias, transx, transy, grad_out)
#print("ref_pp_x_grad shape: ", ref_pp_x_grad.shape)
custom_out, x_grad, weight_grad, bias_grad = run_custom_fuse_dense(
x, weight, bias, transx, transy, grad_out)
#print("x_grad shape: ", x_grad.shape)
new_ref_grad_input, new_ref_grad_weight, new_ref_grad_bias = run_ref_backward(
x, weight, bias, transx, transy, grad_out)
# check out
'''
np.testing.assert_allclose(
ref_out, custom_out.numpy(),
rtol=1e-5, atol=atol)
'''
np.testing.assert_allclose(
ref_pp_out.numpy(), custom_out.numpy(), rtol=1e-5, atol=atol)
# check grad
np.testing.assert_allclose(
ref_pp_x_grad.numpy(), x_grad.numpy(), rtol=1e-5, atol=atol)
np.testing.assert_allclose(
new_ref_grad_weight.numpy(), weight_grad.numpy(), rtol=1e-5, atol=atol)
np.testing.assert_allclose(
new_ref_grad_bias.numpy(), bias_grad.numpy(), rtol=1e-5, atol=atol)
def generate_input_data(m, dtype=np.float16):
# index = np.random.randint(0, 5, (m))
data = np.random.random((m)).astype(dtype)
for i in range(m):
#index[i] = 1
#data[i] = 1.0/(np.exp2(index[i]))
if i % 2 == 0:
data[i] = 0.25
elif i % 3 == 0:
data[i] = 0.5
else:
data[i] = 0.0625
return data
def generate_fixed_input(x_m, in_feature, out_feature, transy, dtype):
x = generate_input_data(x_m * in_feature, dtype)
x = x.reshape(x_m, in_feature)
weight = generate_input_data(out_feature * in_feature, dtype)
if transy:
weight = weight.reshape(out_feature, in_feature)
else:
weight = weight.reshape(in_feature, out_feature)
bias = generate_input_data(out_feature, dtype)
grad_out = generate_input_data(x_m * out_feature, dtype)
grad_out = grad_out.reshape(x_m, out_feature)
return x, weight, bias, grad_out
def generate_ones_input(x_m, in_feature, out_feature, transy, dtype):
x = np.ones(x_m * in_feature).astype(dtype)
x = x.reshape(x_m, in_feature)
weight = np.ones(out_feature * in_feature).astype(dtype)
if transy:
weight = weight.reshape(out_feature, in_feature)
else:
weight = weight.reshape(in_feature, out_feature)
bias = np.ones(out_feature).astype(dtype)
# bias = np.zeros(out_feature).astype(dtype)
grad_out = np.ones(x_m * out_feature).astype(dtype)
grad_out = grad_out.reshape(x_m, out_feature)
return x, weight, bias, grad_out
def test_driver(i=0,
x_m=56,
in_feature=4,
out_feature=8,
transx=False,
transy=True,
dtype=np.float16,
atol=1e-2):
for i in range(iters):
if i == 0:
x, weight, bias, grad_out = generate_ones_input(
x_m, in_feature, out_feature, transy, dtype)
elif i == 1:
x, weight, bias, grad_out = generate_fixed_input(
x_m, in_feature, out_feature, transy, dtype)
else:
x = np.random.random((x_m, in_feature)).astype(dtype)
if transy:
weight = np.random.random(
(out_feature, in_feature)).astype(dtype)
else:
weight = np.random.random(
(in_feature, out_feature)).astype(dtype)
bias = np.random.random((out_feature)).astype(dtype)
grad_out = np.random.random((x_m, out_feature)).astype(dtype)
test_fused_dense_op(x, weight, bias, transx, transy, grad_out, atol)
## Note: mlperf config: x_m from xx to 28672, in_feature is 1024, out_feature is 1024/4096.
for i in range(3):
print("Begin Test ", i)
if i == 0 or i == 1:
fp16_atol = 1e-5
atol = 1e-5
else:
fp16_atol = 0.3
atol = 1e-3
#####################################################
## nt
## random input: an atol of 0.2 is not enough here; it must be set to 0.3
print("gemm_nt + bias test: ")
test_driver(
i,
x_m=56,
in_feature=1024,
out_feature=1024,
transx=False,
transy=True,
atol=fp16_atol)
test_driver(
i,
x_m=56,
in_feature=1024,
out_feature=4096,
transx=False,
transy=True,
atol=fp16_atol)
test_driver(
i,
x_m=1000,
in_feature=1024,
out_feature=1024,
transx=False,
transy=True,
atol=fp16_atol)
## for the 0.0625 fixed input, fp16's max error is 0.03125
test_driver(
i,
x_m=2000,
in_feature=1024,
out_feature=1024,
transx=False,
transy=True,
dtype=np.float32,
atol=atol)
test_driver(
i,
x_m=28672,
in_feature=1024,
out_feature=1024,
transx=False,
transy=True,
dtype=np.float32,
atol=atol)
test_driver(
i,
x_m=28672,
in_feature=1024,
out_feature=4096,
transx=False,
transy=True,
dtype=np.float32,
atol=atol)
#####################################################
## nn
print("gemm_nn + bias test: ")
test_driver(
i,
x_m=2,
in_feature=1,
out_feature=4,
transx=False,
transy=False,
dtype=np.float32,
atol=atol)
test_driver(
i,
x_m=56,
in_feature=1024,
out_feature=1024,
transx=False,
transy=False,
dtype=np.float32,
atol=atol)
test_driver(
i,
x_m=56,
in_feature=1024,
out_feature=4096,
transx=False,
transy=False,
dtype=np.float32,
atol=atol)
test_driver(
i,
x_m=28672,
in_feature=1024,
out_feature=1024,
transx=False,
transy=False,
dtype=np.float32,
atol=atol)
test_driver(
i,
x_m=28672,
in_feature=1024,
out_feature=4096,
transx=False,
transy=False,
dtype=np.float32,
atol=atol)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from custom_setup_ops import custom_fused_dense
import numpy as np
import time
iters = 100
place = paddle.CUDAPlace(0)
def test_fused_dense_op(x, weight, bias, transx, transy, grad_out, atol):
# x * weight^t + bias
def run_paddle_mm_bias(x, weight, bias, transx, transy, grad_out):
pp_x = paddle.to_tensor(x, stop_gradient=False)
pp_weight = paddle.to_tensor(weight, stop_gradient=False)
pp_bias = paddle.to_tensor(bias, stop_gradient=False)
pp_grad_out = paddle.to_tensor(grad_out)
# warmup
pp_out = paddle.matmul(pp_x, pp_weight, transx, transy)
pp_bias_out = paddle.add(pp_out, pp_bias)
paddle.autograd.backward(
[pp_bias_out], [pp_grad_out], retain_graph=True)
paddle.device.cuda.synchronize(place)
t1 = time.time()
for i in range(iters):
pp_out = paddle.matmul(pp_x, pp_weight, transx, transy)
pp_bias_out = paddle.add(pp_out, pp_bias)
paddle.autograd.backward(
[pp_bias_out], [pp_grad_out], retain_graph=True)
paddle.device.cuda.synchronize(place)
t2 = time.time()
print("paddle gemm time is: (ms)", (t2 - t1) * 1000)
return pp_bias_out, pp_x.grad, pp_weight.grad, pp_bias.grad
def run_custom_fuse_dense(x, weight, bias, transx, transy, grad_out):
x_tensor = paddle.to_tensor(x, stop_gradient=False)
weight_tensor = paddle.to_tensor(weight, stop_gradient=False)
bias_tensor = paddle.to_tensor(bias, stop_gradient=False)
grad_out_tensor = paddle.to_tensor(grad_out)
# warmup
out_tensor = custom_fused_dense(x_tensor, weight_tensor, bias_tensor,
transx, transy)
paddle.autograd.backward(
[out_tensor], [grad_out_tensor], retain_graph=True)
paddle.device.cuda.synchronize(place)
t3 = time.time()
for i in range(iters):
out_tensor = custom_fused_dense(x_tensor, weight_tensor,
bias_tensor, transx, transy)
paddle.autograd.backward(
[out_tensor], [grad_out_tensor], retain_graph=True)
paddle.device.cuda.synchronize(place)
t4 = time.time()
print("cublaslt gemm time is: (ms)", (t4 - t3) * 1000)
return out_tensor, x_tensor.grad, weight_tensor.grad, bias_tensor.grad
def run_ref_backward(x, weight, bias, transx, transy, grad_out):
x_tensor = paddle.to_tensor(x, stop_gradient=False)
weight_tensor = paddle.to_tensor(weight, stop_gradient=False)
grad_out_tensor = paddle.to_tensor(grad_out)
# d_weight: x * grad_out^t (nt)
# d_input: weight * grad_out (nn)
if transy:
ref_grad_weight = paddle.matmul(grad_out_tensor, x_tensor, True,
False)
ref_grad_input = paddle.matmul(grad_out_tensor, weight_tensor,
False, False)
else:
ref_grad_weight = paddle.matmul(x_tensor, grad_out_tensor, True,
False)
ref_grad_input = paddle.matmul(grad_out_tensor, weight_tensor,
False, True)
ref_grad_bias = paddle.sum(grad_out_tensor, axis=0)
return ref_grad_input, ref_grad_weight, ref_grad_bias
#print("ref_pp_x_grad shape: ", ref_pp_x_grad.shape)
custom_out, x_grad, weight_grad, bias_grad = run_custom_fuse_dense(
x, weight, bias, transx, transy, grad_out)
ref_pp_out, ref_pp_x_grad, ref_pp_weight_grad, ref_pp_bias_grad = run_paddle_mm_bias(
x, weight, bias, transx, transy, grad_out)
def generate_input_data(m, dtype=np.float16):
# index = np.random.randint(0, 5, (m))
data = np.random.random((m)).astype(dtype)
for i in range(m):
#index[i] = 1
#data[i] = 1.0/(np.exp2(index[i]))
if i % 2 == 0:
data[i] = 0.25
elif i % 3 == 0:
data[i] = 0.5
else:
data[i] = 0.0625
return data
def generate_fixed_input(x_m, in_feature, out_feature, transy, dtype):
x = generate_input_data(x_m * in_feature, dtype)
x = x.reshape(x_m, in_feature)
weight = generate_input_data(out_feature * in_feature, dtype)
if transy:
weight = weight.reshape(out_feature, in_feature)
else:
weight = weight.reshape(in_feature, out_feature)
bias = generate_input_data(out_feature, dtype)
grad_out = generate_input_data(x_m * out_feature, dtype)
grad_out = grad_out.reshape(x_m, out_feature)
return x, weight, bias, grad_out
def generate_ones_input(x_m, in_feature, out_feature, transy, dtype):
x = np.ones(x_m * in_feature).astype(dtype)
x = x.reshape(x_m, in_feature)
weight = np.ones(out_feature * in_feature).astype(dtype)
if transy:
weight = weight.reshape(out_feature, in_feature)
else:
weight = weight.reshape(in_feature, out_feature)
bias = np.ones(out_feature).astype(dtype)
# bias = np.zeros(out_feature).astype(dtype)
grad_out = np.ones(x_m * out_feature).astype(dtype)
grad_out = grad_out.reshape(x_m, out_feature)
return x, weight, bias, grad_out
def test_driver(i=0,
x_m=56,
in_feature=4,
out_feature=8,
transx=False,
transy=True,
dtype=np.float16,
atol=1e-2):
print("m, in_feature, out_feature = ", x_m, in_feature, out_feature)
if i == 0:
x, weight, bias, grad_out = generate_ones_input(
x_m, in_feature, out_feature, transy, dtype)
elif i == 1:
x, weight, bias, grad_out = generate_fixed_input(
x_m, in_feature, out_feature, transy, dtype)
else:
x = np.random.random((x_m, in_feature)).astype(dtype)
if transy:
weight = np.random.random((out_feature, in_feature)).astype(dtype)
else:
weight = np.random.random((in_feature, out_feature)).astype(dtype)
bias = np.random.random((out_feature)).astype(dtype)
grad_out = np.random.random((x_m, out_feature)).astype(dtype)
test_fused_dense_op(x, weight, bias, transx, transy, grad_out, atol)
print("\n")
## Note: mlperf config: x_m from xx to 28672, in_feature is 1024, out_feature is 1024/4096.
i = 0
fp16_atol = 1e-5
atol = 1e-5
data_type = np.float16
###############################################################################################################
## nt
## random input: an atol of 0.2 is not enough here; it must be set to 0.3
print("gemm_nt + bias test: ")
test_driver(
i,
x_m=56,
in_feature=1024,
out_feature=1024,
transx=False,
transy=True,
dtype=data_type,
atol=atol)
test_driver(
i,
x_m=1000,
in_feature=1024,
out_feature=1024,
transx=False,
transy=True,
dtype=data_type,
atol=atol)
test_driver(
i,
x_m=2000,
in_feature=1024,
out_feature=1024,
transx=False,
transy=True,
dtype=data_type,
atol=atol)
test_driver(
i,
x_m=4000,
in_feature=1024,
out_feature=1024,
transx=False,
transy=True,
dtype=data_type,
atol=atol)
test_driver(
i,
x_m=8000,
in_feature=1024,
out_feature=1024,
transx=False,
transy=True,
dtype=data_type,
atol=atol)
test_driver(
i,
x_m=16000,
in_feature=1024,
out_feature=1024,
transx=False,
transy=True,
dtype=data_type,
atol=atol)
test_driver(
i,
x_m=28672,
in_feature=1024,
out_feature=1024,
transx=False,
transy=True,
dtype=data_type,
atol=atol)
test_driver(
i,
x_m=28672,
in_feature=1024,
out_feature=4096,
transx=False,
transy=True,
dtype=data_type,
atol=atol)
###############################################################################################################
## nn
print("gemm_nn + bias test: ")
test_driver(
i,
x_m=56,
in_feature=1024,
out_feature=1024,
transx=False,
transy=False,
dtype=data_type,
atol=atol)
test_driver(
i,
x_m=1000,
in_feature=1024,
out_feature=1024,
transx=False,
transy=False,
dtype=data_type,
atol=atol)
test_driver(
i,
x_m=2000,
in_feature=1024,
out_feature=1024,
transx=False,
transy=False,
dtype=data_type,
atol=atol)
test_driver(
i,
x_m=4000,
in_feature=1024,
out_feature=1024,
transx=False,
transy=False,
dtype=data_type,
atol=atol)
test_driver(
i,
x_m=8000,
in_feature=1024,
out_feature=1024,
transx=False,
transy=False,
dtype=data_type,
atol=atol)
test_driver(
i,
x_m=16000,
in_feature=1024,
out_feature=1024,
transx=False,
transy=False,
dtype=data_type,
atol=atol)
test_driver(
i,
x_m=28672,
in_feature=1024,
out_feature=1024,
transx=False,
transy=False,
dtype=data_type,
atol=atol)
test_driver(
i,
x_m=28672,
in_feature=1024,
out_feature=4096,
transx=False,
transy=False,
dtype=data_type,
atol=atol)
# function
This fused op implements the following logic (a usage sketch follows the TODO list below):
```
layer_norm(residual + dropout(input))
```
# Accuracy test
python test_fused_dropout_op.py
# Perf test
python test_fused_dropout_perf.py
# TODO
1. The unittest only checks accuracy when dropout_rate is 0.
2. How to set is_test to true for dropout in the eval phase?
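A minimal usage sketch, mirroring the call made in `test_fused_dropout_op.py`; the dropout settings below are the defaults that test uses:
```python
import numpy as np
import paddle
from custom_setup_ops import custom_fused_dropout_residual_ln

x = paddle.to_tensor(
    np.random.random((56, 1024)).astype(np.float16), stop_gradient=False)
residual = paddle.to_tensor(
    np.random.random((56, 1024)).astype(np.float16), stop_gradient=False)
ln_scale = paddle.to_tensor(np.ones(1024).astype(np.float16))
ln_bias = paddle.to_tensor(np.zeros(1024).astype(np.float16))
# attrs: ln_epsilon, is_test, fix_seed, seed_val, is_upscale_in_train, dropout_rate
out, mask, ln_mean, ln_var, dropout_residual_out = \
    custom_fused_dropout_residual_ln(
        x, residual, ln_scale, ln_bias, 1e-5, False, True, 0, True, 0.0)
```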
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "paddle/extension.h"
std::vector<std::vector<int64_t>> FusedDropoutResidualLnInferShape(
const std::vector<int64_t>& x_shape,
const std::vector<int64_t>& residual_shape,
const std::vector<int64_t>& ln_scale_shape,
const std::vector<int64_t>& ln_bias_shape) {
int x_m = 1;
for (int i = 0; i < x_shape.size() - 1; i++) {
x_m *= x_shape[i];
}
const std::vector<int64_t> ln_out_shape = {x_m};
return {x_shape, x_shape, ln_out_shape, ln_out_shape, x_shape};
}
// TODO: ln_scale is fp16 for now; how should ln_mean and ln_var be set to fp32?
std::vector<paddle::DataType> FusedDropoutResidualLnInferDtype(
paddle::DataType x_dtype,
paddle::DataType residual_dtype,
paddle::DataType ln_scale_dtype,
paddle::DataType ln_bias_dtype) {
// the type of ln_mean/ln_var is the same as ln_scale.
return {x_dtype,
paddle::DataType::UINT8,
paddle::DataType::FLOAT32,
paddle::DataType::FLOAT32,
// ln_scale_dtype,
// ln_scale_dtype,
x_dtype};
}
PD_BUILD_OP(custom_fused_dropout_residual_ln)
.Inputs({"X", "Residual", "LnScale", "LnBias"})
.Outputs({"Out", "DropoutMask", "LnMean", "LnVar", "DropoutResidualOut"})
.Attrs({"ln_epsilon: float",
"is_test: bool",
"fix_seed: bool",
"seed_val: int",
"is_upscale_in_train: bool",
"dropout_rate: float"})
.SetInferShapeFn(PD_INFER_SHAPE(FusedDropoutResidualLnInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(FusedDropoutResidualLnInferDtype));
PD_BUILD_GRAD_OP(custom_fused_dropout_residual_ln)
.Inputs({"X",
"Residual",
"LnScale",
"LnBias",
"DropoutMask",
"LnMean",
"LnVar",
"DropoutResidualOut",
paddle::Grad("Out")})
.Outputs({paddle::Grad("X"),
paddle::Grad("Residual"),
paddle::Grad("LnScale"),
paddle::Grad("LnBias")})
.Attrs({"ln_epsilon: float",
"is_test: bool",
"fix_seed: bool",
"seed_val: int",
"is_upscale_in_train: bool",
"dropout_rate: float"});
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/extension.h"
#include "paddle/fluid/framework/custom_raw_op_kernel_func.h"
#include "paddle/fluid/operators/fused/fused_dropout_helper.h"
__PD_DEFINE_RAW_OP_KERNEL_FUNC(custom_fused_dropout_residual_ln, ctx) {
namespace f = paddle::framework;
const auto &x = *ctx.Input<f::Tensor>("X");
const auto &residual = *ctx.Input<f::Tensor>("Residual");
const auto &ln_scale = *ctx.Input<f::Tensor>("LnScale");
const auto &ln_bias = *ctx.Input<f::Tensor>("LnBias");
auto &final_out = *ctx.Output<f::Tensor>("Out");
auto &dropout_mask_out = *ctx.Output<f::Tensor>("DropoutMask");
auto &ln_mean = *ctx.Output<f::Tensor>("LnMean");
auto &ln_var = *ctx.Output<f::Tensor>("LnVar");
auto &dropout_residual_out = *ctx.Output<f::Tensor>("DropoutResidualOut");
auto ln_epsilon = ctx.Attr<float>("ln_epsilon");
auto is_test = ctx.Attr<bool>("is_test");
auto fix_seed = ctx.Attr<bool>("fix_seed");
auto seed_val = ctx.Attr<int>("seed_val");
auto is_upscale_in_train = ctx.Attr<bool>("is_upscale_in_train");
auto dropout_rate = ctx.Attr<float>("dropout_rate");
auto &dev_ctx = ctx.cuda_device_context();
auto place = dev_ctx.GetPlace();
// inputs
const auto &x_dims = x.dims();
int x_m = 1;
for (int i = 0; i < x_dims.size() - 1; i++) {
x_m *= x_dims[i];
}
int x_n = x_dims[x_dims.size() - 1];
// outputs
final_out.Resize(x_dims);
dropout_mask_out.Resize(x_dims);
dropout_residual_out.Resize(x_dims);
ln_mean.Resize({x_m});
ln_var.Resize({x_m});
paddle::operators::DropoutParam dropout_param(fix_seed,
0,
is_test,
is_upscale_in_train,
dropout_rate,
nullptr,
seed_val);
PD_DISPATCH_FLOATING_AND_HALF_TYPES(
x.dtype(), "LayernormResidualDropoutBias", ([&] {
paddle::operators::FusedDropoutLayerNormHelper<data_t, uint8_t>
fused_dropout_layernorm_helper(
dev_ctx, x_m, x_n, dropout_param, ln_epsilon);
fused_dropout_layernorm_helper
.LayernormResidualDropoutBias<data_t, true>(
dev_ctx,
x.data<data_t>(), // out_linear_out_data,
residual.data<data_t>(), // residual_data
nullptr, // bias_data,
ln_scale.data<data_t>(),
ln_bias.data<data_t>(),
dev_ctx.Alloc<data_t>(&dropout_residual_out),
dev_ctx.Alloc<uint8_t>(&dropout_mask_out),
dev_ctx.Alloc<data_t>(&final_out),
dev_ctx.Alloc<paddle::operators::LayerNormParamType<data_t>>(
&ln_mean),
dev_ctx.Alloc<paddle::operators::LayerNormParamType<data_t>>(
&ln_var));
}));
}
__PD_DEFINE_RAW_OP_KERNEL_FUNC(custom_fused_dropout_residual_ln_grad, ctx) {
namespace f = paddle::framework;
const auto &ln_scale = *ctx.Input<f::Tensor>("LnScale");
const auto &dropout_mask_out = *ctx.Input<f::Tensor>("DropoutMask");
const auto &ln_mean = *ctx.Input<f::Tensor>("LnMean");
const auto &ln_var = *ctx.Input<f::Tensor>("LnVar");
const auto &dropout_residual_out =
*ctx.Input<f::Tensor>("DropoutResidualOut");
const auto &grad_out = *ctx.Input<f::Tensor>(f::GradVarName("Out"));
auto &grad_x = *ctx.Output<f::Tensor>(f::GradVarName("X"));
auto &grad_residual = *ctx.Output<f::Tensor>(f::GradVarName("Residual"));
auto &grad_ln_scale = *ctx.Output<f::Tensor>(f::GradVarName("LnScale"));
auto &grad_ln_bias = *ctx.Output<f::Tensor>(f::GradVarName("LnBias"));
f::Tensor grad_dropout_residual_out;
auto ln_epsilon = ctx.Attr<float>("ln_epsilon");
auto is_test = ctx.Attr<bool>("is_test");
auto fix_seed = ctx.Attr<bool>("fix_seed");
auto seed_val = ctx.Attr<int>("seed_val");
auto is_upscale_in_train = ctx.Attr<bool>("is_upscale_in_train");
auto dropout_rate = ctx.Attr<float>("dropout_rate");
auto &dev_ctx = ctx.cuda_device_context();
auto place = dev_ctx.GetPlace();
const auto &x_dims = grad_out.dims();
int x_m = 1;
for (int i = 0; i < x_dims.size() - 1; i++) {
x_m *= x_dims[i];
}
int x_n = x_dims[x_dims.size() - 1];
// output
grad_x.Resize(x_dims);
grad_residual.Resize(x_dims);
grad_dropout_residual_out.Resize(x_dims);
grad_ln_scale.Resize(ln_scale.dims());
grad_ln_bias.Resize(ln_scale.dims());
paddle::operators::DropoutParam dropout_param(fix_seed,
0,
is_test,
is_upscale_in_train,
dropout_rate,
nullptr,
seed_val);
PD_DISPATCH_FLOATING_AND_HALF_TYPES(
grad_out.dtype(), "LayernormResidualDropoutBiasGrad", ([&] {
paddle::operators::FusedDropoutLayerNormHelper<data_t, uint8_t>
fused_dropout_layernorm_helper(
dev_ctx, x_m, x_n, dropout_param, ln_epsilon);
fused_dropout_layernorm_helper
.LayernormResidualDropoutBiasGrad<data_t, true>(
dev_ctx,
grad_out.data<data_t>(),
dropout_residual_out.data<data_t>(),
dropout_mask_out.data<uint8_t>(),
ln_scale.data<data_t>(),
ln_mean.data<paddle::operators::LayerNormParamType<data_t>>(),
ln_var.data<paddle::operators::LayerNormParamType<data_t>>(),
dev_ctx.Alloc<data_t>(&grad_dropout_residual_out),
dev_ctx.Alloc<data_t>(&grad_ln_scale),
dev_ctx.Alloc<data_t>(&grad_ln_bias),
dev_ctx.Alloc<data_t>(&grad_x), // d_out_linear_out_data,
nullptr, // d_out_linear_bias_data,
dev_ctx.Alloc<data_t>(&grad_residual));
}));
}
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from custom_setup_ops import custom_fused_dropout_residual_ln
import numpy as np
iters = 2
def test_fused_dropout_op(x, residual, ln_scale, ln_bias, epsilon, dropout_rate,
grad_out, atol):
# layer_norm(dropout(x) + residual)
def run_paddle_op(x, residual, ln_scale, ln_bias, epsilon, dropout_rate,
grad_out):
pp_x = paddle.to_tensor(x, stop_gradient=False)
pp_residual = paddle.to_tensor(residual, stop_gradient=False)
pp_ln_scale = paddle.to_tensor(ln_scale, stop_gradient=False)
pp_ln_bias = paddle.to_tensor(ln_bias, stop_gradient=False)
pp_grad_out = paddle.to_tensor(grad_out)
if dropout_rate > 0:
pp_out = paddle.nn.functional.dropout(pp_x, dropout_rate)
pp_add_out = paddle.add(pp_out, pp_residual)
else:
pp_add_out = paddle.add(pp_x, pp_residual)
pp_out = paddle.nn.functional.layer_norm(
pp_add_out, pp_add_out.shape[1:], pp_ln_scale, pp_ln_bias, epsilon)
paddle.autograd.backward([pp_out], [pp_grad_out], retain_graph=True)
return pp_out, pp_add_out, pp_x.grad, pp_residual.grad, pp_ln_scale.grad, pp_ln_bias.grad
def run_custom_fuse_dropout_op(x, residual, ln_scale, ln_bias, epsilon,
dropout_rate, grad_out):
x_tensor = paddle.to_tensor(x, stop_gradient=False)
residual_tensor = paddle.to_tensor(residual, stop_gradient=False)
ln_scale_tensor = paddle.to_tensor(ln_scale, stop_gradient=False)
ln_bias_tensor = paddle.to_tensor(ln_bias, stop_gradient=False)
grad_out_tensor = paddle.to_tensor(grad_out)
# Note: use the default config of dropout.
is_test = False
fix_seed = True
is_upscale_in_train = True
seed_val = 0
out_tensor, dropout_mask, ln_mean, ln_var, dropout_residual_out = custom_fused_dropout_residual_ln(
x_tensor, residual_tensor, ln_scale_tensor, ln_bias_tensor, epsilon,
is_test, fix_seed, seed_val, is_upscale_in_train, dropout_rate)
paddle.autograd.backward(
[out_tensor], [grad_out_tensor], retain_graph=True)
return out_tensor, dropout_residual_out, x_tensor.grad, residual_tensor.grad, ln_scale_tensor.grad, ln_bias_tensor.grad
pp_out, pp_add_out, pp_x_grad, pp_residual_grad, pp_ln_scale_grad, pp_ln_bias_grad = run_paddle_op(
x, residual, ln_scale, ln_bias, epsilon, dropout_rate, grad_out)
out, add_out, x_grad, residual_grad, ln_scale_grad, ln_bias_grad = run_custom_fuse_dropout_op(
x, residual, ln_scale, ln_bias, epsilon, dropout_rate, grad_out)
# check out
np.testing.assert_allclose(
pp_out.numpy(), out.numpy(), rtol=1e-5, atol=atol)
np.testing.assert_allclose(
pp_add_out.numpy(), add_out.numpy(), rtol=1e-5, atol=atol)
# check grad
np.testing.assert_allclose(
pp_x_grad.numpy(), x_grad.numpy(), rtol=1e-5, atol=atol)
np.testing.assert_allclose(
pp_residual_grad.numpy(), residual_grad.numpy(), rtol=1e-5, atol=atol)
np.testing.assert_allclose(
pp_ln_scale_grad.numpy(), ln_scale_grad.numpy(), rtol=1e-5, atol=atol)
np.testing.assert_allclose(
pp_ln_bias_grad.numpy(), ln_bias_grad.numpy(), rtol=1e-5, atol=atol)
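# For reference, a minimal NumPy sketch of the computation checked above,
# assuming dropout_rate=0 (the only rate exercised below) so dropout is a
# no-op. numpy_ref is a hypothetical helper, not part of the op under test.
def numpy_ref(x, residual, ln_scale, ln_bias, epsilon):
    add_out = x + residual                       # residual add (no dropout)
    mean = add_out.mean(axis=-1, keepdims=True)  # per-row LayerNorm stats
    var = add_out.var(axis=-1, keepdims=True)
    normed = (add_out - mean) / np.sqrt(var + epsilon)
    return normed * ln_scale + ln_bias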
def generate_input_data(m, dtype=np.float16):
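    # Every element is overwritten below; 0.25, 0.5 and 0.0625 are exact
    # powers of two, presumably so the values are exactly representable in
    # float16 and the fused/unfused results can be compared tightly.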
data = np.random.random((m)).astype(dtype)
for i in range(m):
if i % 2 == 0:
data[i] = 0.25
elif i % 3 == 0:
data[i] = 0.5
else:
data[i] = 0.0625
return data
def generate_fixed_input(x_m, in_feature, dtype):
x = generate_input_data(x_m * in_feature, dtype)
x = x.reshape(x_m, in_feature)
residual = generate_input_data(x_m * in_feature, dtype)
residual = residual.reshape(x_m, in_feature)
ln_scale = generate_input_data(in_feature, dtype)
ln_bias = generate_input_data(in_feature, dtype)
grad_out = generate_input_data(x_m * in_feature, dtype)
grad_out = grad_out.reshape(x_m, in_feature)
return x, residual, ln_scale, ln_bias, grad_out
def generate_ones_input(x_m, in_feature, dtype):
x = np.ones(x_m * in_feature).astype(dtype)
x = x.reshape(x_m, in_feature)
residual = np.ones(x_m * in_feature).astype(dtype)
residual = residual.reshape(x_m, in_feature)
ln_scale = np.ones(in_feature).astype(dtype)
ln_bias = np.ones(in_feature).astype(dtype)
grad_out = np.ones(x_m * in_feature).astype(dtype)
grad_out = grad_out.reshape(x_m, in_feature)
return x, residual, ln_scale, ln_bias, grad_out
def test_driver(i=0,
x_m=56,
in_feature=4,
epsilon=1e-5,
dropout_rate=0,
dtype=np.float16,
atol=1e-2):
for j in range(iters):
if i == 0:
x, residual, ln_scale, ln_bias, grad_out = generate_ones_input(
x_m, in_feature, dtype)
elif i == 1:
x, residual, ln_scale, ln_bias, grad_out = generate_fixed_input(
x_m, in_feature, dtype)
else:
x = np.random.random((x_m, in_feature)).astype(dtype)
residual = np.random.random((x_m, in_feature)).astype(dtype)
ln_scale = np.random.random((in_feature)).astype(dtype)
ln_bias = np.random.random((in_feature)).astype(dtype)
grad_out = np.random.random((x_m, in_feature)).astype(dtype)
test_fused_dropout_op(x, residual, ln_scale, ln_bias, epsilon,
dropout_rate, grad_out, atol)
## Note: mlperf config: x_m ranges from xx to 28672, in_feature is 1024.
## Note: only dropout_rate=0 is tested.
for i in range(3):
print("Begin Test ", i)
atol = 1e-5
if i == 0 or i == 1:
fp16_atol = 1e-5
else:
fp16_atol = 1e-2
atol = 1e-3
# fp16
test_driver(
i,
x_m=56,
in_feature=1024,
epsilon=1e-5,
dropout_rate=0,
dtype=np.float16,
atol=fp16_atol)
test_driver(
i,
x_m=28672,
in_feature=1024,
epsilon=1e-5,
dropout_rate=0,
dtype=np.float16,
atol=fp16_atol)
# fp32
test_driver(
i,
x_m=28672,
in_feature=1024,
epsilon=1e-5,
dropout_rate=0,
dtype=np.float32,
atol=atol)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from custom_setup_ops import custom_fused_dropout_residual_ln
import numpy as np
import time
iters = 100
place = paddle.CUDAPlace(0)
def test_fused_dropout_op(x, residual, ln_scale, ln_bias, epsilon, dropout_rate,
grad_out, atol):
# layer_norm(dropout(x) + residual)
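    # Note: atol is kept for interface parity with the correctness test;
    # this file only measures time and performs no numerical checks.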
def run_paddle_op(x, residual, ln_scale, ln_bias, epsilon, dropout_rate,
grad_out):
pp_x = paddle.to_tensor(x, stop_gradient=False)
pp_residual = paddle.to_tensor(residual, stop_gradient=False)
pp_ln_scale = paddle.to_tensor(ln_scale, stop_gradient=False)
pp_ln_bias = paddle.to_tensor(ln_bias, stop_gradient=False)
pp_grad_out = paddle.to_tensor(grad_out)
# warmup
if dropout_rate > 0:
pp_out = paddle.nn.functional.dropout(pp_x, dropout_rate)
pp_add_out = paddle.add(pp_out, pp_residual)
else:
pp_add_out = paddle.add(pp_x, pp_residual)
pp_out = paddle.nn.functional.layer_norm(
pp_add_out, pp_add_out.shape[1:], pp_ln_scale, pp_ln_bias, epsilon)
paddle.autograd.backward([pp_out], [pp_grad_out], retain_graph=True)
paddle.device.cuda.synchronize(place)
t1 = time.time()
for i in range(iters):
if dropout_rate > 0:
pp_out = paddle.nn.functional.dropout(pp_x, dropout_rate)
pp_add_out = paddle.add(pp_out, pp_residual)
else:
pp_add_out = paddle.add(pp_x, pp_residual)
pp_out = paddle.nn.functional.layer_norm(
pp_add_out, pp_add_out.shape[1:], pp_ln_scale, pp_ln_bias,
epsilon)
paddle.autograd.backward([pp_out], [pp_grad_out], retain_graph=True)
paddle.device.cuda.synchronize(place)
t2 = time.time()
print("paddle dropout+add+ln time is: (ms)", (t2 - t1) * 1000)
return pp_out, pp_add_out, pp_x.grad, pp_residual.grad, pp_ln_scale.grad, pp_ln_bias.grad
def run_custom_fuse_dropout_op(x, residual, ln_scale, ln_bias, epsilon,
dropout_rate, grad_out):
x_tensor = paddle.to_tensor(x, stop_gradient=False)
residual_tensor = paddle.to_tensor(residual, stop_gradient=False)
ln_scale_tensor = paddle.to_tensor(ln_scale, stop_gradient=False)
ln_bias_tensor = paddle.to_tensor(ln_bias, stop_gradient=False)
grad_out_tensor = paddle.to_tensor(grad_out)
# Note: use the default config of dropout.
is_test = False
fix_seed = True
is_upscale_in_train = True
seed_val = 0
# warmup
out_tensor, dropout_mask, ln_mean, ln_var, dropout_residual_out = custom_fused_dropout_residual_ln(
x_tensor, residual_tensor, ln_scale_tensor, ln_bias_tensor, epsilon,
is_test, fix_seed, seed_val, is_upscale_in_train, dropout_rate)
paddle.autograd.backward(
[out_tensor], [grad_out_tensor], retain_graph=True)
paddle.device.cuda.synchronize(place)
t3 = time.time()
for i in range(iters):
out_tensor, dropout_mask, ln_mean, ln_var, dropout_residual_out = custom_fused_dropout_residual_ln(
x_tensor, residual_tensor, ln_scale_tensor, ln_bias_tensor,
epsilon, is_test, fix_seed, seed_val, is_upscale_in_train,
dropout_rate)
paddle.autograd.backward(
[out_tensor], [grad_out_tensor], retain_graph=True)
paddle.device.cuda.synchronize(place)
t4 = time.time()
print("fused_dropout_residual_ln op time is: (ms)", (t4 - t3) * 1000)
return out_tensor, dropout_residual_out, x_tensor.grad, residual_tensor.grad, ln_scale_tensor.grad, ln_bias_tensor.grad
pp_out, pp_add_out, pp_x_grad, pp_residual_grad, pp_ln_scale_grad, pp_ln_bias_grad = run_paddle_op(
x, residual, ln_scale, ln_bias, epsilon, dropout_rate, grad_out)
out, add_out, x_grad, residual_grad, ln_scale_grad, ln_bias_grad = run_custom_fuse_dropout_op(
x, residual, ln_scale, ln_bias, epsilon, dropout_rate, grad_out)
def generate_input_data(m, dtype=np.float16):
data = np.random.random((m)).astype(dtype)
for i in range(m):
if i % 2 == 0:
data[i] = 0.25
elif i % 3 == 0:
data[i] = 0.5
else:
data[i] = 0.0625
return data
def generate_fixed_input(x_m, in_feature, dtype):
x = generate_input_data(x_m * in_feature, dtype)
x = x.reshape(x_m, in_feature)
residual = generate_input_data(x_m * in_feature, dtype)
residual = residual.reshape(x_m, in_feature)
ln_scale = generate_input_data(in_feature, dtype)
ln_bias = generate_input_data(in_feature, dtype)
grad_out = generate_input_data(x_m * in_feature, dtype)
grad_out = grad_out.reshape(x_m, in_feature)
return x, residual, ln_scale, ln_bias, grad_out
def generate_ones_input(x_m, in_feature, dtype):
x = np.ones(x_m * in_feature).astype(dtype)
x = x.reshape(x_m, in_feature)
residual = np.ones(x_m * in_feature).astype(dtype)
residual = residual.reshape(x_m, in_feature)
ln_scale = np.ones(in_feature).astype(dtype)
ln_bias = np.ones(in_feature).astype(dtype)
grad_out = np.ones(x_m * in_feature).astype(dtype)
grad_out = grad_out.reshape(x_m, in_feature)
return x, residual, ln_scale, ln_bias, grad_out
def test_driver(i=0,
x_m=56,
in_feature=4,
epsilon=1e-5,
dropout_rate=0,
dtype=np.float16,
atol=1e-2):
print("nrows, ncols, dtype = ", x_m, in_feature, dtype)
if i == 0:
x, residual, ln_scale, ln_bias, grad_out = generate_ones_input(
x_m, in_feature, dtype)
elif i == 1:
x, residual, ln_scale, ln_bias, grad_out = generate_fixed_input(
x_m, in_feature, dtype)
else:
x = np.random.random((x_m, in_feature)).astype(dtype)
residual = np.random.random((x_m, in_feature)).astype(dtype)
ln_scale = np.random.random((in_feature)).astype(dtype)
ln_bias = np.random.random((in_feature)).astype(dtype)
grad_out = np.random.random((x_m, in_feature)).astype(dtype)
test_fused_dropout_op(x, residual, ln_scale, ln_bias, epsilon, dropout_rate,
grad_out, atol)
## Note: mlperf config: x_m ranges from xx to 28672, in_feature is 1024.
## Note: dropout_rate=0.1 is used for the timing runs below; this file
## measures performance only and does not check numerical results.
i = 0
atol = 1e-5
fp16_atol = 1e-5
# fp16
print("***************fp16 performance: ")
x_m_values = [
56, 200, 500, 1000, 2000, 4000, 8000, 16000, 18000, 20000, 25000, 28000
]
for x_m in x_m_values:
test_driver(
i,
x_m=x_m,
in_feature=1024,
epsilon=1e-5,
dropout_rate=0.1,
dtype=np.float16,
atol=fp16_atol)
# fp32
print("**************fp32 performance: ")
for x_m in x_m_values:
test_driver(
i,
x_m=x_m,
in_feature=1024,
epsilon=1e-5,
dropout_rate=0.1,
dtype=np.float32,
atol=atol)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "paddle/extension.h"
std::vector<std::vector<int64_t>> LrInferShape(std::vector<int64_t> x_shape) {
return {x_shape};
}
// input: [1], int64
// output: [1], float
std::vector<paddle::DataType> LrInferDtype(paddle::DataType x_dtype) {
return {paddle::DataType::FLOAT32};
}
PD_BUILD_OP(custom_lr)
.Inputs({"X"})
.Outputs({"Out"})
.Attrs({"base_lr: float",
"end_lr: float",
"degree: float",
"warmup_step: int64_t",
"start_warmup_step: int64_t",
"max_step: int64_t"})
.SetInferShapeFn(PD_INFER_SHAPE(LrInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(LrInferDtype));
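// Hypothetical Python-side call once the extension is built; attribute
// values are passed positionally in the order registered above:
//   from custom_setup_ops import custom_lr
//   out = custom_lr(step, base_lr, end_lr, degree, warmup_step,
//                   start_warmup_step, max_step)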
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/extension.h"
#include "paddle/fluid/framework/custom_raw_op_kernel_func.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
// x_data[0]: current step which is numbered from 0.
// Note: when computing, we should use x_data[0] + 1.
// y_data[0]: the lr var of this step
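// Schedule implemented below (polynomial decay with linear warmup):
//   warmup (mod_step < warmup_step):  lr = base_lr * mod_step / warmup_step
//   decay:  p = (step - offset_step) / max_step
//           lr = (base_lr - end_lr) * (1 - p)^degree + end_lr   (p < 1)
//           lr = end_lr                                         (p >= 1)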
__global__ void compute_lr_fwd_kernel(const int64_t* x_data,
float* y_data,
float base_lr,
float end_lr,
float degree,
int64_t start_warmup_step,
int64_t warmup_step,
int64_t max_step) {
int64_t step = x_data[0] + 1;
int64_t offset_step = (start_warmup_step == 0 ? 1 : 0);
int64_t mod_step = step - offset_step - start_warmup_step;
double y;
if (mod_step < warmup_step) {
auto p = mod_step / (warmup_step + 1e-6);
y = base_lr * p;
} else {
auto p = (step - offset_step) / static_cast<double>(max_step);
p = (p >= 1 ? 0 : (::pow(1 - p, degree)));
y = (base_lr - end_lr) * p + end_lr;
}
y_data[0] = static_cast<float>(y);
// y_data[0] = base_lr * (static_cast<float>(max_step - x_data[0]) /
// max_step);
}
__PD_DEFINE_RAW_OP_KERNEL_FUNC(custom_lr, ctx) {
namespace f = paddle::framework;
const auto* x = ctx.Input<f::Tensor>("X");
auto* out = ctx.Output<f::Tensor>("Out");
auto& dev_ctx = ctx.cuda_device_context();
auto place = dev_ctx.GetPlace();
auto stream = dev_ctx.stream();
float base_lr = ctx.Attr<float>("base_lr");
float end_lr = ctx.Attr<float>("end_lr");
float degree = ctx.Attr<float>("degree");
int64_t start_warmup_step = ctx.Attr<int64_t>("start_warmup_step");
int64_t warmup_step = ctx.Attr<int64_t>("warmup_step");
int64_t max_step = ctx.Attr<int64_t>("max_step");
const auto& x_dims = x->dims();
if (x_dims.size() != 1 || x_dims[0] != 1) {
PD_THROW("The shape of input x must be [1].");
}
auto out_dims = x_dims;
out->Resize(out_dims);
const int64_t* x_data = x->data<int64_t>();
float* out_data = out->mutable_data<float>(x->place());
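  // The schedule produces a single scalar, so a 1-block, 1-thread launch
  // is sufficient.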
compute_lr_fwd_kernel<<<1, 1, 0, stream>>>(x_data,
out_data,
base_lr,
end_lr,
degree,
start_warmup_step,
warmup_step,
max_step);
}
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import paddle.fluid.core as core
from paddle.utils.cpp_extension import CUDAExtension, setup
# The first argument to os.environ.get() must be an environment-variable
# name, not a path; 'PADDLE_BUILD_DIR' is a hypothetical variable name,
# with the original build path kept as the fallback default.
compile_dir = os.environ.get('PADDLE_BUILD_DIR',
                             '/public/home/zhangqha/for_baidu/Paddle-develop/build')
define_macros = []
if core.is_compiled_with_mkldnn():
define_macros.append(('PADDLE_WITH_MKLDNN', None))
if core.is_compiled_with_nccl():
define_macros.append(('PADDLE_WITH_NCCL', None))
define_macros.append(('PADDLE_WITH_HIP', None))
define_macros.append(('PADDLE_WITH_RCCL', None))
setup(
name='custom_setup_ops',
ext_modules=CUDAExtension(
sources=[
#'./fused_dense_op/fused_dense_cuda.cc',
#'./fused_dense_op/fused_dense_cuda.cu',
###'./fused_dropout_residual_ln/fused_dropout_residual_ln_cuda.cc',
###'./fused_dropout_residual_ln/fused_dropout_residual_ln_cuda.cu',
#'./fmhalib/fmha_cuda.cc',
#'./fmhalib/fmha_cuda.cu',
'./sort_bert_inputs_across_devices/sort_bert_inputs_across_devices.cc',
'./sort_bert_inputs_across_devices/sort_bert_inputs_across_devices.cu',
'./lr_op/lr_op_cuda.cc',
'./lr_op/lr_op_cuda.cu',
'./acc_merge/acc_merge.cc',
'./acc_merge/acc_merge.cu',
],
# extra_objects=[os.path.join(apex_lib_dir, 'libfmha.so')],
# include_dirs=[apex_dir],
# library_dirs=[apex_lib_dir],
# extra_link_args=['-lfmha', '-ldl', '-lcublas'],
_compile_dir=compile_dir,
define_macros=define_macros))
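# A typical build/usage sketch (the importable module name comes from
# `name` above):
#   python setup.py install
#   python -c "from custom_setup_ops import custom_lr"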