
#include "paddle/phi/api/backward/backward_api.h"
#include <memory>

#include "glog/logging.h"

#include "paddle/phi/api/lib/api_custom_impl.h"
#include "paddle/phi/api/lib/api_gen_utils.h"
#include "paddle/phi/api/lib/data_transform.h"
#include "paddle/phi/api/lib/kernel_dispatch.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/api/include/api.h"
#include "paddle/phi/infermeta/backward.h"
#include "paddle/phi/infermeta/unary.h"

#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/supplement_tracing.h"

DECLARE_bool(conv2d_disable_cudnn);

namespace paddle {
namespace experimental {


// Dispatches the "atan2_grad" phi kernel.
//
// Inputs:  x, y, out_grad (all used to derive the kernel key and all passed to
//          the kernel after PrepareData).
// Outputs: x_grad, y_grad are wrapped with SetKernelOutput; a null kernel
//          output is forwarded to GeneralBinaryGradInferMeta as nullptr.
//
// If the selected kernel reports has_fallback_cpu, the kernel runs with the CPU
// device context and both outputs are afterwards passed through
// TransDataBackend together with the originally selected backend.
PADDLE_API void atan2_grad(const Tensor& x, const Tensor& y, const Tensor& out_grad, Tensor* x_grad, Tensor* y_grad) {
  // Nothing in the kernel key is preset for this op, so backend, layout and
  // dtype all come from the highest-priority key parsed from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, y, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "atan2_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "atan2_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "atan2_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_y = PrepareData(y, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"y", {(*input_y).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("atan2_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(y_grad);

  // The profiler event is owned by a unique_ptr so it is destroyed even if an
  // exception propagates out of InferMeta; reset() ends it at the same point
  // where it was previously deleted.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "atan2_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_y), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the event that brackets the kernel call.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "atan2_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_y, *input_out_grad, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// Dispatches the "cholesky_grad" phi kernel.
//
// Inputs:  out, out_grad (both used to derive the kernel key and both passed
//          to the kernel after PrepareData); `upper` is forwarded to the kernel
//          unchanged.
// Output:  x_grad is wrapped with SetKernelOutput and its meta is set by
//          UnchangedInferMeta from the prepared `out` input.
//
// If the selected kernel reports has_fallback_cpu, the kernel runs with the CPU
// device context and the output is afterwards passed through TransDataBackend
// together with the originally selected backend.
PADDLE_API void cholesky_grad(const Tensor& out, const Tensor& out_grad, bool upper, Tensor* x_grad) {
  // Nothing in the kernel key is preset for this op, so backend, layout and
  // dtype all come from the highest-priority key parsed from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "cholesky_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "cholesky_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "cholesky_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out = PrepareData(out, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out", {(*input_out).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("cholesky_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler event is owned by a unique_ptr so it is destroyed even if an
  // exception propagates out of InferMeta; reset() ends it at the same point
  // where it was previously deleted.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "cholesky_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_out), &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, bool, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the event that brackets the kernel call.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "cholesky_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_out, *input_out_grad, upper, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "cholesky_solve_grad" phi kernel.
//
// Inputs:  x, y, out, out_grad (all used to derive the kernel key and all
//          passed to the kernel after PrepareData); `upper` is forwarded to the
//          kernel unchanged.
// Outputs: x_grad, y_grad are wrapped with SetKernelOutput; a null kernel
//          output is forwarded to GeneralBinaryGradInferMeta as nullptr.
//
// If the selected kernel reports has_fallback_cpu, the kernel runs with the CPU
// device context and both outputs are afterwards passed through
// TransDataBackend together with the originally selected backend.
PADDLE_API void cholesky_solve_grad(const Tensor& x, const Tensor& y, const Tensor& out, const Tensor& out_grad, bool upper, Tensor* x_grad, Tensor* y_grad) {
  // Nothing in the kernel key is preset for this op, so backend, layout and
  // dtype all come from the highest-priority key parsed from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, y, out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "cholesky_solve_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "cholesky_solve_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "cholesky_solve_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_y = PrepareData(y, kernel.InputAt(1), {});
  auto input_out = PrepareData(out, kernel.InputAt(2), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(3), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"y", {(*input_y).dims()}},
        {"out", {(*input_out).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("cholesky_solve_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(y_grad);

  // The profiler event is owned by a unique_ptr so it is destroyed even if an
  // exception propagates out of InferMeta; reset() ends it at the same point
  // where it was previously deleted.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "cholesky_solve_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_y), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, bool, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the event that brackets the kernel call.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "cholesky_solve_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_y, *input_out, *input_out_grad, upper, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// Dispatches the "cross_grad" phi kernel.
//
// Inputs:  x, y, out_grad (all used to derive the kernel key and all passed to
//          the kernel after PrepareData); `axis` is forwarded to the kernel
//          unchanged.
// Outputs: x_grad, y_grad are wrapped with SetKernelOutput; a null kernel
//          output is forwarded to GeneralBinaryGradInferMeta as nullptr.
//
// If the selected kernel reports has_fallback_cpu, the kernel runs with the CPU
// device context and both outputs are afterwards passed through
// TransDataBackend together with the originally selected backend.
PADDLE_API void cross_grad(const Tensor& x, const Tensor& y, const Tensor& out_grad, int axis, Tensor* x_grad, Tensor* y_grad) {
  // The dtype is taken from out_grad first; backend and layout always come
  // from the highest-priority key parsed from the inputs, and that key's dtype
  // is only a fallback when ParseDataType yields UNDEFINED.
  DataType kernel_data_type = ParseDataType(out_grad);
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, y, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "cross_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "cross_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "cross_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_y = PrepareData(y, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"y", {(*input_y).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("cross_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(y_grad);

  // The profiler event is owned by a unique_ptr so it is destroyed even if an
  // exception propagates out of InferMeta; reset() ends it at the same point
  // where it was previously deleted.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "cross_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_y), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, int, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the event that brackets the kernel call.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "cross_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_y, *input_out_grad, axis, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// Dispatches the "diag_grad" phi kernel.
//
// Inputs:  x, out_grad (both used to derive the kernel key and both passed to
//          the kernel after PrepareData); `offset` is forwarded to the kernel
//          unchanged.
// Output:  x_grad is wrapped with SetKernelOutput and its meta is set by
//          UnchangedInferMeta from the prepared `x` input.
//
// If the selected kernel reports has_fallback_cpu, the kernel runs with the CPU
// device context and the output is afterwards passed through TransDataBackend
// together with the originally selected backend.
PADDLE_API void diag_grad(const Tensor& x, const Tensor& out_grad, int offset, Tensor* x_grad) {
  // The dtype is taken from out_grad first; backend and layout always come
  // from the highest-priority key parsed from the inputs, and that key's dtype
  // is only a fallback when ParseDataType yields UNDEFINED.
  DataType kernel_data_type = ParseDataType(out_grad);
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "diag_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "diag_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "diag_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("diag_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler event is owned by a unique_ptr so it is destroyed even if an
  // exception propagates out of InferMeta; reset() ends it at the same point
  // where it was previously deleted.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "diag_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, int, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the event that brackets the kernel call.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "diag_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, offset, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "diagonal_grad" phi kernel.
//
// Inputs:  x, out_grad (both used to derive the kernel key and both passed to
//          the kernel after PrepareData); `offset`, `axis1` and `axis2` are
//          forwarded to the kernel unchanged, in that order.
// Output:  x_grad is wrapped with SetKernelOutput and its meta is set by
//          UnchangedInferMeta from the prepared `x` input.
//
// If the selected kernel reports has_fallback_cpu, the kernel runs with the CPU
// device context and the output is afterwards passed through TransDataBackend
// together with the originally selected backend.
PADDLE_API void diagonal_grad(const Tensor& x, const Tensor& out_grad, int offset, int axis1, int axis2, Tensor* x_grad) {
  // The dtype is taken from out_grad first; backend and layout always come
  // from the highest-priority key parsed from the inputs, and that key's dtype
  // is only a fallback when ParseDataType yields UNDEFINED.
  DataType kernel_data_type = ParseDataType(out_grad);
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "diagonal_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "diagonal_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "diagonal_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("diagonal_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler event is owned by a unique_ptr so it is destroyed even if an
  // exception propagates out of InferMeta; reset() ends it at the same point
  // where it was previously deleted.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "diagonal_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, int, int, int, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the event that brackets the kernel call.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "diagonal_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, offset, axis1, axis2, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "digamma_grad" phi kernel.
//
// Inputs:  x, out_grad (both used to derive the kernel key and both passed to
//          the kernel after PrepareData).
// Output:  x_grad is wrapped with SetKernelOutput and its meta is set by
//          UnchangedInferMeta from the prepared `x` input.
//
// If the selected kernel reports has_fallback_cpu, the kernel runs with the CPU
// device context and the output is afterwards passed through TransDataBackend
// together with the originally selected backend.
PADDLE_API void digamma_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {
  // Nothing in the kernel key is preset for this op, so backend, layout and
  // dtype all come from the highest-priority key parsed from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "digamma_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "digamma_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "digamma_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("digamma_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler event is owned by a unique_ptr so it is destroyed even if an
  // exception propagates out of InferMeta; reset() ends it at the same point
  // where it was previously deleted.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "digamma_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the event that brackets the kernel call.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "digamma_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "dist_grad" phi kernel.
//
// Inputs:  x, y, out, out_grad (all used to derive the kernel key and all
//          passed to the kernel after PrepareData); `p` is forwarded to the
//          kernel unchanged.
// Outputs: x_grad, y_grad are wrapped with SetKernelOutput; a null kernel
//          output is forwarded to GeneralBinaryGradInferMeta as nullptr.
//
// If the selected kernel reports has_fallback_cpu, the kernel runs with the CPU
// device context and both outputs are afterwards passed through
// TransDataBackend together with the originally selected backend.
PADDLE_API void dist_grad(const Tensor& x, const Tensor& y, const Tensor& out, const Tensor& out_grad, float p, Tensor* x_grad, Tensor* y_grad) {
  // Nothing in the kernel key is preset for this op, so backend, layout and
  // dtype all come from the highest-priority key parsed from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, y, out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "dist_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "dist_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "dist_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_y = PrepareData(y, kernel.InputAt(1), {});
  auto input_out = PrepareData(out, kernel.InputAt(2), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(3), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"y", {(*input_y).dims()}},
        {"out", {(*input_out).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("dist_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(y_grad);

  // The profiler event is owned by a unique_ptr so it is destroyed even if an
  // exception propagates out of InferMeta; reset() ends it at the same point
  // where it was previously deleted.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "dist_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_y), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, float, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the event that brackets the kernel call.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "dist_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_y, *input_out, *input_out_grad, p, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// Dispatches the "dot_grad" phi kernel.
//
// Inputs:  x, y, out_grad (all used to derive the kernel key and all passed to
//          the kernel after PrepareData).
// Outputs: x_grad, y_grad are wrapped with SetKernelOutput; a null kernel
//          output is forwarded to GeneralBinaryGradInferMeta as nullptr.
//
// If the selected kernel reports has_fallback_cpu, the kernel runs with the CPU
// device context and both outputs are afterwards passed through
// TransDataBackend together with the originally selected backend.
PADDLE_API void dot_grad(const Tensor& x, const Tensor& y, const Tensor& out_grad, Tensor* x_grad, Tensor* y_grad) {
  // The dtype is taken from out_grad first; backend and layout always come
  // from the highest-priority key parsed from the inputs, and that key's dtype
  // is only a fallback when ParseDataType yields UNDEFINED.
  DataType kernel_data_type = ParseDataType(out_grad);
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, y, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "dot_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "dot_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "dot_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_y = PrepareData(y, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"y", {(*input_y).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("dot_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(y_grad);

  // The profiler event is owned by a unique_ptr so it is destroyed even if an
  // exception propagates out of InferMeta; reset() ends it at the same point
  // where it was previously deleted.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "dot_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_y), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the event that brackets the kernel call.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "dot_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_y, *input_out_grad, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// Dispatches the "erf_grad" phi kernel.
//
// Inputs:  x, out_grad (both used to derive the kernel key and both passed to
//          the kernel after PrepareData).
// Output:  x_grad is wrapped with SetKernelOutput and its meta is set by
//          UnchangedInferMeta from the prepared `x` input.
//
// If the selected kernel reports has_fallback_cpu, the kernel runs with the CPU
// device context and the output is afterwards passed through TransDataBackend
// together with the originally selected backend.
PADDLE_API void erf_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {
  // The dtype is taken from out_grad first; backend and layout always come
  // from the highest-priority key parsed from the inputs, and that key's dtype
  // is only a fallback when ParseDataType yields UNDEFINED.
  DataType kernel_data_type = ParseDataType(out_grad);
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "erf_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "erf_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "erf_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("erf_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler event is owned by a unique_ptr so it is destroyed even if an
  // exception propagates out of InferMeta; reset() ends it at the same point
  // where it was previously deleted.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "erf_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the event that brackets the kernel call.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "erf_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// API-level dispatch wrapper for the "erfinv_grad" phi kernel.
//
// Flow: derive the kernel key (backend / layout / dtype) from the inputs,
// select the kernel, prepare the inputs, run infer-meta on the output, then
// invoke the kernel. The output tensor is obtained from `x_grad` via
// SetKernelOutput.
//
// The profiler events are held by std::unique_ptr so they are destroyed even
// when infer-meta or the kernel throws (the previous raw new/delete pair
// leaked the event on that path).
PADDLE_API void erfinv_grad(const Tensor& out, const Tensor& out_grad, Tensor* x_grad) {

  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  // Fill in every key component that is still UNDEFINED from the inputs.
  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(out, out_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "erfinv_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "erfinv_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "erfinv_grad kernel: " << kernel;
  // Use the CPU context when kernel selection reported a CPU fallback.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out = PrepareData(out, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  // Report the input shapes to the profiler only when it asks for them.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out", {(*input_out).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("erfinv_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("erfinv_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_out), &meta_out);

  // Destroy the infer_meta event here so it does not span the kernel call.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("erfinv_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_out, *input_out_grad, kernel_out);
  // Destroy the compute event before the optional backend transfer below.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // NOTE(review): presumably moves the CPU-computed result back to the
    // originally selected backend - confirm against TransDataBackend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }

}

// API-level dispatch wrapper for the "fft_c2c_grad" phi kernel.
//
// Flow: derive the kernel key (backend / layout / dtype) from `out_grad`,
// select the kernel, prepare the input, run infer-meta on the output, then
// invoke the kernel with `axes`, `normalization` and `forward` passed through
// unchanged. The output tensor is obtained from `x_grad` via SetKernelOutput.
//
// The profiler events are held by std::unique_ptr so they are destroyed even
// when infer-meta or the kernel throws (the previous raw new/delete pair
// leaked the event on that path).
PADDLE_API void fft_c2c_grad(const Tensor& out_grad, const std::vector<int64_t>& axes, const std::string& normalization, bool forward, Tensor* x_grad) {

  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  // Fill in every key component that is still UNDEFINED from the input.
  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(out_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "fft_c2c_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "fft_c2c_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "fft_c2c_grad kernel: " << kernel;
  // Use the CPU context when kernel selection reported a CPU fallback.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(0), {});
  // Report the input shape to the profiler only when it asks for it.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("fft_c2c_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("fft_c2c_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_out_grad), &meta_out);

  // Destroy the infer_meta event here so it does not span the kernel call.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const std::vector<int64_t>&, const std::string&, bool, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("fft_c2c_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_out_grad, axes, normalization, forward, kernel_out);
  // Destroy the compute event before the optional backend transfer below.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // NOTE(review): presumably moves the CPU-computed result back to the
    // originally selected backend - confirm against TransDataBackend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }

}

// API-level dispatch wrapper for the "fft_c2r_grad" phi kernel.
//
// Flow: derive the kernel key (backend / layout / dtype), select the kernel,
// prepare the input, run FFTC2RGradInferMeta on the output, then invoke the
// kernel with `axes`, `normalization`, `forward` and `last_dim_size` passed
// through unchanged. The output tensor is obtained from `x_grad` via
// SetKernelOutput.
//
// The profiler events are held by std::unique_ptr so they are destroyed even
// when infer-meta or the kernel throws (the previous raw new/delete pair
// leaked the event on that path).
PADDLE_API void fft_c2r_grad(const Tensor& out_grad, const std::vector<int64_t>& axes, const std::string& normalization, bool forward, int64_t last_dim_size, Tensor* x_grad) {

  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  // The dtype is parsed from out_grad first; the input only supplies it below
  // if it is still UNDEFINED after this call.
  kernel_data_type = ParseDataType(out_grad);

  // Fill in every key component that is still UNDEFINED from the input.
  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(out_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "fft_c2r_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "fft_c2r_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "fft_c2r_grad kernel: " << kernel;
  // Use the CPU context when kernel selection reported a CPU fallback.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(0), {});
  // Report the input shape to the profiler only when it asks for it.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("fft_c2r_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("fft_c2r_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::FFTC2RGradInferMeta(MakeMetaTensor(*input_out_grad), axes, normalization, forward, last_dim_size, &meta_out);

  // Destroy the infer_meta event here so it does not span the kernel call.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const std::vector<int64_t>&, const std::string&, bool, int64_t, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("fft_c2r_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_out_grad, axes, normalization, forward, last_dim_size, kernel_out);
  // Destroy the compute event before the optional backend transfer below.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // NOTE(review): presumably moves the CPU-computed result back to the
    // originally selected backend - confirm against TransDataBackend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }

}

// API-level dispatch wrapper for the "fft_r2c_grad" phi kernel.
//
// Flow: derive the kernel key (backend / layout / dtype), select the kernel,
// prepare the inputs, run infer-meta on the output, then invoke the kernel
// with `axes`, `normalization`, `forward` and `onesided` passed through
// unchanged. The output tensor is obtained from `x_grad` via SetKernelOutput.
//
// The profiler events are held by std::unique_ptr so they are destroyed even
// when infer-meta or the kernel throws (the previous raw new/delete pair
// leaked the event on that path).
PADDLE_API void fft_r2c_grad(const Tensor& x, const Tensor& out_grad, const std::vector<int64_t>& axes, const std::string& normalization, bool forward, bool onesided, Tensor* x_grad) {

  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  // The dtype is parsed from out_grad first; the inputs only supply it below
  // if it is still UNDEFINED after this call.
  kernel_data_type = ParseDataType(out_grad);

  // Fill in every key component that is still UNDEFINED from the inputs.
  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "fft_r2c_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "fft_r2c_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "fft_r2c_grad kernel: " << kernel;
  // Use the CPU context when kernel selection reported a CPU fallback.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  // Report the input shapes to the profiler only when it asks for them.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("fft_r2c_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("fft_r2c_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the infer_meta event here so it does not span the kernel call.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const std::vector<int64_t>&, const std::string&, bool, bool, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("fft_r2c_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, axes, normalization, forward, onesided, kernel_out);
  // Destroy the compute event before the optional backend transfer below.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // NOTE(review): presumably moves the CPU-computed result back to the
    // originally selected backend - confirm against TransDataBackend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }

}

// API-level dispatch wrapper for the "graph_send_uv_grad" phi kernel.
//
// Flow: derive the kernel key (backend / layout / dtype), select the kernel,
// prepare the five tensor inputs, run infer-meta on the two outputs, then
// invoke the kernel with `message_op` passed through unchanged. The output
// tensors are obtained from `x_grad` and `y_grad` via SetKernelOutput.
//
// The profiler events are held by std::unique_ptr so they are destroyed even
// when infer-meta or the kernel throws (the previous raw new/delete pair
// leaked the event on that path).
PADDLE_API void graph_send_uv_grad(const Tensor& x, const Tensor& y, const Tensor& src_index, const Tensor& dst_index, const Tensor& out_grad, const std::string& message_op, Tensor* x_grad, Tensor* y_grad) {

  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  // The dtype is parsed from x first; the inputs only supply it below if it
  // is still UNDEFINED after this call.
  kernel_data_type = ParseDataType(x);

  // Fill in every key component that is still UNDEFINED from the inputs.
  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(x, y, src_index, dst_index, out_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "graph_send_uv_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "graph_send_uv_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "graph_send_uv_grad kernel: " << kernel;
  // Use the CPU context when kernel selection reported a CPU fallback.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_y = PrepareData(y, kernel.InputAt(1), {});
  auto input_src_index = PrepareData(src_index, kernel.InputAt(2), {});
  auto input_dst_index = PrepareData(dst_index, kernel.InputAt(3), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(4), {});
  // Report the input shapes to the profiler only when it asks for them.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"y", {(*input_y).dims()}},
        {"src_index", {(*input_src_index).dims()}},
        {"dst_index", {(*input_dst_index).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("graph_send_uv_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(y_grad);
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("graph_send_uv_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  // A null kernel output is forwarded to infer-meta as a null MetaTensor.
  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_y), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);

  // Destroy the infer_meta event here so it does not span the kernel call.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const std::string&, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("graph_send_uv_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_y, *input_src_index, *input_dst_index, *input_out_grad, message_op, kernel_out_0, kernel_out_1);
  // Destroy the compute event before the optional backend transfer below.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // NOTE(review): presumably moves the CPU-computed results back to the
    // originally selected backend - confirm against TransDataBackend.
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }

}

// API-level dispatch wrapper for the "lgamma_grad" phi kernel.
//
// Flow: derive the kernel key (backend / layout / dtype) from the inputs,
// select the kernel, prepare the inputs, run infer-meta on the output, then
// invoke the kernel. The output tensor is obtained from `x_grad` via
// SetKernelOutput.
//
// The profiler events are held by std::unique_ptr so they are destroyed even
// when infer-meta or the kernel throws (the previous raw new/delete pair
// leaked the event on that path).
PADDLE_API void lgamma_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {

  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  // Fill in every key component that is still UNDEFINED from the inputs.
  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "lgamma_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "lgamma_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "lgamma_grad kernel: " << kernel;
  // Use the CPU context when kernel selection reported a CPU fallback.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  // Report the input shapes to the profiler only when it asks for them.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("lgamma_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("lgamma_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the infer_meta event here so it does not span the kernel call.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("lgamma_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, kernel_out);
  // Destroy the compute event before the optional backend transfer below.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // NOTE(review): presumably moves the CPU-computed result back to the
    // originally selected backend - confirm against TransDataBackend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }

}

// API-level dispatch wrapper for the "mv_grad" phi kernel.
//
// Flow: derive the kernel key (backend / layout / dtype) from the inputs,
// select the kernel, prepare the inputs, run infer-meta on the two outputs,
// then invoke the kernel. The output tensors are obtained from `x_grad` and
// `vec_grad` via SetKernelOutput.
//
// The profiler events are held by std::unique_ptr so they are destroyed even
// when infer-meta or the kernel throws (the previous raw new/delete pair
// leaked the event on that path).
PADDLE_API void mv_grad(const Tensor& x, const Tensor& vec, const Tensor& out_grad, Tensor* x_grad, Tensor* vec_grad) {

  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  // Fill in every key component that is still UNDEFINED from the inputs.
  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(x, vec, out_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "mv_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "mv_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "mv_grad kernel: " << kernel;
  // Use the CPU context when kernel selection reported a CPU fallback.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_vec = PrepareData(vec, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  // Report the input shapes to the profiler only when it asks for them.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"vec", {(*input_vec).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("mv_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(vec_grad);
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("mv_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  // A null kernel output is forwarded to infer-meta as a null MetaTensor.
  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_vec), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);

  // Destroy the infer_meta event here so it does not span the kernel call.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("mv_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_vec, *input_out_grad, kernel_out_0, kernel_out_1);
  // Destroy the compute event before the optional backend transfer below.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // NOTE(review): presumably moves the CPU-computed results back to the
    // originally selected backend - confirm against TransDataBackend.
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }

}

// API-level dispatch wrapper for the "poisson_grad" phi kernel.
//
// Flow: derive the kernel key (backend / layout / dtype) from `out_grad`,
// select the kernel, prepare the input, run infer-meta on the output, then
// invoke the kernel. The output tensor is obtained from `x_grad` via
// SetKernelOutput.
//
// The profiler events are held by std::unique_ptr so they are destroyed even
// when infer-meta or the kernel throws (the previous raw new/delete pair
// leaked the event on that path).
PADDLE_API void poisson_grad(const Tensor& out_grad, Tensor* x_grad) {

  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  // Fill in every key component that is still UNDEFINED from the input.
  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(out_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "poisson_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "poisson_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "poisson_grad kernel: " << kernel;
  // Use the CPU context when kernel selection reported a CPU fallback.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(0), {});
  // Report the input shape to the profiler only when it asks for it.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("poisson_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("poisson_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_out_grad), &meta_out);

  // Destroy the infer_meta event here so it does not span the kernel call.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("poisson_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_out_grad, kernel_out);
  // Destroy the compute event before the optional backend transfer below.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // NOTE(review): presumably moves the CPU-computed result back to the
    // originally selected backend - confirm against TransDataBackend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }

}

// API-level dispatch wrapper for the "solve_grad" phi kernel.
//
// Flow: derive the kernel key (backend / layout / dtype) from the inputs,
// select the kernel, prepare the four tensor inputs, run infer-meta on the
// two outputs, then invoke the kernel. The output tensors are obtained from
// `x_grad` and `y_grad` via SetKernelOutput.
//
// The profiler events are held by std::unique_ptr so they are destroyed even
// when infer-meta or the kernel throws (the previous raw new/delete pair
// leaked the event on that path).
PADDLE_API void solve_grad(const Tensor& x, const Tensor& y, const Tensor& out, const Tensor& out_grad, Tensor* x_grad, Tensor* y_grad) {

  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  // Fill in every key component that is still UNDEFINED from the inputs.
  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(x, y, out, out_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "solve_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "solve_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "solve_grad kernel: " << kernel;
  // Use the CPU context when kernel selection reported a CPU fallback.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_y = PrepareData(y, kernel.InputAt(1), {});
  auto input_out = PrepareData(out, kernel.InputAt(2), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(3), {});
  // Report the input shapes to the profiler only when it asks for them.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"y", {(*input_y).dims()}},
        {"out", {(*input_out).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("solve_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(y_grad);
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("solve_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  // A null kernel output is forwarded to infer-meta as a null MetaTensor.
  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_y), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);

  // Destroy the infer_meta event here so it does not span the kernel call.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("solve_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_y, *input_out, *input_out_grad, kernel_out_0, kernel_out_1);
  // Destroy the compute event before the optional backend transfer below.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // NOTE(review): presumably moves the CPU-computed results back to the
    // originally selected backend - confirm against TransDataBackend.
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }

}

// API-level dispatch wrapper for the "trace_grad" phi kernel.
//
// Flow: derive the kernel key (backend / layout / dtype), select the kernel,
// prepare the inputs, run infer-meta on the output, then invoke the kernel
// with `offset`, `axis1` and `axis2` passed through unchanged. The output
// tensor is obtained from `x_grad` via SetKernelOutput.
//
// The profiler events are held by std::unique_ptr so they are destroyed even
// when infer-meta or the kernel throws (the previous raw new/delete pair
// leaked the event on that path).
PADDLE_API void trace_grad(const Tensor& x, const Tensor& out_grad, int offset, int axis1, int axis2, Tensor* x_grad) {

  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  // The dtype is parsed from out_grad first; the inputs only supply it below
  // if it is still UNDEFINED after this call.
  kernel_data_type = ParseDataType(out_grad);

  // Fill in every key component that is still UNDEFINED from the inputs.
  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "trace_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "trace_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "trace_grad kernel: " << kernel;
  // Use the CPU context when kernel selection reported a CPU fallback.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  // Report the input shapes to the profiler only when it asks for them.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("trace_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("trace_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the infer_meta event here so it does not span the kernel call.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, int, int, int, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("trace_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, offset, axis1, axis2, kernel_out);
  // Destroy the compute event before the optional backend transfer below.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // NOTE(review): presumably moves the CPU-computed result back to the
    // originally selected backend - confirm against TransDataBackend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }

}

// Dispatches the "trunc_grad" kernel for `out_grad` and writes the result
// into `x_grad`.
//
// Backend, layout and data type are taken from the highest-priority kernel
// key of `out_grad`. The output meta is inferred from `out_grad` via
// phi::UnchangedInferMeta.
//
// When the selected kernel is a CPU fallback (kernel_result.has_fallback_cpu),
// the kernel runs with the CPU device context and the output is afterwards
// handed to TransDataBackend together with the originally selected backend.
PADDLE_API void trunc_grad(const Tensor& out_grad, Tensor* x_grad) {
  auto kernel_key_set = ParseKernelKeyByInputArgs(out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "trunc_grad API kernel key: [" << kernel_backend << ", "
          << kernel_layout << ", " << kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "trunc_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "trunc_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(0), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("trunc_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Profiler events are owned by unique_ptr so they are destroyed even if
  // InferMeta or the kernel throws; reset() keeps the original end points.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent(
        "trunc_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);
  phi::UnchangedInferMeta(MakeMetaTensor(*input_out_grad), &meta_out);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent(
        "trunc_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_out_grad, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "abs_double_grad" kernel for `x` / `grad_x_grad` and writes
// the result into `grad_out_grad`.
//
// Backend, layout and data type are taken from the highest-priority kernel
// key of the inputs. Both inputs are prepared with the `{true}` transform
// flag. The output meta is inferred from `x` via phi::UnchangedInferMeta.
//
// When the selected kernel is a CPU fallback (kernel_result.has_fallback_cpu),
// the kernel runs with the CPU device context and the output is afterwards
// handed to TransDataBackend together with the originally selected backend.
PADDLE_API void abs_double_grad(const Tensor& x, const Tensor& grad_x_grad, Tensor* grad_out_grad) {
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, grad_x_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "abs_double_grad API kernel key: [" << kernel_backend << ", "
          << kernel_layout << ", " << kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "abs_double_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "abs_double_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {true});
  auto input_grad_x_grad = PrepareData(grad_x_grad, kernel.InputAt(1), {true});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"grad_x_grad", {(*input_grad_x_grad).dims()}}};
    platform::RecordOpInfoSupplement("abs_double_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(grad_out_grad);

  // Profiler events are owned by unique_ptr so they are destroyed even if
  // InferMeta or the kernel throws; reset() keeps the original end points.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent(
        "abs_double_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);
  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent(
        "abs_double_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_grad_x_grad, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "abs_grad" kernel for `x` / `out_grad` and writes the result
// into `x_grad`.
//
// Backend, layout and data type are taken from the highest-priority kernel
// key of the inputs. `out_grad` is prepared with the `{true}` transform flag
// while `x` uses the default flag. The output meta is inferred from `x` via
// phi::UnchangedInferMeta.
//
// When the selected kernel is a CPU fallback (kernel_result.has_fallback_cpu),
// the kernel runs with the CPU device context and the output is afterwards
// handed to TransDataBackend together with the originally selected backend.
PADDLE_API void abs_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "abs_grad API kernel key: [" << kernel_backend << ", "
          << kernel_layout << ", " << kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "abs_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "abs_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {true});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("abs_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Profiler events are owned by unique_ptr so they are destroyed even if
  // InferMeta or the kernel throws; reset() keeps the original end points.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent(
        "abs_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);
  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent(
        "abs_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "acos_grad" kernel for `x` / `out_grad` and writes the
// result into `x_grad`.
//
// Backend, layout and data type are taken from the highest-priority kernel
// key of the inputs. The output meta is inferred from `x` via
// phi::UnchangedInferMeta.
//
// When the selected kernel is a CPU fallback (kernel_result.has_fallback_cpu),
// the kernel runs with the CPU device context and the output is afterwards
// handed to TransDataBackend together with the originally selected backend.
PADDLE_API void acos_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "acos_grad API kernel key: [" << kernel_backend << ", "
          << kernel_layout << ", " << kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "acos_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "acos_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("acos_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Profiler events are owned by unique_ptr so they are destroyed even if
  // InferMeta or the kernel throws; reset() keeps the original end points.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent(
        "acos_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);
  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent(
        "acos_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "acosh_grad" kernel for `x` / `out_grad` and writes the
// result into `x_grad`.
//
// Backend, layout and data type are taken from the highest-priority kernel
// key of the inputs. The output meta is inferred from `x` via
// phi::UnchangedInferMeta.
//
// When the selected kernel is a CPU fallback (kernel_result.has_fallback_cpu),
// the kernel runs with the CPU device context and the output is afterwards
// handed to TransDataBackend together with the originally selected backend.
PADDLE_API void acosh_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "acosh_grad API kernel key: [" << kernel_backend << ", "
          << kernel_layout << ", " << kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "acosh_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "acosh_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("acosh_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Profiler events are owned by unique_ptr so they are destroyed even if
  // InferMeta or the kernel throws; reset() keeps the original end points.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent(
        "acosh_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);
  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent(
        "acosh_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "add_double_grad" kernel and writes the result into
// `grad_out_grad`.
//
// Inputs: `y`, `grad_out`, and the optional `grad_x_grad` / `grad_y_grad`;
// `axis` is forwarded to the kernel unchanged. Backend, layout and data type
// are taken from the highest-priority kernel key of all four inputs. The
// output meta is inferred from `grad_out` via phi::UnchangedInferMeta.
//
// When the selected kernel is a CPU fallback (kernel_result.has_fallback_cpu),
// the kernel runs with the CPU device context and the output is afterwards
// handed to TransDataBackend together with the originally selected backend.
PADDLE_API void add_double_grad(const Tensor& y, const Tensor& grad_out, const paddle::optional<Tensor>& grad_x_grad, const paddle::optional<Tensor>& grad_y_grad, int axis, Tensor* grad_out_grad) {
  auto kernel_key_set = ParseKernelKeyByInputArgs(y, grad_out, grad_x_grad, grad_y_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "add_double_grad API kernel key: [" << kernel_backend << ", "
          << kernel_layout << ", " << kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "add_double_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "add_double_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_y = PrepareData(y, kernel.InputAt(0), {});
  auto input_grad_out = PrepareData(grad_out, kernel.InputAt(1), {});
  auto input_grad_x_grad = PrepareData(grad_x_grad, kernel.InputAt(2), {});
  auto input_grad_y_grad = PrepareData(grad_y_grad, kernel.InputAt(3), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Optional inputs contribute a shape only when they are present.
    std::vector<phi::DDim> grad_x_grad_record_shapes;
    if (input_grad_x_grad) {
      grad_x_grad_record_shapes.push_back((*input_grad_x_grad).dims());
    }
    std::vector<phi::DDim> grad_y_grad_record_shapes;
    if (input_grad_y_grad) {
      grad_y_grad_record_shapes.push_back((*input_grad_y_grad).dims());
    }
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"y", {(*input_y).dims()}},
        {"grad_out", {(*input_grad_out).dims()}},
        {"grad_x_grad", grad_x_grad_record_shapes},
        {"grad_y_grad", grad_y_grad_record_shapes}};
    platform::RecordOpInfoSupplement("add_double_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(grad_out_grad);

  // Profiler events are owned by unique_ptr so they are destroyed even if
  // InferMeta or the kernel throws; reset() keeps the original end points.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent(
        "add_double_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);
  phi::UnchangedInferMeta(MakeMetaTensor(*input_grad_out), &meta_out);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const paddle::optional<phi::DenseTensor>&, const paddle::optional<phi::DenseTensor>&, int, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent(
        "add_double_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_y, *input_grad_out, input_grad_x_grad, input_grad_y_grad, axis, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "add_grad" kernel for `x`, `y`, `out_grad` and writes the
// results into `x_grad` and `y_grad`; `axis` is forwarded to the kernel
// unchanged.
//
// Backend, layout and data type are taken from the highest-priority kernel
// key of the inputs. Output metas are inferred from `x` and `y` via
// phi::GeneralBinaryGradInferMeta; an output whose kernel tensor is null is
// passed to InferMeta as nullptr.
//
// When the selected kernel is a CPU fallback (kernel_result.has_fallback_cpu),
// the kernel runs with the CPU device context and each output is afterwards
// handed to TransDataBackend together with the originally selected backend.
PADDLE_API void add_grad(const Tensor& x, const Tensor& y, const Tensor& out_grad, int axis, Tensor* x_grad, Tensor* y_grad) {
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, y, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "add_grad API kernel key: [" << kernel_backend << ", "
          << kernel_layout << ", " << kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "add_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "add_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_y = PrepareData(y, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"y", {(*input_y).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("add_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(y_grad);

  // Profiler events are owned by unique_ptr so they are destroyed even if
  // InferMeta or the kernel throws; reset() keeps the original end points.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent(
        "add_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);
  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_x),
                                  MakeMetaTensor(*input_y),
                                  kernel_out_0 ? &meta_out_0 : nullptr,
                                  kernel_out_1 ? &meta_out_1 : nullptr);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, int, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent(
        "add_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_y, *input_out_grad, axis, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// Dispatches the "add_triple_grad" kernel for `grad_grad_x`, `grad_grad_y`,
// `grad_grad_out_grad` and writes the results into `grad_grad_x_grad` and
// `grad_grad_y_grad`; `axis` is forwarded to the kernel unchanged.
//
// Backend, layout and data type are taken from the highest-priority kernel
// key of the inputs. Output metas are inferred from `grad_grad_x` and
// `grad_grad_y` via phi::GeneralBinaryGradInferMeta; an output whose kernel
// tensor is null is passed to InferMeta as nullptr.
//
// When the selected kernel is a CPU fallback (kernel_result.has_fallback_cpu),
// the kernel runs with the CPU device context and each output is afterwards
// handed to TransDataBackend together with the originally selected backend.
PADDLE_API void add_triple_grad(const Tensor& grad_grad_x, const Tensor& grad_grad_y, const Tensor& grad_grad_out_grad, int axis, Tensor* grad_grad_x_grad, Tensor* grad_grad_y_grad) {
  auto kernel_key_set = ParseKernelKeyByInputArgs(grad_grad_x, grad_grad_y, grad_grad_out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "add_triple_grad API kernel key: [" << kernel_backend << ", "
          << kernel_layout << ", " << kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "add_triple_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "add_triple_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_grad_grad_x = PrepareData(grad_grad_x, kernel.InputAt(0), {});
  auto input_grad_grad_y = PrepareData(grad_grad_y, kernel.InputAt(1), {});
  auto input_grad_grad_out_grad = PrepareData(grad_grad_out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"grad_grad_x", {(*input_grad_grad_x).dims()}},
        {"grad_grad_y", {(*input_grad_grad_y).dims()}},
        {"grad_grad_out_grad", {(*input_grad_grad_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("add_triple_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(grad_grad_x_grad);
  auto kernel_out_1 = SetKernelOutput(grad_grad_y_grad);

  // Profiler events are owned by unique_ptr so they are destroyed even if
  // InferMeta or the kernel throws; reset() keeps the original end points.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent(
        "add_triple_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);
  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_grad_grad_x),
                                  MakeMetaTensor(*input_grad_grad_y),
                                  kernel_out_0 ? &meta_out_0 : nullptr,
                                  kernel_out_1 ? &meta_out_1 : nullptr);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, int, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent(
        "add_triple_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_grad_grad_x, *input_grad_grad_y, *input_grad_grad_out_grad, axis, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// Dispatches the "addmm_grad" kernel for `input`, `x`, `y`, `out_grad` and
// writes the results into `input_grad`, `x_grad` and `y_grad`; `alpha` and
// `beta` are forwarded to the kernel unchanged.
//
// Backend, layout and data type are taken from the highest-priority kernel
// key of the inputs. Output metas are inferred from `input`, `x` and `y` via
// phi::GeneralTernaryGradInferMeta; an output whose kernel tensor is null is
// passed to InferMeta as nullptr.
//
// When the selected kernel is a CPU fallback (kernel_result.has_fallback_cpu),
// the kernel runs with the CPU device context and each output is afterwards
// handed to TransDataBackend together with the originally selected backend.
PADDLE_API void addmm_grad(const Tensor& input, const Tensor& x, const Tensor& y, const Tensor& out_grad, float alpha, float beta, Tensor* input_grad, Tensor* x_grad, Tensor* y_grad) {
  auto kernel_key_set = ParseKernelKeyByInputArgs(input, x, y, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "addmm_grad API kernel key: [" << kernel_backend << ", "
          << kernel_layout << ", " << kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "addmm_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "addmm_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_input = PrepareData(input, kernel.InputAt(0), {});
  auto input_x = PrepareData(x, kernel.InputAt(1), {});
  auto input_y = PrepareData(y, kernel.InputAt(2), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(3), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"input", {(*input_input).dims()}},
        {"x", {(*input_x).dims()}},
        {"y", {(*input_y).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("addmm_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(input_grad);
  auto kernel_out_1 = SetKernelOutput(x_grad);
  auto kernel_out_2 = SetKernelOutput(y_grad);

  // Profiler events are owned by unique_ptr so they are destroyed even if
  // InferMeta or the kernel throws; reset() keeps the original end points.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent(
        "addmm_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);
  phi::MetaTensor meta_out_2(kernel_out_2);
  phi::GeneralTernaryGradInferMeta(MakeMetaTensor(*input_input),
                                   MakeMetaTensor(*input_x),
                                   MakeMetaTensor(*input_y),
                                   kernel_out_0 ? &meta_out_0 : nullptr,
                                   kernel_out_1 ? &meta_out_1 : nullptr,
                                   kernel_out_2 ? &meta_out_2 : nullptr);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, float, float, phi::DenseTensor*, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent(
        "addmm_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_input, *input_x, *input_y, *input_out_grad, alpha, beta, kernel_out_0, kernel_out_1, kernel_out_2);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
    TransDataBackend(kernel_out_2, kernel_backend, kernel_out_2);
  }
}

// Dispatches the "affine_grid_grad" kernel for `output_grad` and writes the
// result into `input_grad`.
//
// Backend, layout and data type are taken from the highest-priority kernel
// key of `output_grad`; `use_cudnn` is passed to the kernel factory as an
// extra selection argument and is not forwarded to the kernel itself. The
// output meta is inferred by phi::AffineGridGradInferMeta from `output_grad`,
// `outputShape` and `align_corners`.
//
// When the selected kernel is a CPU fallback (kernel_result.has_fallback_cpu),
// the kernel runs with the CPU device context and the output is afterwards
// handed to TransDataBackend together with the originally selected backend.
PADDLE_API void affine_grid_grad(const Tensor& output_grad, const IntArray& outputShape, bool use_cudnn, bool align_corners, Tensor* input_grad) {
  auto kernel_key_set = ParseKernelKeyByInputArgs(output_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "affine_grid_grad API kernel key: [" << kernel_backend << ", "
          << kernel_layout << ", " << kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "affine_grid_grad", {kernel_backend, kernel_layout, kernel_data_type}, use_cudnn);
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "affine_grid_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_output_grad = PrepareData(output_grad, kernel.InputAt(0), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"output_grad", {(*input_output_grad).dims()}}};
    platform::RecordOpInfoSupplement("affine_grid_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(input_grad);

  // Profiler events are owned by unique_ptr so they are destroyed even if
  // InferMeta or the kernel throws; reset() keeps the original end points.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent(
        "affine_grid_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);
  phi::AffineGridGradInferMeta(MakeMetaTensor(*input_output_grad), outputShape, align_corners, &meta_out);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::IntArray&, bool, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent(
        "affine_grid_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_output_grad, phi::IntArray(outputShape), align_corners, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API of amax.
//
// Steps, in order:
//   1. Resolve backend / layout / dtype from the highest-priority kernel key of
//      (x, out, out_grad) and select the "amax_grad" kernel.
//   2. Prepare the three inputs and (when enabled) record their shapes for the
//      profiler.
//   3. Run phi::UnchangedInferMeta from `x` on the output obtained from
//      SetKernelOutput(x_grad).
//   4. Invoke the kernel with `dims`, `keep_dim` and `reduce_all`; if selection
//      fell back to CPU, call TransDataBackend on the output with the
//      originally resolved backend.
//
// The profiler events are owned by std::unique_ptr so they are released even
// when InferMeta or the kernel throws. They are reset explicitly so that each
// event still ends right after the step it measures.
PADDLE_API void amax_grad(const Tensor& x, const Tensor& out, const Tensor& out_grad, const std::vector<int64_t>& dims, bool keep_dim, bool reduce_all, Tensor* x_grad) {
  // Nothing is preset here, so every key component comes from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "amax_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "amax_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "amax_grad kernel: " << kernel;
  // When the selected kernel is a CPU fallback, it must run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out = PrepareData(out, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"out", {input_out->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("amax_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("amax_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // End the infer_meta event before the compute event starts.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const std::vector<int64_t>&, bool, bool, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("amax_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out, *input_out_grad, dims, keep_dim, reduce_all, kernel_out);
  // End the compute event before any backend transfer below.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // NOTE(review): presumably moves the CPU result back to `kernel_backend` — confirm against TransDataBackend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API of amin.
//
// Steps, in order:
//   1. Resolve backend / layout / dtype from the highest-priority kernel key of
//      (x, out, out_grad) and select the "amin_grad" kernel.
//   2. Prepare the three inputs and (when enabled) record their shapes for the
//      profiler.
//   3. Run phi::UnchangedInferMeta from `x` on the output obtained from
//      SetKernelOutput(x_grad).
//   4. Invoke the kernel with `dims`, `keep_dim` and `reduce_all`; if selection
//      fell back to CPU, call TransDataBackend on the output with the
//      originally resolved backend.
//
// The profiler events are owned by std::unique_ptr so they are released even
// when InferMeta or the kernel throws. They are reset explicitly so that each
// event still ends right after the step it measures.
PADDLE_API void amin_grad(const Tensor& x, const Tensor& out, const Tensor& out_grad, const std::vector<int64_t>& dims, bool keep_dim, bool reduce_all, Tensor* x_grad) {
  // Nothing is preset here, so every key component comes from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "amin_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "amin_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "amin_grad kernel: " << kernel;
  // When the selected kernel is a CPU fallback, it must run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out = PrepareData(out, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"out", {input_out->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("amin_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("amin_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // End the infer_meta event before the compute event starts.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const std::vector<int64_t>&, bool, bool, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("amin_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out, *input_out_grad, dims, keep_dim, reduce_all, kernel_out);
  // End the compute event before any backend transfer below.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // NOTE(review): presumably moves the CPU result back to `kernel_backend` — confirm against TransDataBackend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API of angle.
//
// Steps, in order:
//   1. Resolve backend / layout / dtype from the highest-priority kernel key of
//      (x, out_grad) and select the "angle_grad" kernel.
//   2. Prepare both inputs and (when enabled) record their shapes for the
//      profiler. Unlike `x`, `out_grad` is prepared with a `{true}` transform
//      flag.
//   3. Run phi::UnchangedInferMeta from `x` on the output obtained from
//      SetKernelOutput(x_grad).
//   4. Invoke the kernel; if selection fell back to CPU, call TransDataBackend
//      on the output with the originally resolved backend.
//
// The profiler events are owned by std::unique_ptr so they are released even
// when InferMeta or the kernel throws. They are reset explicitly so that each
// event still ends right after the step it measures.
PADDLE_API void angle_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {
  // Nothing is preset here, so every key component comes from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "angle_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "angle_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "angle_grad kernel: " << kernel;
  // When the selected kernel is a CPU fallback, it must run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  // NOTE(review): `{true}` presumably sets the first transform flag for out_grad only — confirm its meaning in data_transform.h.
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {true});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("angle_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("angle_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // End the infer_meta event before the compute event starts.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("angle_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, kernel_out);
  // End the compute event before any backend transfer below.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // NOTE(review): presumably moves the CPU result back to `kernel_backend` — confirm against TransDataBackend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API of argsort.
//
// Steps, in order:
//   1. Take the kernel dtype from `out_grad` via ParseDataType; backend and
//      layout (and the dtype, if ParseDataType returned UNDEFINED) come from
//      the highest-priority kernel key of (indices, x, out_grad). Select the
//      "argsort_grad" kernel.
//   2. Prepare the three inputs and (when enabled) record their shapes for the
//      profiler.
//   3. Run phi::UnchangedInferMeta from `x` on the output obtained from
//      SetKernelOutput(x_grad).
//   4. Invoke the kernel with `axis` and `descending`; if selection fell back
//      to CPU, call TransDataBackend on the output with the originally
//      resolved backend.
//
// The profiler events are owned by std::unique_ptr so they are released even
// when InferMeta or the kernel throws. They are reset explicitly so that each
// event still ends right after the step it measures.
PADDLE_API void argsort_grad(const Tensor& indices, const Tensor& x, const Tensor& out_grad, int axis, bool descending, Tensor* x_grad) {
  // The dtype is pinned to out_grad first; the kernel key only fills the gaps.
  DataType kernel_data_type = ParseDataType(out_grad);

  auto kernel_key_set = ParseKernelKeyByInputArgs(indices, x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "argsort_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "argsort_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "argsort_grad kernel: " << kernel;
  // When the selected kernel is a CPU fallback, it must run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_indices = PrepareData(indices, kernel.InputAt(0), {});
  auto input_x = PrepareData(x, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"indices", {input_indices->dims()}},
        {"x", {input_x->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("argsort_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("argsort_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // End the infer_meta event before the compute event starts.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, int, bool, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("argsort_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_indices, *input_x, *input_out_grad, axis, descending, kernel_out);
  // End the compute event before any backend transfer below.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // NOTE(review): presumably moves the CPU result back to `kernel_backend` — confirm against TransDataBackend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API of asin.
//
// Steps, in order:
//   1. Resolve backend / layout / dtype from the highest-priority kernel key of
//      (x, out_grad) and select the "asin_grad" kernel.
//   2. Prepare both inputs and (when enabled) record their shapes for the
//      profiler.
//   3. Run phi::UnchangedInferMeta from `x` on the output obtained from
//      SetKernelOutput(x_grad).
//   4. Invoke the kernel; if selection fell back to CPU, call TransDataBackend
//      on the output with the originally resolved backend.
//
// The profiler events are owned by std::unique_ptr so they are released even
// when InferMeta or the kernel throws. They are reset explicitly so that each
// event still ends right after the step it measures.
PADDLE_API void asin_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {
  // Nothing is preset here, so every key component comes from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "asin_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "asin_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "asin_grad kernel: " << kernel;
  // When the selected kernel is a CPU fallback, it must run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("asin_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("asin_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // End the infer_meta event before the compute event starts.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("asin_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, kernel_out);
  // End the compute event before any backend transfer below.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // NOTE(review): presumably moves the CPU result back to `kernel_backend` — confirm against TransDataBackend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API of asinh.
//
// Steps, in order:
//   1. Resolve backend / layout / dtype from the highest-priority kernel key of
//      (x, out_grad) and select the "asinh_grad" kernel.
//   2. Prepare both inputs and (when enabled) record their shapes for the
//      profiler.
//   3. Run phi::UnchangedInferMeta from `x` on the output obtained from
//      SetKernelOutput(x_grad).
//   4. Invoke the kernel; if selection fell back to CPU, call TransDataBackend
//      on the output with the originally resolved backend.
//
// The profiler events are owned by std::unique_ptr so they are released even
// when InferMeta or the kernel throws. They are reset explicitly so that each
// event still ends right after the step it measures.
PADDLE_API void asinh_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {
  // Nothing is preset here, so every key component comes from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "asinh_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "asinh_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "asinh_grad kernel: " << kernel;
  // When the selected kernel is a CPU fallback, it must run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("asinh_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("asinh_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // End the infer_meta event before the compute event starts.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("asinh_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, kernel_out);
  // End the compute event before any backend transfer below.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // NOTE(review): presumably moves the CPU result back to `kernel_backend` — confirm against TransDataBackend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API of assign_out_.
//
// Steps, in order:
//   1. Resolve backend / layout / dtype from the highest-priority kernel key of
//      `out_grad`. Note that the kernel selected is the forward "assign"
//      kernel, not a dedicated grad kernel.
//   2. Prepare the input and (when enabled) record its shape for the profiler.
//   3. Run phi::UnchangedInferMeta from `out_grad` on the output obtained from
//      SetKernelOutput(x_grad).
//   4. Invoke the kernel; if selection fell back to CPU, call TransDataBackend
//      on the output with the originally resolved backend.
//
// The profiler events are owned by std::unique_ptr so they are released even
// when InferMeta or the kernel throws. They are reset explicitly so that each
// event still ends right after the step it measures.
PADDLE_API void assign_out__grad(const Tensor& out_grad, Tensor* x_grad) {
  // Nothing is preset here, so every key component comes from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "assign_out__grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "assign", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "assign kernel: " << kernel;
  // When the selected kernel is a CPU fallback, it must run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(0), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("assign_out__grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("assign_out__grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_out_grad), &meta_out);

  // End the infer_meta event before the compute event starts.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("assign_out__grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_out_grad, kernel_out);
  // End the compute event before any backend transfer below.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // NOTE(review): presumably moves the CPU result back to `kernel_backend` — confirm against TransDataBackend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API of atan.
//
// Steps, in order:
//   1. Resolve backend / layout / dtype from the highest-priority kernel key of
//      (x, out_grad) and select the "atan_grad" kernel.
//   2. Prepare both inputs and (when enabled) record their shapes for the
//      profiler.
//   3. Run phi::UnchangedInferMeta from `x` on the output obtained from
//      SetKernelOutput(x_grad).
//   4. Invoke the kernel; if selection fell back to CPU, call TransDataBackend
//      on the output with the originally resolved backend.
//
// The profiler events are owned by std::unique_ptr so they are released even
// when InferMeta or the kernel throws. They are reset explicitly so that each
// event still ends right after the step it measures.
PADDLE_API void atan_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {
  // Nothing is preset here, so every key component comes from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "atan_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "atan_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "atan_grad kernel: " << kernel;
  // When the selected kernel is a CPU fallback, it must run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("atan_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("atan_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // End the infer_meta event before the compute event starts.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("atan_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, kernel_out);
  // End the compute event before any backend transfer below.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // NOTE(review): presumably moves the CPU result back to `kernel_backend` — confirm against TransDataBackend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API of atanh.
//
// Steps, in order:
//   1. Resolve backend / layout / dtype from the highest-priority kernel key of
//      (x, out_grad) and select the "atanh_grad" kernel.
//   2. Prepare both inputs and (when enabled) record their shapes for the
//      profiler.
//   3. Run phi::UnchangedInferMeta from `x` on the output obtained from
//      SetKernelOutput(x_grad).
//   4. Invoke the kernel; if selection fell back to CPU, call TransDataBackend
//      on the output with the originally resolved backend.
//
// The profiler events are owned by std::unique_ptr so they are released even
// when InferMeta or the kernel throws. They are reset explicitly so that each
// event still ends right after the step it measures.
PADDLE_API void atanh_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {
  // Nothing is preset here, so every key component comes from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "atanh_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "atanh_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "atanh_grad kernel: " << kernel;
  // When the selected kernel is a CPU fallback, it must run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("atanh_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("atanh_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // End the infer_meta event before the compute event starts.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("atanh_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, kernel_out);
  // End the compute event before any backend transfer below.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // NOTE(review): presumably moves the CPU result back to `kernel_backend` — confirm against TransDataBackend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

PADDLE_API void batch_norm_double_grad(const Tensor& x, const Tensor& scale, const paddle::optional<Tensor>& out_mean, const paddle::optional<Tensor>& out_variance, const Tensor& saved_mean, const Tensor& saved_variance, const Tensor& grad_out, const Tensor& grad_x_grad, const Tensor& grad_scale_grad, const Tensor& grad_bias_grad, float momentum, float epsilon, const std::string& data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu, Tensor* x_grad, Tensor* scale_grad, Tensor* grad_out_grad) {

  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  kernel_data_type = ParseDataType(x);

  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(x, scale, out_mean, out_variance, saved_mean, saved_variance, grad_out, grad_x_grad, grad_scale_grad, grad_bias_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "batch_norm_double_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "batch_norm_grad_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "batch_norm_grad_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_scale = PrepareData(scale, kernel.InputAt(1), {});
  auto input_out_mean = PrepareData(out_mean, kernel.InputAt(2), {});
  auto input_out_variance = PrepareData(out_variance, kernel.InputAt(3), {});
  auto input_saved_mean = PrepareData(saved_mean, kernel.InputAt(4), {});
  auto input_saved_variance = PrepareData(saved_variance, kernel.InputAt(5), {});
  auto input_grad_out = PrepareData(grad_out, kernel.InputAt(6), {});
  auto input_grad_x_grad = PrepareData(grad_x_grad, kernel.InputAt(7), {});
  auto input_grad_scale_grad = PrepareData(grad_scale_grad, kernel.InputAt(8), {});
  auto input_grad_bias_grad = PrepareData(grad_bias_grad, kernel.InputAt(9), {});
  if(platform::RecordOpInfoSupplement::IsEnabled()){
     std::vector<phi::DDim> out_mean_record_shapes;
     if(input_out_mean){
       out_mean_record_shapes.push_back((*input_out_mean).dims());
     }
     std::vector<phi::DDim> out_variance_record_shapes;
     if(input_out_variance){
       out_variance_record_shapes.push_back((*input_out_variance).dims());
     }
     std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
     {"x", {
     (*input_x).dims()}},
     {"scale", {
     (*input_scale).dims()}},
     {"out_mean", out_mean_record_shapes},
     {"out_variance", out_variance_record_shapes},
     {"saved_mean", {
     (*input_saved_mean).dims()}},
     {"saved_variance", {
     (*input_saved_variance).dims()}},
     {"grad_out", {
     (*input_grad_out).dims()}},
     {"grad_x_grad", {
     (*input_grad_x_grad).dims()}},
     {"grad_scale_grad", {
     (*input_grad_scale_grad).dims()}},
     {"grad_bias_grad", {
     (*input_grad_bias_grad).dims()}}};
     platform::RecordOpInfoSupplement("batch_norm_double_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(scale_grad);
  auto kernel_out_2 = SetKernelOutput(grad_out_grad);
  paddle::platform::RecordEvent *infer_shape_record_event = nullptr;
  if(paddle::platform::RecordEvent::IsEnabled()){
    infer_shape_record_event = new paddle::platform::RecordEvent("batch_norm_double_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);
  phi::MetaTensor meta_out_2(kernel_out_2);

  phi::GeneralTernaryGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_scale), MakeMetaTensor(*input_x), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr, kernel_out_2 ? &meta_out_2 : nullptr);

  if(infer_shape_record_event != nullptr){
    delete infer_shape_record_event;
  }
  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const paddle::optional<phi::DenseTensor>&, const paddle::optional<phi::DenseTensor>&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, float, float, const std::string&, bool, bool, bool, bool, phi::DenseTensor*, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  paddle::platform::RecordEvent* kernel_record_event = nullptr;
  if(paddle::platform::RecordEvent::IsEnabled()){
    kernel_record_event = new paddle::platform::RecordEvent("batch_norm_double_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
    (*kernel_fn)(*dev_ctx, *input_x, *input_scale, input_out_mean, input_out_variance, *input_saved_mean, *input_saved_variance, *input_grad_out, *input_grad_x_grad, *input_grad_scale_grad, *input_grad_bias_grad, momentum, epsilon, data_layout, is_test, use_global_stats, trainable_statistics, fuse_with_relu, kernel_out_0, kernel_out_1, kernel_out_2);
  if(kernel_record_event != nullptr){
    delete kernel_record_event;
  }
  if (kernel_result.has_fallback_cpu) {

    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
    TransDataBackend(kernel_out_2, kernel_backend, kernel_out_2);
  }
  
}

// Dispatches the "batch_norm_grad" phi kernel and writes its results into
// x_grad, scale_grad and bias_grad.
//
// Inputs: x, scale, bias, saved_mean, saved_variance and out_grad are required
// tensors; mean_out, variance_out and reserve_space are optional and are
// forwarded to the kernel as paddle::optional values.
// Attributes (momentum, epsilon, data_layout, is_test, use_global_stats,
// trainable_statistics, fuse_with_relu) are passed to the kernel unchanged.
// Outputs: each *_grad pointer is handed to SetKernelOutput; an output whose
// kernel tensor is null is passed to infer-meta as nullptr.
PADDLE_API void batch_norm_grad(const Tensor& x, const Tensor& scale, const Tensor& bias, const paddle::optional<Tensor>& mean_out, const paddle::optional<Tensor>& variance_out, const Tensor& saved_mean, const Tensor& saved_variance, const paddle::optional<Tensor>& reserve_space, const Tensor& out_grad, float momentum, float epsilon, const std::string& data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu, Tensor* x_grad, Tensor* scale_grad, Tensor* bias_grad) {
  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  // The kernel data type is taken from out_grad first; the input-derived
  // kernel key below only fills in whatever is still UNDEFINED.
  kernel_data_type = ParseDataType(out_grad);

  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(x, scale, bias, mean_out, variance_out, saved_mean, saved_variance, reserve_space, out_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "batch_norm_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "batch_norm_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "batch_norm_grad kernel: " << kernel;
  // Use the CPU device context when kernel selection reported a CPU fallback.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_scale = PrepareData(scale, kernel.InputAt(1), {});
  auto input_bias = PrepareData(bias, kernel.InputAt(2), {});
  auto input_mean_out = PrepareData(mean_out, kernel.InputAt(3), {});
  auto input_variance_out = PrepareData(variance_out, kernel.InputAt(4), {});
  auto input_saved_mean = PrepareData(saved_mean, kernel.InputAt(5), {});
  auto input_saved_variance = PrepareData(saved_variance, kernel.InputAt(6), {});
  auto input_reserve_space = PrepareData(reserve_space, kernel.InputAt(7), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(8), {});

  // Report input shapes to the profiler supplement. Optional inputs contribute
  // an empty shape list when they are absent.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<phi::DDim> mean_out_record_shapes;
    if (input_mean_out) {
      mean_out_record_shapes.push_back((*input_mean_out).dims());
    }
    std::vector<phi::DDim> variance_out_record_shapes;
    if (input_variance_out) {
      variance_out_record_shapes.push_back((*input_variance_out).dims());
    }
    std::vector<phi::DDim> reserve_space_record_shapes;
    if (input_reserve_space) {
      reserve_space_record_shapes.push_back((*input_reserve_space).dims());
    }
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"scale", {(*input_scale).dims()}},
        {"bias", {(*input_bias).dims()}},
        {"mean_out", mean_out_record_shapes},
        {"variance_out", variance_out_record_shapes},
        {"saved_mean", {(*input_saved_mean).dims()}},
        {"saved_variance", {(*input_saved_variance).dims()}},
        {"reserve_space", reserve_space_record_shapes},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("batch_norm_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(scale_grad);
  auto kernel_out_2 = SetKernelOutput(bias_grad);

  // The profiler event is owned by a unique_ptr so it is destroyed even if
  // infer-meta throws; reset() destroys it right after infer-meta returns.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("batch_norm_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);
  phi::MetaTensor meta_out_2(kernel_out_2);

  phi::GeneralTernaryGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_scale), MakeMetaTensor(*input_bias), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr, kernel_out_2 ? &meta_out_2 : nullptr);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const paddle::optional<phi::DenseTensor>&, const paddle::optional<phi::DenseTensor>&, const phi::DenseTensor&, const phi::DenseTensor&, const paddle::optional<phi::DenseTensor>&, const phi::DenseTensor&, float, float, const std::string&, bool, bool, bool, bool, phi::DenseTensor*, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the event surrounding the kernel invocation.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("batch_norm_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_scale, *input_bias, input_mean_out, input_variance_out, *input_saved_mean, *input_saved_variance, input_reserve_space, *input_out_grad, momentum, epsilon, data_layout, is_test, use_global_stats, trainable_statistics, fuse_with_relu, kernel_out_0, kernel_out_1, kernel_out_2);
  kernel_record_event.reset();

  // After a CPU fallback the kernel ran on the CPU context; pass every output
  // through TransDataBackend with the originally selected backend.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
    TransDataBackend(kernel_out_2, kernel_backend, kernel_out_2);
  }
}

// Dispatches the "bce_loss_grad" phi kernel and writes its result into
// input_grad.
//
// Inputs: input, label and out_grad are all required tensors.
// Output: input_grad is handed to SetKernelOutput; its meta information is
// set by UnchangedInferMeta from the prepared `input` tensor.
PADDLE_API void bce_loss_grad(const Tensor& input, const Tensor& label, const Tensor& out_grad, Tensor* input_grad) {
  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  // Fill every still-UNDEFINED part of the kernel key from the inputs.
  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(input, label, out_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "bce_loss_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "bce_loss_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "bce_loss_grad kernel: " << kernel;
  // Use the CPU device context when kernel selection reported a CPU fallback.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_input = PrepareData(input, kernel.InputAt(0), {});
  auto input_label = PrepareData(label, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});

  // Report input shapes to the profiler supplement when it is enabled.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"input", {(*input_input).dims()}},
        {"label", {(*input_label).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("bce_loss_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(input_grad);

  // The profiler event is owned by a unique_ptr so it is destroyed even if
  // infer-meta throws; reset() destroys it right after infer-meta returns.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("bce_loss_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_input), &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the event surrounding the kernel invocation.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("bce_loss_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_input, *input_label, *input_out_grad, kernel_out);
  kernel_record_event.reset();

  // After a CPU fallback the kernel ran on the CPU context; pass the output
  // through TransDataBackend with the originally selected backend.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "bicubic_interp_grad" phi kernel and writes its result into
// x_grad.
//
// Inputs: x and output_grad are required tensors; out_size, size_tensor (a
// list of tensors) and scale_tensor are optional and forwarded to the kernel
// as paddle::optional values.
// Attributes (data_layout, out_d, out_h, out_w, scale, interp_method,
// align_corners, align_mode) are passed to the kernel unchanged.
// Output: x_grad is handed to SetKernelOutput; its meta information is set by
// UnchangedInferMeta from the prepared `x` tensor.
PADDLE_API void bicubic_interp_grad(const Tensor& x, const paddle::optional<Tensor>& out_size, const paddle::optional<std::vector<Tensor>>& size_tensor, const paddle::optional<Tensor>& scale_tensor, const Tensor& output_grad, const std::string& data_layout, int out_d, int out_h, int out_w, const std::vector<float>& scale, const std::string& interp_method, bool align_corners, int align_mode, Tensor* x_grad) {
  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  // The kernel data type is taken from output_grad first; the input-derived
  // kernel key below only fills in whatever is still UNDEFINED.
  kernel_data_type = ParseDataType(output_grad);

  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_size, size_tensor, scale_tensor, output_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "bicubic_interp_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "bicubic_interp_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "bicubic_interp_grad kernel: " << kernel;
  // Use the CPU device context when kernel selection reported a CPU fallback.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_size = PrepareData(out_size, kernel.InputAt(1), {});
  auto input_size_tensor_vec = PrepareData(size_tensor, kernel.InputAt(2), {});
  // The kernel takes the tensor list as a vector of pointers; build it over
  // the prepared tensors, which input_size_tensor_vec keeps alive.
  paddle::optional<std::vector<const phi::DenseTensor*>> input_size_tensor;
  if (input_size_tensor_vec) {
    input_size_tensor = paddle::optional<std::vector<const phi::DenseTensor*>>(input_size_tensor_vec->size());
    for (size_t i = 0; i < input_size_tensor_vec->size(); ++i) {
      input_size_tensor->at(i) = &input_size_tensor_vec->at(i);
    }
  }
  auto input_scale_tensor = PrepareData(scale_tensor, kernel.InputAt(3), {});
  auto input_output_grad = PrepareData(output_grad, kernel.InputAt(4), {});

  // Report input shapes to the profiler supplement. Optional inputs contribute
  // an empty shape list when they are absent.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<phi::DDim> out_size_record_shapes;
    if (input_out_size) {
      out_size_record_shapes.push_back((*input_out_size).dims());
    }
    std::vector<phi::DDim> scale_tensor_record_shapes;
    if (input_scale_tensor) {
      scale_tensor_record_shapes.push_back((*input_scale_tensor).dims());
    }
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_size", out_size_record_shapes},
        {"scale_tensor", scale_tensor_record_shapes},
        {"output_grad", {(*input_output_grad).dims()}}};
    // The tensor-list input is appended after the single-tensor entries.
    std::vector<phi::DDim> ddims_vec;
    if (input_size_tensor) {
      ddims_vec.reserve(input_size_tensor->size());
      for (size_t i = 0; i < input_size_tensor->size(); ++i) {
        ddims_vec.emplace_back((*input_size_tensor->at(i)).dims());
      }
    }
    input_shapes.emplace_back("size_tensor", ddims_vec);
    platform::RecordOpInfoSupplement("bicubic_interp_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler event is owned by a unique_ptr so it is destroyed even if
  // infer-meta throws; reset() destroys it right after infer-meta returns.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("bicubic_interp_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const paddle::optional<phi::DenseTensor>&, const paddle::optional<std::vector<const phi::DenseTensor*>>&, const paddle::optional<phi::DenseTensor>&, const phi::DenseTensor&, const std::string&, int, int, int, const std::vector<float>&, const std::string&, bool, int, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the event surrounding the kernel invocation.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("bicubic_interp_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, input_out_size, input_size_tensor, input_scale_tensor, *input_output_grad, data_layout, out_d, out_h, out_w, scale, interp_method, align_corners, align_mode, kernel_out);
  kernel_record_event.reset();

  // After a CPU fallback the kernel ran on the CPU context; pass the output
  // through TransDataBackend with the originally selected backend.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "bilinear_interp_grad" phi kernel and writes its result into
// x_grad.
//
// Inputs: x and output_grad are required tensors; out_size, size_tensor (a
// list of tensors) and scale_tensor are optional and forwarded to the kernel
// as paddle::optional values.
// Attributes (data_layout, out_d, out_h, out_w, scale, interp_method,
// align_corners, align_mode) are passed to the kernel unchanged.
// Output: x_grad is handed to SetKernelOutput; its meta information is set by
// UnchangedInferMeta from the prepared `x` tensor.
PADDLE_API void bilinear_interp_grad(const Tensor& x, const paddle::optional<Tensor>& out_size, const paddle::optional<std::vector<Tensor>>& size_tensor, const paddle::optional<Tensor>& scale_tensor, const Tensor& output_grad, const std::string& data_layout, int out_d, int out_h, int out_w, const std::vector<float>& scale, const std::string& interp_method, bool align_corners, int align_mode, Tensor* x_grad) {
  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  // The kernel data type is taken from output_grad first; the input-derived
  // kernel key below only fills in whatever is still UNDEFINED.
  kernel_data_type = ParseDataType(output_grad);

  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_size, size_tensor, scale_tensor, output_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "bilinear_interp_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "bilinear_interp_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "bilinear_interp_grad kernel: " << kernel;
  // Use the CPU device context when kernel selection reported a CPU fallback.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_size = PrepareData(out_size, kernel.InputAt(1), {});
  auto input_size_tensor_vec = PrepareData(size_tensor, kernel.InputAt(2), {});
  // The kernel takes the tensor list as a vector of pointers; build it over
  // the prepared tensors, which input_size_tensor_vec keeps alive.
  paddle::optional<std::vector<const phi::DenseTensor*>> input_size_tensor;
  if (input_size_tensor_vec) {
    input_size_tensor = paddle::optional<std::vector<const phi::DenseTensor*>>(input_size_tensor_vec->size());
    for (size_t i = 0; i < input_size_tensor_vec->size(); ++i) {
      input_size_tensor->at(i) = &input_size_tensor_vec->at(i);
    }
  }
  auto input_scale_tensor = PrepareData(scale_tensor, kernel.InputAt(3), {});
  auto input_output_grad = PrepareData(output_grad, kernel.InputAt(4), {});

  // Report input shapes to the profiler supplement. Optional inputs contribute
  // an empty shape list when they are absent.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<phi::DDim> out_size_record_shapes;
    if (input_out_size) {
      out_size_record_shapes.push_back((*input_out_size).dims());
    }
    std::vector<phi::DDim> scale_tensor_record_shapes;
    if (input_scale_tensor) {
      scale_tensor_record_shapes.push_back((*input_scale_tensor).dims());
    }
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_size", out_size_record_shapes},
        {"scale_tensor", scale_tensor_record_shapes},
        {"output_grad", {(*input_output_grad).dims()}}};
    // The tensor-list input is appended after the single-tensor entries.
    std::vector<phi::DDim> ddims_vec;
    if (input_size_tensor) {
      ddims_vec.reserve(input_size_tensor->size());
      for (size_t i = 0; i < input_size_tensor->size(); ++i) {
        ddims_vec.emplace_back((*input_size_tensor->at(i)).dims());
      }
    }
    input_shapes.emplace_back("size_tensor", ddims_vec);
    platform::RecordOpInfoSupplement("bilinear_interp_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler event is owned by a unique_ptr so it is destroyed even if
  // infer-meta throws; reset() destroys it right after infer-meta returns.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("bilinear_interp_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const paddle::optional<phi::DenseTensor>&, const paddle::optional<std::vector<const phi::DenseTensor*>>&, const paddle::optional<phi::DenseTensor>&, const phi::DenseTensor&, const std::string&, int, int, int, const std::vector<float>&, const std::string&, bool, int, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the event surrounding the kernel invocation.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("bilinear_interp_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, input_out_size, input_size_tensor, input_scale_tensor, *input_output_grad, data_layout, out_d, out_h, out_w, scale, interp_method, align_corners, align_mode, kernel_out);
  kernel_record_event.reset();

  // After a CPU fallback the kernel ran on the CPU context; pass the output
  // through TransDataBackend with the originally selected backend.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "bilinear_tensor_product_grad" phi kernel and writes its
// results into x_grad, y_grad, weight_grad and bias_grad.
//
// Inputs: x, y, weight and out_grad are all required tensors.
// Outputs: each *_grad pointer is handed to SetKernelOutput; an output whose
// kernel tensor is null is passed to infer-meta as nullptr.
PADDLE_API void bilinear_tensor_product_grad(const Tensor& x, const Tensor& y, const Tensor& weight, const Tensor& out_grad, Tensor* x_grad, Tensor* y_grad, Tensor* weight_grad, Tensor* bias_grad) {
  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  // Fill every still-UNDEFINED part of the kernel key from the inputs.
  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(x, y, weight, out_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "bilinear_tensor_product_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "bilinear_tensor_product_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "bilinear_tensor_product_grad kernel: " << kernel;
  // Use the CPU device context when kernel selection reported a CPU fallback.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_y = PrepareData(y, kernel.InputAt(1), {});
  auto input_weight = PrepareData(weight, kernel.InputAt(2), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(3), {});

  // Report input shapes to the profiler supplement when it is enabled.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"y", {(*input_y).dims()}},
        {"weight", {(*input_weight).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("bilinear_tensor_product_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(y_grad);
  auto kernel_out_2 = SetKernelOutput(weight_grad);
  auto kernel_out_3 = SetKernelOutput(bias_grad);

  // The profiler event is owned by a unique_ptr so it is destroyed even if
  // infer-meta throws; reset() destroys it right after infer-meta returns.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("bilinear_tensor_product_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);
  phi::MetaTensor meta_out_2(kernel_out_2);
  phi::MetaTensor meta_out_3(kernel_out_3);

  phi::BilinearTensorProductGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_y), MakeMetaTensor(*input_weight), MakeMetaTensor(*input_out_grad), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr, kernel_out_2 ? &meta_out_2 : nullptr, kernel_out_3 ? &meta_out_3 : nullptr);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*, phi::DenseTensor*, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the event surrounding the kernel invocation.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("bilinear_tensor_product_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_y, *input_weight, *input_out_grad, kernel_out_0, kernel_out_1, kernel_out_2, kernel_out_3);
  kernel_record_event.reset();

  // After a CPU fallback the kernel ran on the CPU context; pass every output
  // through TransDataBackend with the originally selected backend.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
    TransDataBackend(kernel_out_2, kernel_backend, kernel_out_2);
    TransDataBackend(kernel_out_3, kernel_backend, kernel_out_3);
  }
}

// Dispatches the "bmm_grad" phi kernel and writes its results into x_grad and
// y_grad.
//
// Inputs: x, y and out_grad are all required tensors.
// Outputs: each *_grad pointer is handed to SetKernelOutput; an output whose
// kernel tensor is null is passed to infer-meta as nullptr.
PADDLE_API void bmm_grad(const Tensor& x, const Tensor& y, const Tensor& out_grad, Tensor* x_grad, Tensor* y_grad) {
  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  // Fill every still-UNDEFINED part of the kernel key from the inputs.
  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(x, y, out_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "bmm_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "bmm_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "bmm_grad kernel: " << kernel;
  // Use the CPU device context when kernel selection reported a CPU fallback.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_y = PrepareData(y, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});

  // Report input shapes to the profiler supplement when it is enabled.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"y", {(*input_y).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("bmm_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(y_grad);

  // The profiler event is owned by a unique_ptr so it is destroyed even if
  // infer-meta throws; reset() destroys it right after infer-meta returns.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("bmm_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  phi::BmmGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_y), MakeMetaTensor(*input_out_grad), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the event surrounding the kernel invocation.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("bmm_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_y, *input_out_grad, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  // After a CPU fallback the kernel ran on the CPU context; pass every output
  // through TransDataBackend with the originally selected backend.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// Dispatches the "brelu_grad" phi kernel and writes its result into x_grad.
//
// Inputs: x and out_grad are required tensors.
// Attributes: t_min and t_max are passed to the kernel unchanged.
// Output: x_grad is handed to SetKernelOutput; its meta information is set by
// UnchangedInferMeta from the prepared `x` tensor.
PADDLE_API void brelu_grad(const Tensor& x, const Tensor& out_grad, float t_min, float t_max, Tensor* x_grad) {
  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  // Fill every still-UNDEFINED part of the kernel key from the inputs.
  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "brelu_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "brelu_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "brelu_grad kernel: " << kernel;
  // Use the CPU device context when kernel selection reported a CPU fallback.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});

  // Report input shapes to the profiler supplement when it is enabled.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("brelu_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler event is owned by a unique_ptr so it is destroyed even if
  // infer-meta throws; reset() destroys it right after infer-meta returns.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("brelu_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, float, float, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the event surrounding the kernel invocation.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("brelu_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, t_min, t_max, kernel_out);
  kernel_record_event.reset();

  // After a CPU fallback the kernel ran on the CPU context; pass the output
  // through TransDataBackend with the originally selected backend.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API entry for `broadcast_tensors_grad`.
//
// Parses the kernel key from the tensor inputs and selects the registered
// "broadcast_tensors_grad" kernel. Only `out_grad` is prepared and passed to
// the kernel; `x` is converted with TensorToDenseTensor and used solely to
// drive UnchangedMultiInferMeta for the `x_grad` outputs. When the selected
// kernel is a CPU fallback, the outputs are handed to TransDataBackend
// together with the originally parsed backend.
//
// The profiler RecordEvents are owned by std::unique_ptr so that they are
// destroyed on every exit path, including an exception leaving InferMeta or
// the kernel call.
PADDLE_API void broadcast_tensors_grad(const std::vector<Tensor>& x, const std::vector<Tensor>& out_grad, std::vector<Tensor*> x_grad) {
  // Nothing is pinned for this API, so backend, layout and dtype all come from
  // the highest-priority key parsed from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "broadcast_tensors_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "broadcast_tensors_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "broadcast_tensors_grad kernel: " << kernel;
  // A CPU-fallback kernel has to run on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  // `x` only feeds InferMeta below; it is not a kernel argument.
  auto input_x_uq_ptr = TensorToDenseTensor(x);
  const auto& input_x = *input_x_uq_ptr;
  auto input_out_grad_vec = PrepareData(out_grad, kernel.InputAt(0), {});
  // The kernel takes pointers into the prepared tensors.
  std::vector<const phi::DenseTensor*> input_out_grad(input_out_grad_vec->size());
  for (size_t i = 0; i < input_out_grad.size(); ++i) {
    input_out_grad[i] = &input_out_grad_vec->at(i);
  }
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes;
    std::vector<phi::DDim> ddims_vec;
    ddims_vec.reserve(input_out_grad.size());
    for (size_t i = 0; i < input_out_grad.size(); ++i) {
      ddims_vec.emplace_back((*input_out_grad[i]).dims());
    }
    input_shapes.emplace_back("out_grad", ddims_vec);
    platform::RecordOpInfoSupplement("broadcast_tensors_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(&x_grad);
  {
    // Scope of the "infer_meta" profiler event.
    std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
    if (paddle::platform::RecordEvent::IsEnabled()) {
      infer_shape_record_event.reset(new paddle::platform::RecordEvent("broadcast_tensors_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
    }

    auto x_meta_vec = MakeMetaTensor(input_x);
    std::vector<const phi::MetaTensor*> x_metas(x_meta_vec.size());
    for (size_t i = 0; i < x_meta_vec.size(); ++i) {
      x_metas[i] = &x_meta_vec[i];
    }

    auto kernel_out_meta_vec = MakeMetaTensor(kernel_out);
    std::vector<phi::MetaTensor*> kernel_out_metas(kernel_out_meta_vec.size());
    for (size_t i = 0; i < kernel_out_meta_vec.size(); ++i) {
      // A null output slot is forwarded to InferMeta as a null meta pointer.
      kernel_out_metas[i] = kernel_out[i] ? &kernel_out_meta_vec[i] : nullptr;
    }
    phi::UnchangedMultiInferMeta(x_metas, kernel_out_metas);
  }

  using kernel_signature = void(*)(const platform::DeviceContext&, const std::vector<const phi::DenseTensor*>&, std::vector<phi::DenseTensor*>&);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  {
    // Scope of the "compute" profiler event.
    std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
    if (paddle::platform::RecordEvent::IsEnabled()) {
      kernel_record_event.reset(new paddle::platform::RecordEvent("broadcast_tensors_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
    }
    (*kernel_fn)(*dev_ctx, input_out_grad, kernel_out);
  }

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API entry for `ceil_grad`.
//
// Parses the kernel key from `out_grad`, selects the registered "ceil_grad"
// kernel, prepares `out_grad`, configures the output meta for `x_grad` via
// UnchangedInferMeta (driven by `out_grad`) and invokes the kernel. When the
// selected kernel is a CPU fallback, the output is handed to TransDataBackend
// together with the originally parsed backend.
//
// The profiler RecordEvents are owned by std::unique_ptr so that they are
// destroyed on every exit path, including an exception leaving InferMeta or
// the kernel call.
PADDLE_API void ceil_grad(const Tensor& out_grad, Tensor* x_grad) {
  // Nothing is pinned for this API, so backend, layout and dtype all come from
  // the highest-priority key parsed from the input.
  auto kernel_key_set = ParseKernelKeyByInputArgs(out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "ceil_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "ceil_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "ceil_grad kernel: " << kernel;
  // A CPU-fallback kernel has to run on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(0), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("ceil_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  {
    // Scope of the "infer_meta" profiler event.
    std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
    if (paddle::platform::RecordEvent::IsEnabled()) {
      infer_shape_record_event.reset(new paddle::platform::RecordEvent("ceil_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
    }
    phi::MetaTensor meta_out(kernel_out);
    phi::UnchangedInferMeta(MakeMetaTensor(*input_out_grad), &meta_out);
  }

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  {
    // Scope of the "compute" profiler event.
    std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
    if (paddle::platform::RecordEvent::IsEnabled()) {
      kernel_record_event.reset(new paddle::platform::RecordEvent("ceil_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
    }
    (*kernel_fn)(*dev_ctx, *input_out_grad, kernel_out);
  }

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API entry for `celu_double_grad`.
//
// Parses the kernel key from the tensor inputs, selects the registered
// "celu_double_grad" kernel, prepares `x`, `grad_out` and `grad_x_grad`, and
// configures both outputs (`x_grad`, `grad_out_grad`) via
// GeneralBinaryGradInferMeta, which is fed the meta of `x` for both of its
// inputs. The kernel is then invoked with
// (x, grad_out, grad_x_grad, alpha). When the selected kernel is a CPU
// fallback, both outputs are handed to TransDataBackend together with the
// originally parsed backend.
//
// The profiler RecordEvents are owned by std::unique_ptr so that they are
// destroyed on every exit path, including an exception leaving InferMeta or
// the kernel call.
PADDLE_API void celu_double_grad(const Tensor& x, const Tensor& grad_out, const Tensor& grad_x_grad, float alpha, Tensor* x_grad, Tensor* grad_out_grad) {
  // Nothing is pinned for this API, so backend, layout and dtype all come from
  // the highest-priority key parsed from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, grad_out, grad_x_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "celu_double_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "celu_double_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "celu_double_grad kernel: " << kernel;
  // A CPU-fallback kernel has to run on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_grad_out = PrepareData(grad_out, kernel.InputAt(1), {});
  auto input_grad_x_grad = PrepareData(grad_x_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"grad_out", {(*input_grad_out).dims()}},
        {"grad_x_grad", {(*input_grad_x_grad).dims()}}};
    platform::RecordOpInfoSupplement("celu_double_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(grad_out_grad);
  {
    // Scope of the "infer_meta" profiler event.
    std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
    if (paddle::platform::RecordEvent::IsEnabled()) {
      infer_shape_record_event.reset(new paddle::platform::RecordEvent("celu_double_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
    }
    phi::MetaTensor meta_out_0(kernel_out_0);
    phi::MetaTensor meta_out_1(kernel_out_1);
    // A null output is forwarded to InferMeta as a null meta pointer.
    phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_x), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);
  }

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, float, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  {
    // Scope of the "compute" profiler event.
    std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
    if (paddle::platform::RecordEvent::IsEnabled()) {
      kernel_record_event.reset(new paddle::platform::RecordEvent("celu_double_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
    }
    (*kernel_fn)(*dev_ctx, *input_x, *input_grad_out, *input_grad_x_grad, alpha, kernel_out_0, kernel_out_1);
  }

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// Backward API entry for `celu_grad`.
//
// Parses the kernel key from the tensor inputs, selects the registered
// "celu_grad" kernel, prepares `x` and `out_grad`, configures the output meta
// for `x_grad` via UnchangedInferMeta (driven by `x`) and invokes the kernel
// with (x, out_grad, alpha). When the selected kernel is a CPU fallback, the
// output is handed to TransDataBackend together with the originally parsed
// backend.
//
// The profiler RecordEvents are owned by std::unique_ptr so that they are
// destroyed on every exit path, including an exception leaving InferMeta or
// the kernel call.
PADDLE_API void celu_grad(const Tensor& x, const Tensor& out_grad, float alpha, Tensor* x_grad) {
  // Nothing is pinned for this API, so backend, layout and dtype all come from
  // the highest-priority key parsed from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "celu_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "celu_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "celu_grad kernel: " << kernel;
  // A CPU-fallback kernel has to run on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("celu_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  {
    // Scope of the "infer_meta" profiler event.
    std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
    if (paddle::platform::RecordEvent::IsEnabled()) {
      infer_shape_record_event.reset(new paddle::platform::RecordEvent("celu_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
    }
    phi::MetaTensor meta_out(kernel_out);
    phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);
  }

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, float, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  {
    // Scope of the "compute" profiler event.
    std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
    if (paddle::platform::RecordEvent::IsEnabled()) {
      kernel_record_event.reset(new paddle::platform::RecordEvent("celu_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
    }
    (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, alpha, kernel_out);
  }

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API entry for `clip_double_grad`.
//
// Parses the kernel key from the tensor inputs and dispatches to the kernel
// registered under the name "clip_grad" (this API reuses that kernel rather
// than having one of its own name). Prepares `x` and `grad_x_grad`, configures
// the output meta for `grad_out_grad` via UnchangedInferMeta (driven by `x`)
// and invokes the kernel with (x, grad_x_grad, min, max). When the selected
// kernel is a CPU fallback, the output is handed to TransDataBackend together
// with the originally parsed backend.
//
// The profiler RecordEvents are owned by std::unique_ptr so that they are
// destroyed on every exit path, including an exception leaving InferMeta or
// the kernel call.
PADDLE_API void clip_double_grad(const Tensor& x, const Tensor& grad_x_grad, const Scalar& min, const Scalar& max, Tensor* grad_out_grad) {
  // Nothing is pinned for this API, so backend, layout and dtype all come from
  // the highest-priority key parsed from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, grad_x_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "clip_double_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "clip_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "clip_grad kernel: " << kernel;
  // A CPU-fallback kernel has to run on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_grad_x_grad = PrepareData(grad_x_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"grad_x_grad", {(*input_grad_x_grad).dims()}}};
    platform::RecordOpInfoSupplement("clip_double_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(grad_out_grad);
  {
    // Scope of the "infer_meta" profiler event.
    std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
    if (paddle::platform::RecordEvent::IsEnabled()) {
      infer_shape_record_event.reset(new paddle::platform::RecordEvent("clip_double_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
    }
    phi::MetaTensor meta_out(kernel_out);
    phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);
  }

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::Scalar&, const phi::Scalar&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  {
    // Scope of the "compute" profiler event.
    std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
    if (paddle::platform::RecordEvent::IsEnabled()) {
      kernel_record_event.reset(new paddle::platform::RecordEvent("clip_double_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
    }
    (*kernel_fn)(*dev_ctx, *input_x, *input_grad_x_grad, phi::Scalar(min), phi::Scalar(max), kernel_out);
  }

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API entry for `clip_grad`.
//
// Parses the kernel key from the tensor inputs, selects the registered
// "clip_grad" kernel, prepares `x` and `out_grad`, configures the output meta
// for `x_grad` via UnchangedInferMeta (driven by `x`) and invokes the kernel
// with (x, out_grad, min, max). When the selected kernel is a CPU fallback,
// the output is handed to TransDataBackend together with the originally
// parsed backend.
//
// The profiler RecordEvents are owned by std::unique_ptr so that they are
// destroyed on every exit path, including an exception leaving InferMeta or
// the kernel call.
PADDLE_API void clip_grad(const Tensor& x, const Tensor& out_grad, const Scalar& min, const Scalar& max, Tensor* x_grad) {
  // Nothing is pinned for this API, so backend, layout and dtype all come from
  // the highest-priority key parsed from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "clip_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "clip_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "clip_grad kernel: " << kernel;
  // A CPU-fallback kernel has to run on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("clip_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  {
    // Scope of the "infer_meta" profiler event.
    std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
    if (paddle::platform::RecordEvent::IsEnabled()) {
      infer_shape_record_event.reset(new paddle::platform::RecordEvent("clip_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
    }
    phi::MetaTensor meta_out(kernel_out);
    phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);
  }

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::Scalar&, const phi::Scalar&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  {
    // Scope of the "compute" profiler event.
    std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
    if (paddle::platform::RecordEvent::IsEnabled()) {
      kernel_record_event.reset(new paddle::platform::RecordEvent("clip_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
    }
    (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, phi::Scalar(min), phi::Scalar(max), kernel_out);
  }

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API entry for `complex_grad`.
//
// The kernel dtype is taken from ParseDataType(x); only if that yields
// DataType::UNDEFINED does it fall back to the dtype of the key parsed from
// all tensor inputs. Backend and layout always come from the parsed key. The
// registered "complex_grad" kernel is then selected, `x`, `y` and `out_grad`
// are prepared, both outputs (`x_grad`, `y_grad`) are configured via
// ComplexGradInferMeta, and the kernel is invoked with (x, y, out_grad). When
// the selected kernel is a CPU fallback, both outputs are handed to
// TransDataBackend together with the originally parsed backend.
//
// The profiler RecordEvents are owned by std::unique_ptr so that they are
// destroyed on every exit path, including an exception leaving InferMeta or
// the kernel call.
PADDLE_API void complex_grad(const Tensor& x, const Tensor& y, const Tensor& out_grad, Tensor* x_grad, Tensor* y_grad) {
  // The dtype is pinned to `x` first; the parsed key is only a fallback for it.
  DataType kernel_data_type = ParseDataType(x);

  auto kernel_key_set = ParseKernelKeyByInputArgs(x, y, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "complex_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "complex_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "complex_grad kernel: " << kernel;
  // A CPU-fallback kernel has to run on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_y = PrepareData(y, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"y", {(*input_y).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("complex_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(y_grad);
  {
    // Scope of the "infer_meta" profiler event.
    std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
    if (paddle::platform::RecordEvent::IsEnabled()) {
      infer_shape_record_event.reset(new paddle::platform::RecordEvent("complex_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
    }
    phi::MetaTensor meta_out_0(kernel_out_0);
    phi::MetaTensor meta_out_1(kernel_out_1);
    // A null output is forwarded to InferMeta as a null meta pointer.
    phi::ComplexGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_y), MakeMetaTensor(*input_out_grad), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);
  }

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  {
    // Scope of the "compute" profiler event.
    std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
    if (paddle::platform::RecordEvent::IsEnabled()) {
      kernel_record_event.reset(new paddle::platform::RecordEvent("complex_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
    }
    (*kernel_fn)(*dev_ctx, *input_x, *input_y, *input_out_grad, kernel_out_0, kernel_out_1);
  }

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// Backward API entry for `concat_grad`.
//
// Parses the kernel key from the tensor inputs, selects the registered
// "concat_grad" kernel, prepares the `x` tensor list and `out_grad`,
// configures the `x_grad` outputs via UnchangedMultiInferMeta (driven by `x`)
// and invokes the kernel with (x, out_grad, axis). When the selected kernel is
// a CPU fallback, the outputs are handed to TransDataBackend together with the
// originally parsed backend.
//
// The profiler RecordEvents are owned by std::unique_ptr so that they are
// destroyed on every exit path, including an exception leaving InferMeta or
// the kernel call.
PADDLE_API void concat_grad(const std::vector<Tensor>& x, const Tensor& out_grad, const Scalar& axis, std::vector<Tensor*> x_grad) {
  // Nothing is pinned for this API, so backend, layout and dtype all come from
  // the highest-priority key parsed from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "concat_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "concat_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "concat_grad kernel: " << kernel;
  // A CPU-fallback kernel has to run on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x_vec = PrepareData(x, kernel.InputAt(0), {});
  // The kernel takes pointers into the prepared tensors.
  std::vector<const phi::DenseTensor*> input_x(input_x_vec->size());
  for (size_t i = 0; i < input_x.size(); ++i) {
    input_x[i] = &input_x_vec->at(i);
  }
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out_grad", {(*input_out_grad).dims()}}};
    std::vector<phi::DDim> ddims_vec;
    ddims_vec.reserve(input_x.size());
    for (size_t i = 0; i < input_x.size(); ++i) {
      ddims_vec.emplace_back((*input_x[i]).dims());
    }
    input_shapes.emplace_back("x", ddims_vec);
    platform::RecordOpInfoSupplement("concat_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(&x_grad);
  {
    // Scope of the "infer_meta" profiler event.
    std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
    if (paddle::platform::RecordEvent::IsEnabled()) {
      infer_shape_record_event.reset(new paddle::platform::RecordEvent("concat_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
    }

    auto x_meta_vec = MakeMetaTensor(input_x);
    std::vector<const phi::MetaTensor*> x_metas(x_meta_vec.size());
    for (size_t i = 0; i < x_meta_vec.size(); ++i) {
      x_metas[i] = &x_meta_vec[i];
    }

    auto kernel_out_meta_vec = MakeMetaTensor(kernel_out);
    std::vector<phi::MetaTensor*> kernel_out_metas(kernel_out_meta_vec.size());
    for (size_t i = 0; i < kernel_out_meta_vec.size(); ++i) {
      // A null output slot is forwarded to InferMeta as a null meta pointer.
      kernel_out_metas[i] = kernel_out[i] ? &kernel_out_meta_vec[i] : nullptr;
    }
    phi::UnchangedMultiInferMeta(x_metas, kernel_out_metas);
  }

  using kernel_signature = void(*)(const platform::DeviceContext&, const std::vector<const phi::DenseTensor*>&, const phi::DenseTensor&, const phi::Scalar&, std::vector<phi::DenseTensor*>&);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  {
    // Scope of the "compute" profiler event.
    std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
    if (paddle::platform::RecordEvent::IsEnabled()) {
      kernel_record_event.reset(new paddle::platform::RecordEvent("concat_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
    }
    (*kernel_fn)(*dev_ctx, input_x, *input_out_grad, phi::Scalar(axis), kernel_out);
  }

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API entry for `conj_grad`.
//
// Parses the kernel key from `out_grad` and dispatches to the kernel
// registered under the name "conj" (this API reuses that kernel rather than
// having one of its own name). Prepares `out_grad`, configures the output meta
// for `x_grad` via UnchangedInferMeta (driven by `out_grad`) and invokes the
// kernel. When the selected kernel is a CPU fallback, the output is handed to
// TransDataBackend together with the originally parsed backend.
//
// The profiler RecordEvents are owned by std::unique_ptr so that they are
// destroyed on every exit path, including an exception leaving InferMeta or
// the kernel call.
PADDLE_API void conj_grad(const Tensor& out_grad, Tensor* x_grad) {
  // Nothing is pinned for this API, so backend, layout and dtype all come from
  // the highest-priority key parsed from the input.
  auto kernel_key_set = ParseKernelKeyByInputArgs(out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "conj_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "conj", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "conj kernel: " << kernel;
  // A CPU-fallback kernel has to run on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(0), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("conj_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  {
    // Scope of the "infer_meta" profiler event.
    std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
    if (paddle::platform::RecordEvent::IsEnabled()) {
      infer_shape_record_event.reset(new paddle::platform::RecordEvent("conj_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
    }
    phi::MetaTensor meta_out(kernel_out);
    phi::UnchangedInferMeta(MakeMetaTensor(*input_out_grad), &meta_out);
  }

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  {
    // Scope of the "compute" profiler event.
    std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
    if (paddle::platform::RecordEvent::IsEnabled()) {
      kernel_record_event.reset(new paddle::platform::RecordEvent("conj_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
    }
    (*kernel_fn)(*dev_ctx, *input_out_grad, kernel_out);
  }

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API entry point that selects and runs the "conv2d_grad" phi kernel.
//
// The kernel key (backend, layout, dtype) is taken from the highest-priority
// key parsed from `input`, `filter` and `out_grad`. The remaining attributes
// are forwarded unchanged to the kernel. Results are written through
// `input_grad` and `filter_grad`.
//
// When the selected kernel reports `has_fallback_cpu`, it runs on the CPU
// device context and each output is afterwards passed to TransDataBackend
// together with the originally selected backend.
//
// Profiler events are owned by std::unique_ptr so that they are destroyed
// even if infer-meta or the kernel invocation throws.
PADDLE_API void conv2d_grad(const Tensor& input,
                            const Tensor& filter,
                            const Tensor& out_grad,
                            const std::vector<int>& strides,
                            const std::vector<int>& paddings,
                            const std::string& paddding_algorithm,
                            int groups,
                            const std::vector<int>& dilations,
                            const std::string& data_format,
                            bool use_addto,
                            int workspace_size_MB,
                            bool exhaustive_search,
                            Tensor* input_grad,
                            Tensor* filter_grad) {
  // No key component is fixed up front for this API, so all three come
  // straight from the key derived from the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(input, filter, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "conv2d_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "conv2d_grad", {kernel_backend, kernel_layout, kernel_data_type}, true);
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "conv2d_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  // Convert the API tensors into what the selected kernel expects.
  auto input_input = PrepareData(input, kernel.InputAt(0), {});
  auto input_filter = PrepareData(filter, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"input", {(*input_input).dims()}},
        {"filter", {(*input_filter).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("conv2d_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(input_grad);
  auto kernel_out_1 = SetKernelOutput(filter_grad);

  // Infer output meta; the event only exists while profiling is enabled.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "conv2d_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_input),
                                  MakeMetaTensor(*input_filter),
                                  kernel_out_0 ? &meta_out_0 : nullptr,
                                  kernel_out_1 ? &meta_out_1 : nullptr);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const std::vector<int>&, const std::vector<int>&, const std::string&, int, const std::vector<int>&, const std::string&, bool, int, bool, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "conv2d_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_input, *input_filter, *input_out_grad, strides, paddings, paddding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// Backward API entry point that selects and runs the "conv2d_grad_grad" phi
// kernel.
//
// The kernel key (backend, layout, dtype) is taken from the highest-priority
// key parsed from `input`, `filter`, `grad_out` and the two optional tensors
// `grad_input_grad` / `grad_filter_grad`. Results are written through
// `input_grad`, `filter_grad` and `grad_out_grad`.
//
// When the selected kernel reports `has_fallback_cpu`, it runs on the CPU
// device context and each output is afterwards passed to TransDataBackend
// together with the originally selected backend.
//
// Profiler events are owned by std::unique_ptr so that they are destroyed
// even if infer-meta or the kernel invocation throws.
PADDLE_API void conv2d_grad_grad(const Tensor& input,
                                 const Tensor& filter,
                                 const Tensor& grad_out,
                                 const paddle::optional<Tensor>& grad_input_grad,
                                 const paddle::optional<Tensor>& grad_filter_grad,
                                 const std::vector<int>& strides,
                                 const std::vector<int>& paddings,
                                 const std::string& paddding_algorithm,
                                 int groups,
                                 const std::vector<int>& dilations,
                                 const std::string& data_format,
                                 bool use_addto,
                                 int workspace_size_MB,
                                 bool exhaustive_search,
                                 Tensor* input_grad,
                                 Tensor* filter_grad,
                                 Tensor* grad_out_grad) {
  // No key component is fixed up front for this API, so all three come
  // straight from the key derived from the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(input, filter, grad_out, grad_input_grad, grad_filter_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "conv2d_grad_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "conv2d_grad_grad", {kernel_backend, kernel_layout, kernel_data_type}, true);
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "conv2d_grad_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  // Convert the API tensors into what the selected kernel expects.
  auto input_input = PrepareData(input, kernel.InputAt(0), {});
  auto input_filter = PrepareData(filter, kernel.InputAt(1), {});
  auto input_grad_out = PrepareData(grad_out, kernel.InputAt(2), {});
  auto input_grad_input_grad = PrepareData(grad_input_grad, kernel.InputAt(3), {});
  auto input_grad_filter_grad = PrepareData(grad_filter_grad, kernel.InputAt(4), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Optional inputs contribute a shape only when they are present.
    std::vector<phi::DDim> grad_input_grad_record_shapes;
    if (input_grad_input_grad) {
      grad_input_grad_record_shapes.push_back((*input_grad_input_grad).dims());
    }
    std::vector<phi::DDim> grad_filter_grad_record_shapes;
    if (input_grad_filter_grad) {
      grad_filter_grad_record_shapes.push_back((*input_grad_filter_grad).dims());
    }
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"input", {(*input_input).dims()}},
        {"filter", {(*input_filter).dims()}},
        {"grad_out", {(*input_grad_out).dims()}},
        {"grad_input_grad", grad_input_grad_record_shapes},
        {"grad_filter_grad", grad_filter_grad_record_shapes}};
    platform::RecordOpInfoSupplement("conv2d_grad_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(input_grad);
  auto kernel_out_1 = SetKernelOutput(filter_grad);
  auto kernel_out_2 = SetKernelOutput(grad_out_grad);

  // Infer output meta; the event only exists while profiling is enabled.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "conv2d_grad_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);
  phi::MetaTensor meta_out_2(kernel_out_2);

  phi::GeneralTernaryGradInferMeta(MakeMetaTensor(*input_input),
                                   MakeMetaTensor(*input_filter),
                                   MakeMetaTensor(*input_grad_out),
                                   kernel_out_0 ? &meta_out_0 : nullptr,
                                   kernel_out_1 ? &meta_out_1 : nullptr,
                                   kernel_out_2 ? &meta_out_2 : nullptr);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const paddle::optional<phi::DenseTensor>&, const paddle::optional<phi::DenseTensor>&, const std::vector<int>&, const std::vector<int>&, const std::string&, int, const std::vector<int>&, const std::string&, bool, int, bool, phi::DenseTensor*, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "conv2d_grad_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_input, *input_filter, *input_grad_out, input_grad_input_grad, input_grad_filter_grad, strides, paddings, paddding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search, kernel_out_0, kernel_out_1, kernel_out_2);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
    TransDataBackend(kernel_out_2, kernel_backend, kernel_out_2);
  }
}

// Backward API entry point for conv2d_transpose_double_grad. Note that the
// kernel it selects is registered under the name "conv2d_transpose_grad_grad".
//
// The kernel key (backend, layout, dtype) is taken from the highest-priority
// key parsed from `x`, `filter`, `grad_out`, `grad_x_grad` and
// `grad_filter_grad`. Results are written through `x_grad`, `filter_grad` and
// `grad_out_grad`.
//
// When the selected kernel reports `has_fallback_cpu`, it runs on the CPU
// device context and each output is afterwards passed to TransDataBackend
// together with the originally selected backend.
//
// Profiler events are owned by std::unique_ptr so that they are destroyed
// even if infer-meta or the kernel invocation throws.
PADDLE_API void conv2d_transpose_double_grad(const Tensor& x,
                                             const Tensor& filter,
                                             const Tensor& grad_out,
                                             const Tensor& grad_x_grad,
                                             const Tensor& grad_filter_grad,
                                             const std::vector<int>& strides,
                                             const std::vector<int>& paddings,
                                             const std::vector<int>& output_padding,
                                             const IntArray& output_size,
                                             const std::string& padding_algorithm,
                                             int groups,
                                             const std::vector<int>& dilations,
                                             const std::string& data_format,
                                             Tensor* x_grad,
                                             Tensor* filter_grad,
                                             Tensor* grad_out_grad) {
  // No key component is fixed up front for this API, so all three come
  // straight from the key derived from the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, filter, grad_out, grad_x_grad, grad_filter_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "conv2d_transpose_double_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "conv2d_transpose_grad_grad", {kernel_backend, kernel_layout, kernel_data_type}, true);
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "conv2d_transpose_grad_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  // Convert the API tensors into what the selected kernel expects.
  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_filter = PrepareData(filter, kernel.InputAt(1), {});
  auto input_grad_out = PrepareData(grad_out, kernel.InputAt(2), {});
  auto input_grad_x_grad = PrepareData(grad_x_grad, kernel.InputAt(3), {});
  auto input_grad_filter_grad = PrepareData(grad_filter_grad, kernel.InputAt(4), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"filter", {(*input_filter).dims()}},
        {"grad_out", {(*input_grad_out).dims()}},
        {"grad_x_grad", {(*input_grad_x_grad).dims()}},
        {"grad_filter_grad", {(*input_grad_filter_grad).dims()}}};
    platform::RecordOpInfoSupplement("conv2d_transpose_double_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(filter_grad);
  auto kernel_out_2 = SetKernelOutput(grad_out_grad);

  // Infer output meta; the event only exists while profiling is enabled.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "conv2d_transpose_double_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);
  phi::MetaTensor meta_out_2(kernel_out_2);

  phi::Conv2dTransposeDoubleGradInferMeta(MakeMetaTensor(*input_x),
                                          MakeMetaTensor(*input_filter),
                                          MakeMetaTensor(*input_grad_out),
                                          MakeMetaTensor(*input_grad_x_grad),
                                          MakeMetaTensor(*input_grad_filter_grad),
                                          strides,
                                          paddings,
                                          output_padding,
                                          output_size,
                                          padding_algorithm,
                                          groups,
                                          dilations,
                                          data_format,
                                          kernel_out_0 ? &meta_out_0 : nullptr,
                                          kernel_out_1 ? &meta_out_1 : nullptr,
                                          kernel_out_2 ? &meta_out_2 : nullptr);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const std::vector<int>&, const std::vector<int>&, const std::vector<int>&, const phi::IntArray&, const std::string&, int, const std::vector<int>&, const std::string&, phi::DenseTensor*, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "conv2d_transpose_double_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_filter, *input_grad_out, *input_grad_x_grad, *input_grad_filter_grad, strides, paddings, output_padding, phi::IntArray(output_size), padding_algorithm, groups, dilations, data_format, kernel_out_0, kernel_out_1, kernel_out_2);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
    TransDataBackend(kernel_out_2, kernel_backend, kernel_out_2);
  }
}

// Backward API entry point that selects and runs the "conv2d_transpose_grad"
// phi kernel.
//
// The kernel key (backend, layout, dtype) is taken from the highest-priority
// key parsed from `x`, `filter` and `out_grad`. Results are written through
// `x_grad` and `filter_grad`.
//
// When the selected kernel reports `has_fallback_cpu`, it runs on the CPU
// device context and each output is afterwards passed to TransDataBackend
// together with the originally selected backend.
//
// Profiler events are owned by std::unique_ptr so that they are destroyed
// even if infer-meta or the kernel invocation throws.
PADDLE_API void conv2d_transpose_grad(const Tensor& x,
                                      const Tensor& filter,
                                      const Tensor& out_grad,
                                      const std::vector<int>& strides,
                                      const std::vector<int>& paddings,
                                      const std::vector<int>& output_padding,
                                      const IntArray& output_size,
                                      const std::string& padding_algorithm,
                                      int groups,
                                      const std::vector<int>& dilations,
                                      const std::string& data_format,
                                      Tensor* x_grad,
                                      Tensor* filter_grad) {
  // No key component is fixed up front for this API, so all three come
  // straight from the key derived from the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, filter, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "conv2d_transpose_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "conv2d_transpose_grad", {kernel_backend, kernel_layout, kernel_data_type}, true);
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "conv2d_transpose_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  // Convert the API tensors into what the selected kernel expects.
  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_filter = PrepareData(filter, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"filter", {(*input_filter).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("conv2d_transpose_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(filter_grad);

  // Infer output meta; the event only exists while profiling is enabled.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "conv2d_transpose_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  phi::Conv2dTransposeGradInferMeta(MakeMetaTensor(*input_x),
                                    MakeMetaTensor(*input_filter),
                                    MakeMetaTensor(*input_out_grad),
                                    strides,
                                    paddings,
                                    output_padding,
                                    output_size,
                                    padding_algorithm,
                                    groups,
                                    dilations,
                                    data_format,
                                    kernel_out_0 ? &meta_out_0 : nullptr,
                                    kernel_out_1 ? &meta_out_1 : nullptr);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const std::vector<int>&, const std::vector<int>&, const std::vector<int>&, const phi::IntArray&, const std::string&, int, const std::vector<int>&, const std::string&, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "conv2d_transpose_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_filter, *input_out_grad, strides, paddings, output_padding, phi::IntArray(output_size), padding_algorithm, groups, dilations, data_format, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// Backward API entry point that selects and runs the "conv3d_grad" phi kernel.
//
// The kernel key (backend, layout, dtype) is taken from the highest-priority
// key parsed from `input`, `filter` and `out_grad`. The remaining attributes
// are forwarded unchanged to the kernel. Results are written through
// `input_grad` and `filter_grad`.
//
// When the selected kernel reports `has_fallback_cpu`, it runs on the CPU
// device context and each output is afterwards passed to TransDataBackend
// together with the originally selected backend.
//
// Profiler events are owned by std::unique_ptr so that they are destroyed
// even if infer-meta or the kernel invocation throws.
PADDLE_API void conv3d_grad(const Tensor& input,
                            const Tensor& filter,
                            const Tensor& out_grad,
                            const std::vector<int>& strides,
                            const std::vector<int>& paddings,
                            const std::string& paddding_algorithm,
                            int groups,
                            const std::vector<int>& dilations,
                            const std::string& data_format,
                            bool use_addto,
                            int workspace_size_MB,
                            bool exhaustive_search,
                            Tensor* input_grad,
                            Tensor* filter_grad) {
  // No key component is fixed up front for this API, so all three come
  // straight from the key derived from the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(input, filter, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "conv3d_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "conv3d_grad", {kernel_backend, kernel_layout, kernel_data_type}, true);
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "conv3d_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  // Convert the API tensors into what the selected kernel expects.
  auto input_input = PrepareData(input, kernel.InputAt(0), {});
  auto input_filter = PrepareData(filter, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"input", {(*input_input).dims()}},
        {"filter", {(*input_filter).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("conv3d_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(input_grad);
  auto kernel_out_1 = SetKernelOutput(filter_grad);

  // Infer output meta; the event only exists while profiling is enabled.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "conv3d_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_input),
                                  MakeMetaTensor(*input_filter),
                                  kernel_out_0 ? &meta_out_0 : nullptr,
                                  kernel_out_1 ? &meta_out_1 : nullptr);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const std::vector<int>&, const std::vector<int>&, const std::string&, int, const std::vector<int>&, const std::string&, bool, int, bool, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "conv3d_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_input, *input_filter, *input_out_grad, strides, paddings, paddding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// Backward API entry point that selects and runs the "conv3d_grad_grad" phi
// kernel.
//
// The kernel key (backend, layout, dtype) is taken from the highest-priority
// key parsed from `input`, `filter`, `grad_out` and the two optional tensors
// `grad_input_grad` / `grad_filter_grad`. Results are written through
// `input_grad`, `filter_grad` and `grad_out_grad`.
//
// When the selected kernel reports `has_fallback_cpu`, it runs on the CPU
// device context and each output is afterwards passed to TransDataBackend
// together with the originally selected backend.
//
// Profiler events are owned by std::unique_ptr so that they are destroyed
// even if infer-meta or the kernel invocation throws.
PADDLE_API void conv3d_grad_grad(const Tensor& input,
                                 const Tensor& filter,
                                 const Tensor& grad_out,
                                 const paddle::optional<Tensor>& grad_input_grad,
                                 const paddle::optional<Tensor>& grad_filter_grad,
                                 const std::vector<int>& strides,
                                 const std::vector<int>& paddings,
                                 const std::string& paddding_algorithm,
                                 int groups,
                                 const std::vector<int>& dilations,
                                 const std::string& data_format,
                                 bool use_addto,
                                 int workspace_size_MB,
                                 bool exhaustive_search,
                                 Tensor* input_grad,
                                 Tensor* filter_grad,
                                 Tensor* grad_out_grad) {
  // No key component is fixed up front for this API, so all three come
  // straight from the key derived from the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(input, filter, grad_out, grad_input_grad, grad_filter_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "conv3d_grad_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "conv3d_grad_grad", {kernel_backend, kernel_layout, kernel_data_type}, true);
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "conv3d_grad_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  // Convert the API tensors into what the selected kernel expects.
  auto input_input = PrepareData(input, kernel.InputAt(0), {});
  auto input_filter = PrepareData(filter, kernel.InputAt(1), {});
  auto input_grad_out = PrepareData(grad_out, kernel.InputAt(2), {});
  auto input_grad_input_grad = PrepareData(grad_input_grad, kernel.InputAt(3), {});
  auto input_grad_filter_grad = PrepareData(grad_filter_grad, kernel.InputAt(4), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Optional inputs contribute a shape only when they are present.
    std::vector<phi::DDim> grad_input_grad_record_shapes;
    if (input_grad_input_grad) {
      grad_input_grad_record_shapes.push_back((*input_grad_input_grad).dims());
    }
    std::vector<phi::DDim> grad_filter_grad_record_shapes;
    if (input_grad_filter_grad) {
      grad_filter_grad_record_shapes.push_back((*input_grad_filter_grad).dims());
    }
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"input", {(*input_input).dims()}},
        {"filter", {(*input_filter).dims()}},
        {"grad_out", {(*input_grad_out).dims()}},
        {"grad_input_grad", grad_input_grad_record_shapes},
        {"grad_filter_grad", grad_filter_grad_record_shapes}};
    platform::RecordOpInfoSupplement("conv3d_grad_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(input_grad);
  auto kernel_out_1 = SetKernelOutput(filter_grad);
  auto kernel_out_2 = SetKernelOutput(grad_out_grad);

  // Infer output meta; the event only exists while profiling is enabled.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "conv3d_grad_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);
  phi::MetaTensor meta_out_2(kernel_out_2);

  phi::GeneralTernaryGradInferMeta(MakeMetaTensor(*input_input),
                                   MakeMetaTensor(*input_filter),
                                   MakeMetaTensor(*input_grad_out),
                                   kernel_out_0 ? &meta_out_0 : nullptr,
                                   kernel_out_1 ? &meta_out_1 : nullptr,
                                   kernel_out_2 ? &meta_out_2 : nullptr);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const paddle::optional<phi::DenseTensor>&, const paddle::optional<phi::DenseTensor>&, const std::vector<int>&, const std::vector<int>&, const std::string&, int, const std::vector<int>&, const std::string&, bool, int, bool, phi::DenseTensor*, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "conv3d_grad_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_input, *input_filter, *input_grad_out, input_grad_input_grad, input_grad_filter_grad, strides, paddings, paddding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search, kernel_out_0, kernel_out_1, kernel_out_2);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
    TransDataBackend(kernel_out_2, kernel_backend, kernel_out_2);
  }
}

// Backward API entry point that selects and runs the "conv3d_transpose_grad"
// phi kernel.
//
// The kernel key (backend, layout, dtype) is taken from the highest-priority
// key parsed from `x`, `filter` and `out_grad`. Results are written through
// `x_grad` and `filter_grad`.
//
// When the selected kernel reports `has_fallback_cpu`, it runs on the CPU
// device context and each output is afterwards passed to TransDataBackend
// together with the originally selected backend.
//
// Profiler events are owned by std::unique_ptr so that they are destroyed
// even if infer-meta or the kernel invocation throws.
PADDLE_API void conv3d_transpose_grad(const Tensor& x,
                                      const Tensor& filter,
                                      const Tensor& out_grad,
                                      const std::vector<int>& strides,
                                      const std::vector<int>& paddings,
                                      const std::vector<int>& output_padding,
                                      const std::vector<int>& output_size,
                                      const std::string& padding_algorithm,
                                      int groups,
                                      const std::vector<int>& dilations,
                                      const std::string& data_format,
                                      Tensor* x_grad,
                                      Tensor* filter_grad) {
  // No key component is fixed up front for this API, so all three come
  // straight from the key derived from the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, filter, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "conv3d_transpose_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "conv3d_transpose_grad", {kernel_backend, kernel_layout, kernel_data_type}, true);
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "conv3d_transpose_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  // Convert the API tensors into what the selected kernel expects.
  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_filter = PrepareData(filter, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"filter", {(*input_filter).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("conv3d_transpose_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(filter_grad);

  // Infer output meta; the event only exists while profiling is enabled.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "conv3d_transpose_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  phi::ConvTransposeGradInferMeta(MakeMetaTensor(*input_x),
                                  MakeMetaTensor(*input_filter),
                                  MakeMetaTensor(*input_out_grad),
                                  strides,
                                  paddings,
                                  output_padding,
                                  output_size,
                                  padding_algorithm,
                                  groups,
                                  dilations,
                                  data_format,
                                  kernel_out_0 ? &meta_out_0 : nullptr,
                                  kernel_out_1 ? &meta_out_1 : nullptr);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const std::vector<int>&, const std::vector<int>&, const std::vector<int>&, const std::vector<int>&, const std::string&, int, const std::vector<int>&, const std::string&, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "conv3d_transpose_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_filter, *input_out_grad, strides, paddings, output_padding, output_size, padding_algorithm, groups, dilations, data_format, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// Backward API entry point that selects and runs the "cos_grad" phi kernel.
//
// The kernel key (backend, layout, dtype) is taken from the highest-priority
// key parsed from `x` and `out_grad`. The output meta is set by
// UnchangedInferMeta from `x`, and the result is written through `x_grad`.
//
// When the selected kernel reports `has_fallback_cpu`, it runs on the CPU
// device context and the output is afterwards passed to TransDataBackend
// together with the originally selected backend.
//
// Profiler events are owned by std::unique_ptr so that they are destroyed
// even if infer-meta or the kernel invocation throws.
PADDLE_API void cos_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {
  // No key component is fixed up front for this API, so all three come
  // straight from the key derived from the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "cos_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "cos_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "cos_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  // Convert the API tensors into what the selected kernel expects.
  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("cos_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Infer output meta; the event only exists while profiling is enabled.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "cos_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "cos_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "cosh_grad" kernel for inputs `x` and `out_grad` and writes
// the result into `x_grad`.
//
// Flow: parse the kernel key (backend, layout, dtype) from the tensor inputs,
// select the kernel, prepare the inputs for it, run UnchangedInferMeta on `x`
// to fill in the output meta, then invoke the kernel. When kernel selection
// reports a CPU fallback, the kernel runs on the CPU device context and the
// output is afterwards passed through TransDataBackend targeting the backend
// that was originally requested.
PADDLE_API void cosh_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {
  // Nothing pins backend/layout/dtype for this API, so all three are taken
  // from the highest-priority kernel key parsed from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "cosh_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "cosh_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "cosh_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});

  // Report the input shapes to the profiler only when it asks for them.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("cosh_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler event is owned by a unique_ptr so that it is destroyed even
  // if infer-meta throws; reset() ends it at the same point as before.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent(
        "cosh_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);
  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event around the kernel call.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent(
        "cosh_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "crop_tensor_grad" kernel for inputs `x` and `out_grad` with
// the `offsets` attribute, and writes the result into `x_grad`.
//
// Flow: take the kernel dtype from `x` (falling back to the parsed kernel key
// if that yields UNDEFINED), take backend and layout from the kernel key,
// select the kernel, prepare inputs, run CropTensorGradInferMeta, then invoke
// the kernel. When kernel selection reports a CPU fallback, the kernel runs
// on the CPU device context and the output is afterwards passed through
// TransDataBackend targeting the backend that was originally requested.
PADDLE_API void crop_tensor_grad(const Tensor& x, const Tensor& out_grad, const IntArray& offsets, Tensor* x_grad) {
  // The dtype is pinned to that of `x`; the key parsed from the inputs is
  // only used for it when ParseDataType could not determine one.
  DataType kernel_data_type = ParseDataType(x);

  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "crop_tensor_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "crop_tensor_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "crop_tensor_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});

  // Report the input shapes to the profiler only when it asks for them.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("crop_tensor_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler event is owned by a unique_ptr so that it is destroyed even
  // if infer-meta throws; reset() ends it at the same point as before.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent(
        "crop_tensor_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);
  phi::CropTensorGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_out_grad), offsets, &meta_out);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::IntArray&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event around the kernel call.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent(
        "crop_tensor_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, phi::IntArray(offsets), kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "cross_entropy_with_softmax_grad" kernel for inputs `label`,
// `softmax` and `loss_grad` with the given attributes, and writes the result
// into `input_grad`.
//
// Flow: take the kernel dtype from `softmax` (falling back to the parsed
// kernel key if that yields UNDEFINED), take backend and layout from the
// kernel key, select the kernel, prepare inputs, run
// CrossEntropyWithSoftmaxGradInferMeta, then invoke the kernel. When kernel
// selection reports a CPU fallback, the kernel runs on the CPU device context
// and the output is afterwards passed through TransDataBackend targeting the
// backend that was originally requested.
PADDLE_API void cross_entropy_with_softmax_grad(const Tensor& label, const Tensor& softmax, const Tensor& loss_grad, bool soft_label, bool use_softmax, bool numeric_stable_mode, int ignore_index, int axis, Tensor* input_grad) {
  // The dtype is pinned to that of `softmax`; the key parsed from the inputs
  // is only used for it when ParseDataType could not determine one.
  DataType kernel_data_type = ParseDataType(softmax);

  auto kernel_key_set = ParseKernelKeyByInputArgs(label, softmax, loss_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "cross_entropy_with_softmax_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "cross_entropy_with_softmax_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "cross_entropy_with_softmax_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_label = PrepareData(label, kernel.InputAt(0), {});
  auto input_softmax = PrepareData(softmax, kernel.InputAt(1), {});
  auto input_loss_grad = PrepareData(loss_grad, kernel.InputAt(2), {});

  // Report the input shapes to the profiler only when it asks for them.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"label", {(*input_label).dims()}},
        {"softmax", {(*input_softmax).dims()}},
        {"loss_grad", {(*input_loss_grad).dims()}}};
    platform::RecordOpInfoSupplement("cross_entropy_with_softmax_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(input_grad);

  // The profiler event is owned by a unique_ptr so that it is destroyed even
  // if infer-meta throws; reset() ends it at the same point as before.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent(
        "cross_entropy_with_softmax_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);
  phi::CrossEntropyWithSoftmaxGradInferMeta(MakeMetaTensor(*input_label), MakeMetaTensor(*input_softmax), MakeMetaTensor(*input_loss_grad), soft_label, use_softmax, numeric_stable_mode, ignore_index, axis, &meta_out);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, bool, bool, bool, int, int, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event around the kernel call.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent(
        "cross_entropy_with_softmax_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_label, *input_softmax, *input_loss_grad, soft_label, use_softmax, numeric_stable_mode, ignore_index, axis, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "cumprod_grad" kernel for inputs `x`, `out` and `out_grad`
// with the `dim` attribute, and writes the result into `x_grad`.
//
// Flow: parse the kernel key (backend, layout, dtype) from the tensor inputs,
// select the kernel, prepare the inputs for it, run UnchangedInferMeta on `x`
// to fill in the output meta, then invoke the kernel. When kernel selection
// reports a CPU fallback, the kernel runs on the CPU device context and the
// output is afterwards passed through TransDataBackend targeting the backend
// that was originally requested.
PADDLE_API void cumprod_grad(const Tensor& x, const Tensor& out, const Tensor& out_grad, int dim, Tensor* x_grad) {
  // Nothing pins backend/layout/dtype for this API, so all three are taken
  // from the highest-priority kernel key parsed from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "cumprod_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "cumprod_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "cumprod_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out = PrepareData(out, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});

  // Report the input shapes to the profiler only when it asks for them.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out", {(*input_out).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("cumprod_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler event is owned by a unique_ptr so that it is destroyed even
  // if infer-meta throws; reset() ends it at the same point as before.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent(
        "cumprod_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);
  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, int, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event around the kernel call.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent(
        "cumprod_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out, *input_out_grad, dim, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "deformable_conv_grad" kernel for inputs `x`, `offset`,
// `filter`, the optional `mask` and `out_grad`, writing results into `x_grad`,
// `offset_grad`, `filter_grad` and `mask_grad`.
//
// Flow: take the kernel dtype from `x` (falling back to the parsed kernel key
// if that yields UNDEFINED), take backend and layout from the kernel key,
// select the kernel, prepare inputs, run DeformableConvGradInferMeta (passing
// nullptr for any output whose kernel output pointer is null), then invoke the
// kernel. When kernel selection reports a CPU fallback, the kernel runs on the
// CPU device context and each output is afterwards passed through
// TransDataBackend targeting the backend that was originally requested.
PADDLE_API void deformable_conv_grad(const Tensor& x, const Tensor& offset, const Tensor& filter, const paddle::optional<Tensor>& mask, const Tensor& out_grad, const std::vector<int>& strides, const std::vector<int>& paddings, const std::vector<int>& dilations, int deformable_groups, int groups, int im2col_step, Tensor* x_grad, Tensor* offset_grad, Tensor* filter_grad, Tensor* mask_grad) {
  // The dtype is pinned to that of `x`; the key parsed from the inputs is
  // only used for it when ParseDataType could not determine one.
  DataType kernel_data_type = ParseDataType(x);

  auto kernel_key_set = ParseKernelKeyByInputArgs(x, offset, filter, mask, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "deformable_conv_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "deformable_conv_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "deformable_conv_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_offset = PrepareData(offset, kernel.InputAt(1), {});
  auto input_filter = PrepareData(filter, kernel.InputAt(2), {});
  auto input_mask = PrepareData(mask, kernel.InputAt(3), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(4), {});

  // Report the input shapes to the profiler only when it asks for them.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // `mask` is optional: its shape list stays empty when it is absent.
    std::vector<phi::DDim> mask_record_shapes;
    if (input_mask) {
      mask_record_shapes.push_back((*input_mask).dims());
    }
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"offset", {(*input_offset).dims()}},
        {"filter", {(*input_filter).dims()}},
        {"mask", mask_record_shapes},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("deformable_conv_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(offset_grad);
  auto kernel_out_2 = SetKernelOutput(filter_grad);
  auto kernel_out_3 = SetKernelOutput(mask_grad);

  // The profiler event is owned by a unique_ptr so that it is destroyed even
  // if infer-meta throws; reset() ends it at the same point as before.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent(
        "deformable_conv_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);
  phi::MetaTensor meta_out_2(kernel_out_2);
  phi::MetaTensor meta_out_3(kernel_out_3);
  phi::DeformableConvGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_offset), MakeMetaTensor(*input_filter), MakeMetaTensor(input_mask), MakeMetaTensor(*input_out_grad), strides, paddings, dilations, deformable_groups, groups, im2col_step, kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr, kernel_out_2 ? &meta_out_2 : nullptr, kernel_out_3 ? &meta_out_3 : nullptr);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const paddle::optional<phi::DenseTensor>&, const phi::DenseTensor&, const std::vector<int>&, const std::vector<int>&, const std::vector<int>&, int, int, int, phi::DenseTensor*, phi::DenseTensor*, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event around the kernel call.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent(
        "deformable_conv_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_offset, *input_filter, input_mask, *input_out_grad, strides, paddings, dilations, deformable_groups, groups, im2col_step, kernel_out_0, kernel_out_1, kernel_out_2, kernel_out_3);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
    TransDataBackend(kernel_out_2, kernel_backend, kernel_out_2);
    TransDataBackend(kernel_out_3, kernel_backend, kernel_out_3);
  }
}

// Dispatches the "depthwise_conv2d_grad" kernel for inputs `input`, `filter`
// and `out_grad`, writing results into `input_grad` and `filter_grad`.
//
// `use_gpudnn` is forwarded to kernel selection only; it is not passed to the
// kernel itself.
//
// Flow: parse the kernel key (backend, layout, dtype) from the tensor inputs,
// select the kernel, prepare inputs, run GeneralBinaryGradInferMeta on
// `input`/`filter` (passing nullptr for any output whose kernel output pointer
// is null), then invoke the kernel. When kernel selection reports a CPU
// fallback, the kernel runs on the CPU device context and each output is
// afterwards passed through TransDataBackend targeting the backend that was
// originally requested.
PADDLE_API void depthwise_conv2d_grad(const Tensor& input, const Tensor& filter, const Tensor& out_grad, const std::vector<int>& strides, const std::vector<int>& paddings, const std::string& paddding_algorithm, int groups, const std::vector<int>& dilations, const std::string& data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu, bool use_gpudnn, Tensor* input_grad, Tensor* filter_grad) {
  // Nothing pins backend/layout/dtype for this API, so all three are taken
  // from the highest-priority kernel key parsed from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(input, filter, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "depthwise_conv2d_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "depthwise_conv2d_grad", {kernel_backend, kernel_layout, kernel_data_type}, use_gpudnn);
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "depthwise_conv2d_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_input = PrepareData(input, kernel.InputAt(0), {});
  auto input_filter = PrepareData(filter, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});

  // Report the input shapes to the profiler only when it asks for them.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"input", {(*input_input).dims()}},
        {"filter", {(*input_filter).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("depthwise_conv2d_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(input_grad);
  auto kernel_out_1 = SetKernelOutput(filter_grad);

  // The profiler event is owned by a unique_ptr so that it is destroyed even
  // if infer-meta throws; reset() ends it at the same point as before.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent(
        "depthwise_conv2d_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);
  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_input), MakeMetaTensor(*input_filter), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const std::vector<int>&, const std::vector<int>&, const std::string&, int, const std::vector<int>&, const std::string&, bool, int, bool, bool, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event around the kernel call.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent(
        "depthwise_conv2d_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_input, *input_filter, *input_out_grad, strides, paddings, paddding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search, fuse_relu, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// Dispatches the "depthwise_conv2d_grad_grad" kernel for inputs `input`,
// `filter`, `grad_out` and the optional `grad_input_grad` /
// `grad_filter_grad`, writing results into `input_grad`, `filter_grad` and
// `grad_out_grad`.
//
// Flow: parse the kernel key (backend, layout, dtype) from the tensor inputs,
// select the kernel, prepare inputs, run GeneralTernaryGradInferMeta on
// `input`/`filter`/`grad_out` (passing nullptr for any output whose kernel
// output pointer is null), then invoke the kernel. When kernel selection
// reports a CPU fallback, the kernel runs on the CPU device context and each
// output is afterwards passed through TransDataBackend targeting the backend
// that was originally requested.
PADDLE_API void depthwise_conv2d_grad_grad(const Tensor& input, const Tensor& filter, const Tensor& grad_out, const paddle::optional<Tensor>& grad_input_grad, const paddle::optional<Tensor>& grad_filter_grad, const std::vector<int>& strides, const std::vector<int>& paddings, const std::string& paddding_algorithm, int groups, const std::vector<int>& dilations, const std::string& data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu, Tensor* input_grad, Tensor* filter_grad, Tensor* grad_out_grad) {
  // Nothing pins backend/layout/dtype for this API, so all three are taken
  // from the highest-priority kernel key parsed from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(input, filter, grad_out, grad_input_grad, grad_filter_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "depthwise_conv2d_grad_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "depthwise_conv2d_grad_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "depthwise_conv2d_grad_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_input = PrepareData(input, kernel.InputAt(0), {});
  auto input_filter = PrepareData(filter, kernel.InputAt(1), {});
  auto input_grad_out = PrepareData(grad_out, kernel.InputAt(2), {});
  auto input_grad_input_grad = PrepareData(grad_input_grad, kernel.InputAt(3), {});
  auto input_grad_filter_grad = PrepareData(grad_filter_grad, kernel.InputAt(4), {});

  // Report the input shapes to the profiler only when it asks for them.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Optional inputs contribute an empty shape list when they are absent.
    std::vector<phi::DDim> grad_input_grad_record_shapes;
    if (input_grad_input_grad) {
      grad_input_grad_record_shapes.push_back((*input_grad_input_grad).dims());
    }
    std::vector<phi::DDim> grad_filter_grad_record_shapes;
    if (input_grad_filter_grad) {
      grad_filter_grad_record_shapes.push_back((*input_grad_filter_grad).dims());
    }
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"input", {(*input_input).dims()}},
        {"filter", {(*input_filter).dims()}},
        {"grad_out", {(*input_grad_out).dims()}},
        {"grad_input_grad", grad_input_grad_record_shapes},
        {"grad_filter_grad", grad_filter_grad_record_shapes}};
    platform::RecordOpInfoSupplement("depthwise_conv2d_grad_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(input_grad);
  auto kernel_out_1 = SetKernelOutput(filter_grad);
  auto kernel_out_2 = SetKernelOutput(grad_out_grad);

  // The profiler event is owned by a unique_ptr so that it is destroyed even
  // if infer-meta throws; reset() ends it at the same point as before.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent(
        "depthwise_conv2d_grad_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);
  phi::MetaTensor meta_out_2(kernel_out_2);
  phi::GeneralTernaryGradInferMeta(MakeMetaTensor(*input_input), MakeMetaTensor(*input_filter), MakeMetaTensor(*input_grad_out), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr, kernel_out_2 ? &meta_out_2 : nullptr);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const paddle::optional<phi::DenseTensor>&, const paddle::optional<phi::DenseTensor>&, const std::vector<int>&, const std::vector<int>&, const std::string&, int, const std::vector<int>&, const std::string&, bool, int, bool, bool, phi::DenseTensor*, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event around the kernel call.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent(
        "depthwise_conv2d_grad_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_input, *input_filter, *input_grad_out, input_grad_input_grad, input_grad_filter_grad, strides, paddings, paddding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search, fuse_relu, kernel_out_0, kernel_out_1, kernel_out_2);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
    TransDataBackend(kernel_out_2, kernel_backend, kernel_out_2);
  }
}

// Dispatches the "depthwise_conv2d_transpose_grad" kernel for inputs `x`,
// `filter` and `out_grad`, writing results into `x_grad` and `filter_grad`.
//
// Flow: parse the kernel key (backend, layout, dtype) from the tensor inputs,
// select the kernel, prepare inputs, run Conv2dTransposeGradInferMeta (passing
// nullptr for any output whose kernel output pointer is null), then invoke the
// kernel. When kernel selection reports a CPU fallback, the kernel runs on the
// CPU device context and each output is afterwards passed through
// TransDataBackend targeting the backend that was originally requested.
PADDLE_API void depthwise_conv2d_transpose_grad(const Tensor& x, const Tensor& filter, const Tensor& out_grad, const std::vector<int>& strides, const std::vector<int>& paddings, const std::vector<int>& output_padding, const IntArray& output_size, const std::string& padding_algorithm, int groups, const std::vector<int>& dilations, const std::string& data_format, Tensor* x_grad, Tensor* filter_grad) {
  // Nothing pins backend/layout/dtype for this API, so all three are taken
  // from the highest-priority kernel key parsed from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, filter, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "depthwise_conv2d_transpose_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "depthwise_conv2d_transpose_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "depthwise_conv2d_transpose_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_filter = PrepareData(filter, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});

  // Report the input shapes to the profiler only when it asks for them.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"filter", {(*input_filter).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("depthwise_conv2d_transpose_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(filter_grad);

  // The profiler event is owned by a unique_ptr so that it is destroyed even
  // if infer-meta throws; reset() ends it at the same point as before.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent(
        "depthwise_conv2d_transpose_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);
  phi::Conv2dTransposeGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_filter), MakeMetaTensor(*input_out_grad), strides, paddings, output_padding, output_size, padding_algorithm, groups, dilations, data_format, kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const std::vector<int>&, const std::vector<int>&, const std::vector<int>&, const phi::IntArray&, const std::string&, int, const std::vector<int>&, const std::string&, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event around the kernel call.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent(
        "depthwise_conv2d_transpose_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_filter, *input_out_grad, strides, paddings, output_padding, phi::IntArray(output_size), padding_algorithm, groups, dilations, data_format, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

/// Backward API `det_grad`: selects and runs the phi `determinant_grad` kernel.
///
/// Backend, layout and dtype of the kernel key all come from the
/// highest-priority key parsed from the three tensor inputs.
///
/// @param x         Kernel input 0; also the source of the output meta
///                  (through phi::UnchangedInferMeta).
/// @param out       Kernel input 1.
/// @param out_grad  Kernel input 2.
/// @param x_grad    Tensor handed to SetKernelOutput; receives the kernel output.
PADDLE_API void det_grad(const Tensor& x, const Tensor& out, const Tensor& out_grad, Tensor* x_grad) {
  // Nothing pins the kernel key up front for this op, so every component is
  // read from the key parsed out of the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "det_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "determinant_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "determinant_grad kernel: " << kernel;
  // A CPU-fallback kernel must be given the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out = PrepareData(out, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"out", {input_out->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("det_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  // The profiler event is owned by a unique_ptr: reset() destroys it at the
  // same point as before, and the destructor covers an exceptional exit from
  // the call in between, so the event can no longer leak.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "det_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  // Same RAII treatment for the event that brackets the kernel launch.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "det_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out, *input_out_grad, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU fallback; hand the output to TransDataBackend
    // together with the backend that was originally selected.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

/// Backward API `divide_double_grad`: selects and runs the phi
/// `divide_double_grad` kernel.
///
/// The kernel dtype is taken from `out` (ParseDataType). Backend and layout,
/// and the dtype as well if `out` yields DataType::UNDEFINED, come from the
/// highest-priority kernel key parsed from all five tensor inputs.
///
/// @param y              Kernel input 0.
/// @param out            Kernel input 1; also decides the kernel dtype.
/// @param grad_x         Kernel input 2.
/// @param grad_x_grad    Optional kernel input 3.
/// @param grad_y_grad    Optional kernel input 4.
/// @param axis           Passed through to the kernel unchanged.
/// @param y_grad         Handed to SetKernelOutput; kernel output 0.
/// @param out_grad       Handed to SetKernelOutput; kernel output 1.
/// @param grad_out_grad  Handed to SetKernelOutput; kernel output 2.
PADDLE_API void divide_double_grad(const Tensor& y, const Tensor& out, const Tensor& grad_x, const paddle::optional<Tensor>& grad_x_grad, const paddle::optional<Tensor>& grad_y_grad, int axis, Tensor* y_grad, Tensor* out_grad, Tensor* grad_out_grad) {
  // dtype is pinned by `out` before the generic key lookup.
  DataType kernel_data_type = ParseDataType(out);

  // Backend and layout are never preset, so the parsed key is always needed.
  auto kernel_key_set = ParseKernelKeyByInputArgs(y, out, grad_x, grad_x_grad, grad_y_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "divide_double_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "divide_double_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "divide_double_grad kernel: " << kernel;
  // A CPU-fallback kernel must be given the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_y = PrepareData(y, kernel.InputAt(0), {});
  auto input_out = PrepareData(out, kernel.InputAt(1), {});
  auto input_grad_x = PrepareData(grad_x, kernel.InputAt(2), {});
  auto input_grad_x_grad = PrepareData(grad_x_grad, kernel.InputAt(3), {});
  auto input_grad_y_grad = PrepareData(grad_y_grad, kernel.InputAt(4), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Optional inputs contribute a shape only when they are present.
    std::vector<phi::DDim> grad_x_grad_record_shapes;
    if (input_grad_x_grad) {
      grad_x_grad_record_shapes.push_back((*input_grad_x_grad).dims());
    }
    std::vector<phi::DDim> grad_y_grad_record_shapes;
    if (input_grad_y_grad) {
      grad_y_grad_record_shapes.push_back((*input_grad_y_grad).dims());
    }
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"y", {input_y->dims()}},
        {"out", {input_out->dims()}},
        {"grad_x", {input_grad_x->dims()}},
        {"grad_x_grad", grad_x_grad_record_shapes},
        {"grad_y_grad", grad_y_grad_record_shapes}};
    platform::RecordOpInfoSupplement("divide_double_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(y_grad);
  auto kernel_out_1 = SetKernelOutput(out_grad);
  auto kernel_out_2 = SetKernelOutput(grad_out_grad);
  // The profiler event is owned by a unique_ptr: reset() destroys it at the
  // same point as before, and the destructor covers an exceptional exit from
  // the call in between, so the event can no longer leak.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "divide_double_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);
  phi::MetaTensor meta_out_2(kernel_out_2);

  // Outputs whose kernel tensor is null are passed to InferMeta as nullptr.
  // NOTE(review): `input_grad_x` is supplied twice and `input_out` is not used
  // for meta inference; kept as-is, confirm against the op definition.
  phi::GeneralTernaryGradInferMeta(MakeMetaTensor(*input_y), MakeMetaTensor(*input_grad_x), MakeMetaTensor(*input_grad_x), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr, kernel_out_2 ? &meta_out_2 : nullptr);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const paddle::optional<phi::DenseTensor>&, const paddle::optional<phi::DenseTensor>&, int, phi::DenseTensor*, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  // Same RAII treatment for the event that brackets the kernel launch.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "divide_double_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_y, *input_out, *input_grad_x, input_grad_x_grad, input_grad_y_grad, axis, kernel_out_0, kernel_out_1, kernel_out_2);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU fallback; hand each output to TransDataBackend
    // together with the backend that was originally selected.
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
    TransDataBackend(kernel_out_2, kernel_backend, kernel_out_2);
  }
}

/// Backward API `divide_grad`: selects and runs the phi `divide_grad` kernel.
///
/// Backend, layout and dtype of the kernel key all come from the
/// highest-priority key parsed from the four tensor inputs.
///
/// @param x         Kernel input 0; first input to GeneralBinaryGradInferMeta.
/// @param y         Kernel input 1; second input to GeneralBinaryGradInferMeta.
/// @param out       Kernel input 2.
/// @param out_grad  Kernel input 3.
/// @param axis      Passed through to the kernel unchanged.
/// @param x_grad    Handed to SetKernelOutput; kernel output 0.
/// @param y_grad    Handed to SetKernelOutput; kernel output 1.
PADDLE_API void divide_grad(const Tensor& x, const Tensor& y, const Tensor& out, const Tensor& out_grad, int axis, Tensor* x_grad, Tensor* y_grad) {
  // Nothing pins the kernel key up front for this op, so every component is
  // read from the key parsed out of the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, y, out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "divide_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "divide_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "divide_grad kernel: " << kernel;
  // A CPU-fallback kernel must be given the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_y = PrepareData(y, kernel.InputAt(1), {});
  auto input_out = PrepareData(out, kernel.InputAt(2), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(3), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"y", {input_y->dims()}},
        {"out", {input_out->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("divide_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(y_grad);
  // The profiler event is owned by a unique_ptr: reset() destroys it at the
  // same point as before, and the destructor covers an exceptional exit from
  // the call in between, so the event can no longer leak.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "divide_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  // Outputs whose kernel tensor is null are passed to InferMeta as nullptr.
  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_y), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, int, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  // Same RAII treatment for the event that brackets the kernel launch.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "divide_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_y, *input_out, *input_out_grad, axis, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU fallback; hand each output to TransDataBackend
    // together with the backend that was originally selected.
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

/// Backward API `dropout_grad`: selects and runs the phi `dropout_grad` kernel.
///
/// Backend, layout and dtype of the kernel key all come from the
/// highest-priority key parsed from `mask` and `out_grad`.
///
/// @param mask      Kernel input 0.
/// @param out_grad  Kernel input 1; also the source of the output meta
///                  (through phi::UnchangedInferMeta).
/// @param p         Wrapped in phi::Scalar and passed to the kernel.
/// @param is_test   Passed through to the kernel unchanged.
/// @param mode      Passed through to the kernel unchanged.
/// @param x_grad    Tensor handed to SetKernelOutput; receives the kernel output.
PADDLE_API void dropout_grad(const Tensor& mask, const Tensor& out_grad, const Scalar& p, bool is_test, const std::string& mode, Tensor* x_grad) {
  // Nothing pins the kernel key up front for this op, so every component is
  // read from the key parsed out of the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(mask, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "dropout_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "dropout_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "dropout_grad kernel: " << kernel;
  // A CPU-fallback kernel must be given the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_mask = PrepareData(mask, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"mask", {input_mask->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("dropout_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  // The profiler event is owned by a unique_ptr: reset() destroys it at the
  // same point as before, and the destructor covers an exceptional exit from
  // the call in between, so the event can no longer leak.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "dropout_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_out_grad), &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::Scalar&, bool, const std::string&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  // Same RAII treatment for the event that brackets the kernel launch.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "dropout_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_mask, *input_out_grad, phi::Scalar(p), is_test, mode, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU fallback; hand the output to TransDataBackend
    // together with the backend that was originally selected.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

/// Backward API `eig_grad`: selects and runs the phi `eig_grad` kernel.
///
/// The kernel dtype is taken from `out_v` (ParseDataType). Backend and layout,
/// and the dtype as well if `out_v` yields DataType::UNDEFINED, come from the
/// highest-priority kernel key parsed from the four tensor inputs.
///
/// @param out_w       Kernel input 0 (prepared with transform flag `{true}`).
/// @param out_v       Kernel input 1; decides the kernel dtype and is the
///                    source of the output meta (phi::UnchangedInferMeta).
/// @param out_w_grad  Kernel input 2 (prepared with transform flag `{true}`).
/// @param out_v_grad  Kernel input 3.
/// @param x_grad      Tensor handed to SetKernelOutput; receives the kernel output.
PADDLE_API void eig_grad(const Tensor& out_w, const Tensor& out_v, const Tensor& out_w_grad, const Tensor& out_v_grad, Tensor* x_grad) {
  // dtype is pinned by `out_v` before the generic key lookup.
  DataType kernel_data_type = ParseDataType(out_v);

  // Backend and layout are never preset, so the parsed key is always needed.
  auto kernel_key_set = ParseKernelKeyByInputArgs(out_w, out_v, out_w_grad, out_v_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "eig_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "eig_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "eig_grad kernel: " << kernel;
  // A CPU-fallback kernel must be given the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  // NOTE(review): out_w / out_w_grad use flag `{true}` while the other inputs
  // use `{}`; the flag's meaning is defined by PrepareData, not visible here.
  auto input_out_w = PrepareData(out_w, kernel.InputAt(0), {true});
  auto input_out_v = PrepareData(out_v, kernel.InputAt(1), {});
  auto input_out_w_grad = PrepareData(out_w_grad, kernel.InputAt(2), {true});
  auto input_out_v_grad = PrepareData(out_v_grad, kernel.InputAt(3), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out_w", {input_out_w->dims()}},
        {"out_v", {input_out_v->dims()}},
        {"out_w_grad", {input_out_w_grad->dims()}},
        {"out_v_grad", {input_out_v_grad->dims()}}};
    platform::RecordOpInfoSupplement("eig_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  // The profiler event is owned by a unique_ptr: reset() destroys it at the
  // same point as before, and the destructor covers an exceptional exit from
  // the call in between, so the event can no longer leak.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "eig_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_out_v), &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  // Same RAII treatment for the event that brackets the kernel launch.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "eig_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_out_w, *input_out_v, *input_out_w_grad, *input_out_v_grad, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU fallback; hand the output to TransDataBackend
    // together with the backend that was originally selected.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

/// Backward API `eigh_grad`: selects and runs the phi `eigh_grad` kernel.
///
/// The kernel dtype is taken from `out_v` (ParseDataType). Backend and layout,
/// and the dtype as well if `out_v` yields DataType::UNDEFINED, come from the
/// highest-priority kernel key parsed from the four tensor inputs.
///
/// @param out_w       Kernel input 0 (prepared with transform flag `{true}`).
/// @param out_v       Kernel input 1; decides the kernel dtype and is the
///                    source of the output meta (phi::UnchangedInferMeta).
/// @param out_w_grad  Kernel input 2 (prepared with transform flag `{true}`).
/// @param out_v_grad  Kernel input 3.
/// @param x_grad      Tensor handed to SetKernelOutput; receives the kernel output.
PADDLE_API void eigh_grad(const Tensor& out_w, const Tensor& out_v, const Tensor& out_w_grad, const Tensor& out_v_grad, Tensor* x_grad) {
  // dtype is pinned by `out_v` before the generic key lookup.
  DataType kernel_data_type = ParseDataType(out_v);

  // Backend and layout are never preset, so the parsed key is always needed.
  auto kernel_key_set = ParseKernelKeyByInputArgs(out_w, out_v, out_w_grad, out_v_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "eigh_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "eigh_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "eigh_grad kernel: " << kernel;
  // A CPU-fallback kernel must be given the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  // NOTE(review): out_w / out_w_grad use flag `{true}` while the other inputs
  // use `{}`; the flag's meaning is defined by PrepareData, not visible here.
  auto input_out_w = PrepareData(out_w, kernel.InputAt(0), {true});
  auto input_out_v = PrepareData(out_v, kernel.InputAt(1), {});
  auto input_out_w_grad = PrepareData(out_w_grad, kernel.InputAt(2), {true});
  auto input_out_v_grad = PrepareData(out_v_grad, kernel.InputAt(3), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out_w", {input_out_w->dims()}},
        {"out_v", {input_out_v->dims()}},
        {"out_w_grad", {input_out_w_grad->dims()}},
        {"out_v_grad", {input_out_v_grad->dims()}}};
    platform::RecordOpInfoSupplement("eigh_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  // The profiler event is owned by a unique_ptr: reset() destroys it at the
  // same point as before, and the destructor covers an exceptional exit from
  // the call in between, so the event can no longer leak.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "eigh_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_out_v), &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  // Same RAII treatment for the event that brackets the kernel launch.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "eigh_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_out_w, *input_out_v, *input_out_w_grad, *input_out_v_grad, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU fallback; hand the output to TransDataBackend
    // together with the backend that was originally selected.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

/// Backward API `eigvalsh_grad`: selects and runs the phi `eigvalsh_grad`
/// kernel.
///
/// The kernel dtype is taken from `eigenvectors` (ParseDataType). Backend and
/// layout, and the dtype as well if that yields DataType::UNDEFINED, come from
/// the highest-priority kernel key parsed from the two tensor inputs.
///
/// @param eigenvectors      Kernel input 0; decides the kernel dtype.
/// @param eigenvalues_grad  Kernel input 1 (prepared with transform flag `{true}`).
/// @param uplo              Passed to both InferMeta and the kernel unchanged.
/// @param is_test           Passed to both InferMeta and the kernel unchanged.
/// @param x_grad            Tensor handed to SetKernelOutput; receives the
///                          kernel output.
PADDLE_API void eigvalsh_grad(const Tensor& eigenvectors, const Tensor& eigenvalues_grad, const std::string& uplo, bool is_test, Tensor* x_grad) {
  // dtype is pinned by `eigenvectors` before the generic key lookup.
  DataType kernel_data_type = ParseDataType(eigenvectors);

  // Backend and layout are never preset, so the parsed key is always needed.
  auto kernel_key_set = ParseKernelKeyByInputArgs(eigenvectors, eigenvalues_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "eigvalsh_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "eigvalsh_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "eigvalsh_grad kernel: " << kernel;
  // A CPU-fallback kernel must be given the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_eigenvectors = PrepareData(eigenvectors, kernel.InputAt(0), {});
  // NOTE(review): flag `{true}` differs from the `{}` used for input 0; its
  // meaning is defined by PrepareData, not visible here.
  auto input_eigenvalues_grad = PrepareData(eigenvalues_grad, kernel.InputAt(1), {true});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"eigenvectors", {input_eigenvectors->dims()}},
        {"eigenvalues_grad", {input_eigenvalues_grad->dims()}}};
    platform::RecordOpInfoSupplement("eigvalsh_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  // The profiler event is owned by a unique_ptr: reset() destroys it at the
  // same point as before, and the destructor covers an exceptional exit from
  // the call in between, so the event can no longer leak.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "eigvalsh_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::EigvalshGradInferMeta(MakeMetaTensor(*input_eigenvectors), MakeMetaTensor(*input_eigenvalues_grad), uplo, is_test, &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const std::string&, bool, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  // Same RAII treatment for the event that brackets the kernel launch.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "eigvalsh_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_eigenvectors, *input_eigenvalues_grad, uplo, is_test, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU fallback; hand the output to TransDataBackend
    // together with the backend that was originally selected.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

/// Backward API `einsum_grad`: selects and runs the phi `einsum_grad` kernel.
///
/// Backend, layout and dtype of the kernel key all come from the
/// highest-priority key parsed from the inputs.
///
/// @param x_shape      Tensor list; kernel input 0 and the source of the
///                     output metas (through phi::UnchangedMultiInferMeta).
/// @param inner_cache  Tensor list; kernel input 1.
/// @param out_grad     Kernel input 2.
/// @param equation     Passed through to the kernel unchanged.
/// @param x_grad       Output tensor pointers handed to SetKernelOutput.
PADDLE_API void einsum_grad(const std::vector<Tensor>& x_shape, const std::vector<Tensor>& inner_cache, const Tensor& out_grad, const std::string& equation, std::vector<Tensor*> x_grad) {
  // Nothing pins the kernel key up front for this op, so every component is
  // read from the key parsed out of the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x_shape, inner_cache, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "einsum_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "einsum_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "einsum_grad kernel: " << kernel;
  // A CPU-fallback kernel must be given the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  // The kernel takes vectors of tensor pointers, so each prepared tensor list
  // is mirrored by a pointer vector referring to its elements.
  auto input_x_shape_vec = PrepareData(x_shape, kernel.InputAt(0), {});
  std::vector<const phi::DenseTensor*> input_x_shape(input_x_shape_vec->size());
  for (size_t i = 0; i < input_x_shape.size(); ++i) {
    input_x_shape[i] = &input_x_shape_vec->at(i);
  }
  auto input_inner_cache_vec = PrepareData(inner_cache, kernel.InputAt(1), {});
  std::vector<const phi::DenseTensor*> input_inner_cache(input_inner_cache_vec->size());
  for (size_t i = 0; i < input_inner_cache.size(); ++i) {
    input_inner_cache[i] = &input_inner_cache_vec->at(i);
  }
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out_grad", {input_out_grad->dims()}}};
    // One scratch vector is reused for both tensor lists; emplace_back copies
    // it into `input_shapes` before it is cleared for the next list.
    std::vector<phi::DDim> ddims_vec;
    ddims_vec.reserve(input_x_shape.size());
    for (size_t i = 0; i < input_x_shape.size(); ++i) {
      ddims_vec.emplace_back(input_x_shape[i]->dims());
    }
    input_shapes.emplace_back("x_shape", ddims_vec);
    ddims_vec.clear();
    ddims_vec.reserve(input_inner_cache.size());
    for (size_t i = 0; i < input_inner_cache.size(); ++i) {
      ddims_vec.emplace_back(input_inner_cache[i]->dims());
    }
    input_shapes.emplace_back("inner_cache", ddims_vec);
    platform::RecordOpInfoSupplement("einsum_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(&x_grad);
  // The profiler event is owned by a unique_ptr: reset() destroys it at the
  // same point as before, and the destructor covers an exceptional exit from
  // the code in between, so the event can no longer leak.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "einsum_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }

  auto x_shape_meta_vec = MakeMetaTensor(input_x_shape);
  std::vector<const phi::MetaTensor*> x_shape_metas(x_shape_meta_vec.size());
  for (size_t i = 0; i < x_shape_meta_vec.size(); ++i) {
    x_shape_metas[i] = &x_shape_meta_vec[i];
  }

  auto kernel_out_meta_vec = MakeMetaTensor(kernel_out);
  std::vector<phi::MetaTensor*> kernel_out_metas(kernel_out_meta_vec.size());
  for (size_t i = 0; i < kernel_out_meta_vec.size(); ++i) {
    // Outputs whose kernel tensor is null are passed to InferMeta as nullptr.
    kernel_out_metas[i] = kernel_out[i] ? &kernel_out_meta_vec[i] : nullptr;
  }
  phi::UnchangedMultiInferMeta(x_shape_metas, kernel_out_metas);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const std::vector<const phi::DenseTensor*>&, const std::vector<const phi::DenseTensor*>&, const phi::DenseTensor&, const std::string&, std::vector<phi::DenseTensor*>&);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  // Same RAII treatment for the event that brackets the kernel launch.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "einsum_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, input_x_shape, input_inner_cache, *input_out_grad, equation, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU fallback; hand the outputs to TransDataBackend
    // together with the backend that was originally selected.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

/// Backward API `elementwise_pow_grad`: selects and runs the phi
/// `elementwise_pow_grad` kernel.
///
/// Backend, layout and dtype of the kernel key all come from the
/// highest-priority key parsed from the three tensor inputs.
///
/// @param x         Kernel input 0; first input to GeneralBinaryGradInferMeta.
/// @param y         Kernel input 1; second input to GeneralBinaryGradInferMeta.
/// @param out_grad  Kernel input 2.
/// @param axis      Passed through to the kernel unchanged.
/// @param x_grad    Handed to SetKernelOutput; kernel output 0.
/// @param y_grad    Handed to SetKernelOutput; kernel output 1.
PADDLE_API void elementwise_pow_grad(const Tensor& x, const Tensor& y, const Tensor& out_grad, int axis, Tensor* x_grad, Tensor* y_grad) {
  // Nothing pins the kernel key up front for this op, so every component is
  // read from the key parsed out of the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, y, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "elementwise_pow_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "elementwise_pow_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "elementwise_pow_grad kernel: " << kernel;
  // A CPU-fallback kernel must be given the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_y = PrepareData(y, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"y", {input_y->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("elementwise_pow_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(y_grad);
  // The profiler event is owned by a unique_ptr: reset() destroys it at the
  // same point as before, and the destructor covers an exceptional exit from
  // the call in between, so the event can no longer leak.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "elementwise_pow_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  // Outputs whose kernel tensor is null are passed to InferMeta as nullptr.
  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_y), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, int, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  // Same RAII treatment for the event that brackets the kernel launch.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "elementwise_pow_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_y, *input_out_grad, axis, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU fallback; hand each output to TransDataBackend
    // together with the backend that was originally selected.
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// Selects and runs the phi kernel registered as "elu_double_grad".
//
// The kernel key (backend, layout, dtype) is the highest-priority key parsed
// from x, grad_out and grad_x_grad. Output metadata is filled in by
// phi::GeneralBinaryGradInferMeta, which is given x for both of its inputs.
// The kernel outputs are bound to x_grad and grad_out_grad through
// SetKernelOutput. `alpha` is forwarded to the kernel unchanged.
PADDLE_API void elu_double_grad(const Tensor& x, const Tensor& grad_out, const Tensor& grad_x_grad, float alpha, Tensor* x_grad, Tensor* grad_out_grad) {
  // Nothing pins the kernel key up front, so it comes entirely from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, grad_out, grad_x_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "elu_double_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "elu_double_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "elu_double_grad kernel: " << kernel;
  // If kernel selection fell back to CPU, use the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_grad_out = PrepareData(grad_out, kernel.InputAt(1), {});
  auto input_grad_x_grad = PrepareData(grad_x_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"grad_out", {(*input_grad_out).dims()}},
        {"grad_x_grad", {(*input_grad_x_grad).dims()}}};
    platform::RecordOpInfoSupplement("elu_double_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(grad_out_grad);

  // The profiler event is owned by a unique_ptr so it is destroyed even when
  // InferMeta throws; reset() below ends it at the usual point otherwise.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "elu_double_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  // Both InferMeta inputs are x; a null kernel output is passed as nullptr.
  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_x), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, float, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "elu_double_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_grad_out, *input_grad_x_grad, alpha, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU context; pass the outputs to TransDataBackend
    // together with the originally selected backend.
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// Selects and runs the phi kernel registered as "elu_grad".
//
// The kernel key (backend, layout, dtype) is the highest-priority key parsed
// from x, out and out_grad. Output metadata is copied from x via
// phi::UnchangedInferMeta; the kernel output is bound to x_grad through
// SetKernelOutput. `alpha` is forwarded to the kernel unchanged.
PADDLE_API void elu_grad(const Tensor& x, const Tensor& out, const Tensor& out_grad, float alpha, Tensor* x_grad) {
  // Nothing pins the kernel key up front, so it comes entirely from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "elu_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "elu_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "elu_grad kernel: " << kernel;
  // If kernel selection fell back to CPU, use the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out = PrepareData(out, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out", {(*input_out).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("elu_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler event is owned by a unique_ptr so it is destroyed even when
  // InferMeta throws; reset() below ends it at the usual point otherwise.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "elu_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, float, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "elu_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out, *input_out_grad, alpha, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU context; pass the output to TransDataBackend
    // together with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// API entry point for embedding_grad: every argument is forwarded, unchanged
// and in the same order, to embedding_grad_impl.
PADDLE_API void embedding_grad(const Tensor& x,
                               const Tensor& weight,
                               const Tensor& out_grad,
                               int64_t padding_idx,
                               bool sparse,
                               Tensor* weight_grad) {
  embedding_grad_impl(
      x, weight, out_grad, padding_idx, sparse, weight_grad);
}
// Selects and runs the phi kernel registered as "exp_grad".
//
// The kernel key (backend, layout, dtype) is the highest-priority key parsed
// from out and out_grad. Output metadata is copied from out via
// phi::UnchangedInferMeta; the kernel output is bound to x_grad through
// SetKernelOutput.
PADDLE_API void exp_grad(const Tensor& out, const Tensor& out_grad, Tensor* x_grad) {
  // Nothing pins the kernel key up front, so it comes entirely from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "exp_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "exp_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "exp_grad kernel: " << kernel;
  // If kernel selection fell back to CPU, use the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out = PrepareData(out, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out", {(*input_out).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("exp_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler event is owned by a unique_ptr so it is destroyed even when
  // InferMeta throws; reset() below ends it at the usual point otherwise.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "exp_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_out), &meta_out);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "exp_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_out, *input_out_grad, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU context; pass the output to TransDataBackend
    // together with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Selects and runs the phi kernel registered as "expand_as_grad".
//
// The kernel key (backend, layout, dtype) is the highest-priority key parsed
// from x and out_grad. Output metadata is copied from x via
// phi::UnchangedInferMeta; the kernel output is bound to x_grad through
// SetKernelOutput. `target_shape` is forwarded to the kernel unchanged.
PADDLE_API void expand_as_grad(const Tensor& x, const Tensor& out_grad, const std::vector<int>& target_shape, Tensor* x_grad) {
  // Nothing pins the kernel key up front, so it comes entirely from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "expand_as_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "expand_as_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "expand_as_grad kernel: " << kernel;
  // If kernel selection fell back to CPU, use the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("expand_as_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler event is owned by a unique_ptr so it is destroyed even when
  // InferMeta throws; reset() below ends it at the usual point otherwise.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "expand_as_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const std::vector<int>&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "expand_as_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, target_shape, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU context; pass the output to TransDataBackend
    // together with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Selects and runs the phi kernel registered as "expand_grad".
//
// The kernel key (backend, layout, dtype) is the highest-priority key parsed
// from x and out_grad. Output metadata is copied from x via
// phi::UnchangedInferMeta; the kernel output is bound to x_grad through
// SetKernelOutput. `shape` is converted to a phi::IntArray for the kernel.
PADDLE_API void expand_grad(const Tensor& x, const Tensor& out_grad, const IntArray& shape, Tensor* x_grad) {
  // Nothing pins the kernel key up front, so it comes entirely from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "expand_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "expand_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "expand_grad kernel: " << kernel;
  // If kernel selection fell back to CPU, use the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("expand_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler event is owned by a unique_ptr so it is destroyed even when
  // InferMeta throws; reset() below ends it at the usual point otherwise.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "expand_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::IntArray&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "expand_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, phi::IntArray(shape), kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU context; pass the output to TransDataBackend
    // together with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Selects and runs the phi kernel registered as "expm1_grad".
//
// The kernel key (backend, layout, dtype) is the highest-priority key parsed
// from out and out_grad. Output metadata is copied from out via
// phi::UnchangedInferMeta; the kernel output is bound to x_grad through
// SetKernelOutput.
PADDLE_API void expm1_grad(const Tensor& out, const Tensor& out_grad, Tensor* x_grad) {
  // Nothing pins the kernel key up front, so it comes entirely from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "expm1_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "expm1_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "expm1_grad kernel: " << kernel;
  // If kernel selection fell back to CPU, use the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out = PrepareData(out, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out", {(*input_out).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("expm1_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler event is owned by a unique_ptr so it is destroyed even when
  // InferMeta throws; reset() below ends it at the usual point otherwise.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "expm1_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_out), &meta_out);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "expm1_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_out, *input_out_grad, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU context; pass the output to TransDataBackend
    // together with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Selects and runs the phi kernel registered as "fill_diagonal_grad".
//
// The kernel key (backend, layout, dtype) is the highest-priority key parsed
// from out_grad. Output metadata is computed by
// phi::FillDiagonalGradInferMeta; the kernel output is bound to x_grad
// through SetKernelOutput. `value`, `offset` and `wrap` are forwarded
// unchanged to both InferMeta and the kernel.
PADDLE_API void fill_diagonal_grad(const Tensor& out_grad, float value, int offset, bool wrap, Tensor* x_grad) {
  // Nothing pins the kernel key up front, so it comes entirely from the input.
  auto kernel_key_set = ParseKernelKeyByInputArgs(out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "fill_diagonal_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "fill_diagonal_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "fill_diagonal_grad kernel: " << kernel;
  // If kernel selection fell back to CPU, use the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(0), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("fill_diagonal_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler event is owned by a unique_ptr so it is destroyed even when
  // InferMeta throws; reset() below ends it at the usual point otherwise.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "fill_diagonal_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::FillDiagonalGradInferMeta(MakeMetaTensor(*input_out_grad), value, offset, wrap, &meta_out);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, float, int, bool, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "fill_diagonal_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_out_grad, value, offset, wrap, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU context; pass the output to TransDataBackend
    // together with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Selects and runs the phi kernel registered as "fill_diagonal_tensor_grad".
//
// The kernel key (backend, layout, dtype) is the highest-priority key parsed
// from out_grad. Output metadata is computed by
// phi::FillDiagonalTensorGradInferMeta; the kernel output is bound to x_grad
// through SetKernelOutput. `offset`, `dim1` and `dim2` are forwarded
// unchanged to both InferMeta and the kernel.
PADDLE_API void fill_diagonal_tensor_grad(const Tensor& out_grad, int64_t offset, int dim1, int dim2, Tensor* x_grad) {
  // Nothing pins the kernel key up front, so it comes entirely from the input.
  auto kernel_key_set = ParseKernelKeyByInputArgs(out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "fill_diagonal_tensor_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "fill_diagonal_tensor_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "fill_diagonal_tensor_grad kernel: " << kernel;
  // If kernel selection fell back to CPU, use the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(0), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("fill_diagonal_tensor_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler event is owned by a unique_ptr so it is destroyed even when
  // InferMeta throws; reset() below ends it at the usual point otherwise.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "fill_diagonal_tensor_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::FillDiagonalTensorGradInferMeta(MakeMetaTensor(*input_out_grad), offset, dim1, dim2, &meta_out);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, int64_t, int, int, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "fill_diagonal_tensor_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_out_grad, offset, dim1, dim2, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU context; pass the output to TransDataBackend
    // together with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Selects and runs the phi kernel registered as "fill_grad".
//
// The kernel key (backend, layout, dtype) is the highest-priority key parsed
// from out_grad. Output metadata is copied from out_grad via
// phi::UnchangedInferMeta; the kernel output is bound to x_grad through
// SetKernelOutput. `value` is wrapped in a phi::Scalar for the kernel.
PADDLE_API void fill_grad(const Tensor& out_grad, const Scalar& value, Tensor* x_grad) {
  // Nothing pins the kernel key up front, so it comes entirely from the input.
  auto kernel_key_set = ParseKernelKeyByInputArgs(out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "fill_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "fill_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "fill_grad kernel: " << kernel;
  // If kernel selection fell back to CPU, use the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(0), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("fill_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler event is owned by a unique_ptr so it is destroyed even when
  // InferMeta throws; reset() below ends it at the usual point otherwise.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "fill_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_out_grad), &meta_out);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::Scalar&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "fill_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_out_grad, phi::Scalar(value), kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU context; pass the output to TransDataBackend
    // together with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Selects and runs the phi kernel registered as "flatten_grad".
//
// Backend, layout and dtype are first parsed from out_grad alone; any of the
// three that is still UNDEFINED afterwards is taken from the highest-priority
// key parsed from xshape and out_grad. Output metadata is computed by
// phi::KernelWithXShapeInferMeta from xshape; the kernel output is bound to
// x_grad through SetKernelOutput.
PADDLE_API void flatten_grad(const Tensor& xshape, const Tensor& out_grad, Tensor* x_grad) {
  Backend kernel_backend = ParseBackend(out_grad);
  DataLayout kernel_layout = ParseLayout(out_grad);
  DataType kernel_data_type = ParseDataType(out_grad);

  // Fill in only the key components that out_grad could not determine.
  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(xshape, out_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "flatten_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "flatten_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "flatten_grad kernel: " << kernel;
  // If kernel selection fell back to CPU, use the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_xshape = PrepareData(xshape, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"xshape", {(*input_xshape).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("flatten_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler event is owned by a unique_ptr so it is destroyed even when
  // InferMeta throws; reset() below ends it at the usual point otherwise.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "flatten_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::KernelWithXShapeInferMeta(MakeMetaTensor(*input_xshape), &meta_out);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "flatten_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_xshape, *input_out_grad, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU context; pass the output to TransDataBackend
    // together with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// C++ API entry point that dispatches the `floor_grad` kernel.
//
// Parameters:
//   out_grad - input tensor handed to the kernel; it is also the only tensor
//              used to derive the kernel key.
//   x_grad   - output tensor. Its meta is filled by UnchangedInferMeta from
//              `out_grad`, its contents by the kernel.
//
// Kernel lookup goes through SelectKernelOrThrowError.
PADDLE_API void floor_grad(const Tensor& out_grad, Tensor* x_grad) {
  // No key component is preset for this op, so backend, layout and dtype are
  // all taken from the highest-priority key of the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "floor_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "floor_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "floor_grad kernel: " << kernel;
  // A CPU-fallback kernel must be run with the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(0), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("floor_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Profiler events are owned by unique_ptr so that they are destroyed even
  // when InferMeta or the kernel exits via an exception.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "floor_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_out_grad), &meta_out);

  // Destroy the infer_meta event before the compute phase starts.
  infer_shape_record_event.reset();

  using kernel_signature = void (*)(const platform::DeviceContext&,
                                    const phi::DenseTensor&,
                                    phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "floor_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_out_grad, kernel_out);
  kernel_record_event.reset();

  // After a CPU fallback, pass the result through TransDataBackend with the
  // originally selected backend as the target.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// C++ API entry point that dispatches the `fmax_grad` kernel.
//
// Parameters:
//   x, y     - input tensors forwarded to the kernel.
//   out_grad - input tensor forwarded to the kernel.
//   axis     - integer attribute forwarded unchanged to the kernel.
//   x_grad   - first output tensor; meta inferred from `x`.
//   y_grad   - second output tensor; meta inferred from `y`.
//
// The kernel key is derived from x, y and out_grad. Kernel lookup goes
// through SelectKernelOrThrowError.
PADDLE_API void fmax_grad(const Tensor& x, const Tensor& y, const Tensor& out_grad, int axis, Tensor* x_grad, Tensor* y_grad) {
  // No key component is preset for this op, so backend, layout and dtype are
  // all taken from the highest-priority key of the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, y, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "fmax_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "fmax_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "fmax_grad kernel: " << kernel;
  // A CPU-fallback kernel must be run with the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_y = PrepareData(y, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"y", {(*input_y).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("fmax_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(y_grad);

  // Profiler events are owned by unique_ptr so that they are destroyed even
  // when InferMeta or the kernel exits via an exception.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "fmax_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  // An output whose kernel tensor is null is passed to InferMeta as nullptr.
  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_x),
                                  MakeMetaTensor(*input_y),
                                  kernel_out_0 ? &meta_out_0 : nullptr,
                                  kernel_out_1 ? &meta_out_1 : nullptr);

  // Destroy the infer_meta event before the compute phase starts.
  infer_shape_record_event.reset();

  using kernel_signature = void (*)(const platform::DeviceContext&,
                                    const phi::DenseTensor&,
                                    const phi::DenseTensor&,
                                    const phi::DenseTensor&,
                                    int,
                                    phi::DenseTensor*,
                                    phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "fmax_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_y, *input_out_grad, axis, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  // After a CPU fallback, pass both results through TransDataBackend with the
  // originally selected backend as the target.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// C++ API entry point that dispatches the `fmin_grad` kernel.
//
// Parameters:
//   x, y     - input tensors forwarded to the kernel.
//   out_grad - input tensor forwarded to the kernel.
//   axis     - integer attribute forwarded unchanged to the kernel.
//   x_grad   - first output tensor; meta inferred from `x`.
//   y_grad   - second output tensor; meta inferred from `y`.
//
// The kernel key is derived from x, y and out_grad. Kernel lookup goes
// through SelectKernelOrThrowError.
PADDLE_API void fmin_grad(const Tensor& x, const Tensor& y, const Tensor& out_grad, int axis, Tensor* x_grad, Tensor* y_grad) {
  // No key component is preset for this op, so backend, layout and dtype are
  // all taken from the highest-priority key of the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, y, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "fmin_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "fmin_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "fmin_grad kernel: " << kernel;
  // A CPU-fallback kernel must be run with the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_y = PrepareData(y, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"y", {(*input_y).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("fmin_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(y_grad);

  // Profiler events are owned by unique_ptr so that they are destroyed even
  // when InferMeta or the kernel exits via an exception.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "fmin_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  // An output whose kernel tensor is null is passed to InferMeta as nullptr.
  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_x),
                                  MakeMetaTensor(*input_y),
                                  kernel_out_0 ? &meta_out_0 : nullptr,
                                  kernel_out_1 ? &meta_out_1 : nullptr);

  // Destroy the infer_meta event before the compute phase starts.
  infer_shape_record_event.reset();

  using kernel_signature = void (*)(const platform::DeviceContext&,
                                    const phi::DenseTensor&,
                                    const phi::DenseTensor&,
                                    const phi::DenseTensor&,
                                    int,
                                    phi::DenseTensor*,
                                    phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "fmin_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_y, *input_out_grad, axis, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  // After a CPU fallback, pass both results through TransDataBackend with the
  // originally selected backend as the target.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// C++ API entry point that dispatches the `frame_grad` kernel.
//
// Parameters:
//   x            - input tensor forwarded to the kernel; also the source of
//                  the output meta (via UnchangedInferMeta).
//   out_grad     - input tensor forwarded to the kernel.
//   frame_length - integer attribute forwarded unchanged to the kernel.
//   hop_length   - integer attribute forwarded unchanged to the kernel.
//   axis         - integer attribute forwarded unchanged to the kernel.
//   x_grad       - output tensor.
//
// The kernel key is derived from x and out_grad. Kernel lookup goes through
// SelectKernelOrThrowError.
PADDLE_API void frame_grad(const Tensor& x, const Tensor& out_grad, int frame_length, int hop_length, int axis, Tensor* x_grad) {
  // No key component is preset for this op, so backend, layout and dtype are
  // all taken from the highest-priority key of the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "frame_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "frame_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "frame_grad kernel: " << kernel;
  // A CPU-fallback kernel must be run with the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("frame_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Profiler events are owned by unique_ptr so that they are destroyed even
  // when InferMeta or the kernel exits via an exception.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "frame_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the infer_meta event before the compute phase starts.
  infer_shape_record_event.reset();

  using kernel_signature = void (*)(const platform::DeviceContext&,
                                    const phi::DenseTensor&,
                                    const phi::DenseTensor&,
                                    int,
                                    int,
                                    int,
                                    phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "frame_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, frame_length, hop_length, axis, kernel_out);
  kernel_record_event.reset();

  // After a CPU fallback, pass the result through TransDataBackend with the
  // originally selected backend as the target.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// C++ API entry point that dispatches the `frobenius_norm_grad` kernel.
//
// Parameters:
//   x          - input tensor forwarded to the kernel; also the source of the
//                output meta (via UnchangedInferMeta).
//   out        - input tensor forwarded to the kernel.
//   out_grad   - input tensor forwarded to the kernel.
//   axis       - attribute forwarded unchanged to the kernel.
//   keep_dim   - attribute forwarded unchanged to the kernel.
//   reduce_all - attribute forwarded unchanged to the kernel.
//   x_grad     - output tensor.
//
// The kernel key is derived from x, out and out_grad. Kernel lookup goes
// through SelectKernelOrThrowError.
PADDLE_API void frobenius_norm_grad(const Tensor& x, const Tensor& out, const Tensor& out_grad, const std::vector<int64_t>& axis, bool keep_dim, bool reduce_all, Tensor* x_grad) {
  // No key component is preset for this op, so backend, layout and dtype are
  // all taken from the highest-priority key of the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "frobenius_norm_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "frobenius_norm_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "frobenius_norm_grad kernel: " << kernel;
  // A CPU-fallback kernel must be run with the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out = PrepareData(out, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out", {(*input_out).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("frobenius_norm_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Profiler events are owned by unique_ptr so that they are destroyed even
  // when InferMeta or the kernel exits via an exception.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "frobenius_norm_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the infer_meta event before the compute phase starts.
  infer_shape_record_event.reset();

  using kernel_signature = void (*)(const platform::DeviceContext&,
                                    const phi::DenseTensor&,
                                    const phi::DenseTensor&,
                                    const phi::DenseTensor&,
                                    const std::vector<int64_t>&,
                                    bool,
                                    bool,
                                    phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "frobenius_norm_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out, *input_out_grad, axis, keep_dim, reduce_all, kernel_out);
  kernel_record_event.reset();

  // After a CPU fallback, pass the result through TransDataBackend with the
  // originally selected backend as the target.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// C++ API entry point that dispatches the `gather_grad` kernel.
//
// Parameters:
//   x         - input tensor forwarded to the kernel; supplies the kernel data
//               type (via ParseDataType) and the output meta (via
//               UnchangedInferMeta).
//   index     - input tensor forwarded to the kernel.
//   out_grad  - input tensor forwarded to the kernel.
//   axis      - scalar attribute; passed to the kernel as a phi::Scalar.
//   overwrite - attribute forwarded unchanged to the kernel.
//   x_grad    - output tensor.
//
// Backend and layout come from the key derived from x, index and out_grad.
// Kernel lookup goes through SelectKernelOrThrowError.
PADDLE_API void gather_grad(const Tensor& x, const Tensor& index, const Tensor& out_grad, const Scalar& axis, bool overwrite, Tensor* x_grad) {
  // The data type is pinned to that of `x`; the input-derived key only
  // supplies it when ParseDataType yields UNDEFINED.
  DataType kernel_data_type = ParseDataType(x);

  auto kernel_key_set = ParseKernelKeyByInputArgs(x, index, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "gather_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "gather_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "gather_grad kernel: " << kernel;
  // A CPU-fallback kernel must be run with the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_index = PrepareData(index, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"index", {(*input_index).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("gather_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Profiler events are owned by unique_ptr so that they are destroyed even
  // when InferMeta or the kernel exits via an exception.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "gather_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the infer_meta event before the compute phase starts.
  infer_shape_record_event.reset();

  using kernel_signature = void (*)(const platform::DeviceContext&,
                                    const phi::DenseTensor&,
                                    const phi::DenseTensor&,
                                    const phi::DenseTensor&,
                                    const phi::Scalar&,
                                    bool,
                                    phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "gather_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_index, *input_out_grad, phi::Scalar(axis), overwrite, kernel_out);
  kernel_record_event.reset();

  // After a CPU fallback, pass the result through TransDataBackend with the
  // originally selected backend as the target.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// C++ API entry point that dispatches the `gather_nd_grad` kernel.
//
// Parameters:
//   x        - input tensor forwarded to the kernel; also the source of the
//              output meta (via UnchangedInferMeta).
//   index    - input tensor forwarded to the kernel.
//   out_grad - input tensor forwarded to the kernel.
//   x_grad   - output tensor.
//
// The kernel key is derived from x, index and out_grad. Kernel lookup goes
// through SelectKernelOrThrowError.
PADDLE_API void gather_nd_grad(const Tensor& x, const Tensor& index, const Tensor& out_grad, Tensor* x_grad) {
  // No key component is preset for this op, so backend, layout and dtype are
  // all taken from the highest-priority key of the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, index, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "gather_nd_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "gather_nd_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "gather_nd_grad kernel: " << kernel;
  // A CPU-fallback kernel must be run with the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_index = PrepareData(index, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"index", {(*input_index).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("gather_nd_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Profiler events are owned by unique_ptr so that they are destroyed even
  // when InferMeta or the kernel exits via an exception.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "gather_nd_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the infer_meta event before the compute phase starts.
  infer_shape_record_event.reset();

  using kernel_signature = void (*)(const platform::DeviceContext&,
                                    const phi::DenseTensor&,
                                    const phi::DenseTensor&,
                                    const phi::DenseTensor&,
                                    phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "gather_nd_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_index, *input_out_grad, kernel_out);
  kernel_record_event.reset();

  // After a CPU fallback, pass the result through TransDataBackend with the
  // originally selected backend as the target.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// C++ API entry point that dispatches the `gelu_grad` kernel.
//
// Parameters:
//   x           - input tensor forwarded to the kernel; also the source of
//                 the output meta (via UnchangedInferMeta).
//   out_grad    - input tensor forwarded to the kernel.
//   approximate - boolean attribute forwarded unchanged to the kernel.
//   x_grad      - output tensor.
//
// The kernel key is derived from x and out_grad. Kernel lookup goes through
// SelectKernelOrThrowError.
PADDLE_API void gelu_grad(const Tensor& x, const Tensor& out_grad, bool approximate, Tensor* x_grad) {
  // No key component is preset for this op, so backend, layout and dtype are
  // all taken from the highest-priority key of the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "gelu_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "gelu_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "gelu_grad kernel: " << kernel;
  // A CPU-fallback kernel must be run with the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("gelu_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Profiler events are owned by unique_ptr so that they are destroyed even
  // when InferMeta or the kernel exits via an exception.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "gelu_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the infer_meta event before the compute phase starts.
  infer_shape_record_event.reset();

  using kernel_signature = void (*)(const platform::DeviceContext&,
                                    const phi::DenseTensor&,
                                    const phi::DenseTensor&,
                                    bool,
                                    phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "gelu_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, approximate, kernel_out);
  kernel_record_event.reset();

  // After a CPU fallback, pass the result through TransDataBackend with the
  // originally selected backend as the target.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// C++ API entry point that dispatches the `graph_send_recv_grad` kernel.
//
// Parameters:
//   x         - input tensor forwarded to the kernel; also the source of the
//               output meta (via GeneralUnaryGradInferMeta).
//   src_index - input tensor forwarded to the kernel.
//   dst_index - input tensor forwarded to the kernel.
//   out       - optional input tensor; forwarded as an optional.
//   dst_count - optional input tensor; forwarded as an optional.
//   out_grad  - input tensor forwarded to the kernel; supplies the kernel
//               data type (via ParseDataType).
//   reduce_op - string attribute forwarded unchanged to the kernel.
//   x_grad    - output tensor.
//
// Backend and layout come from the key derived from all tensor inputs.
// Kernel lookup goes through SelectKernelOrThrowError.
PADDLE_API void graph_send_recv_grad(const Tensor& x, const Tensor& src_index, const Tensor& dst_index, const paddle::optional<Tensor>& out, const paddle::optional<Tensor>& dst_count, const Tensor& out_grad, const std::string& reduce_op, Tensor* x_grad) {
  // The data type is pinned to that of `out_grad`; the input-derived key only
  // supplies it when ParseDataType yields UNDEFINED.
  DataType kernel_data_type = ParseDataType(out_grad);

  auto kernel_key_set = ParseKernelKeyByInputArgs(x, src_index, dst_index, out, dst_count, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "graph_send_recv_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "graph_send_recv_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "graph_send_recv_grad kernel: " << kernel;
  // A CPU-fallback kernel must be run with the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_src_index = PrepareData(src_index, kernel.InputAt(1), {});
  auto input_dst_index = PrepareData(dst_index, kernel.InputAt(2), {});
  auto input_out = PrepareData(out, kernel.InputAt(3), {});
  auto input_dst_count = PrepareData(dst_count, kernel.InputAt(4), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(5), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Optional inputs contribute a shape only when they are present.
    std::vector<phi::DDim> out_record_shapes;
    if (input_out) {
      out_record_shapes.push_back((*input_out).dims());
    }
    std::vector<phi::DDim> dst_count_record_shapes;
    if (input_dst_count) {
      dst_count_record_shapes.push_back((*input_dst_count).dims());
    }
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"src_index", {(*input_src_index).dims()}},
        {"dst_index", {(*input_dst_index).dims()}},
        {"out", out_record_shapes},
        {"dst_count", dst_count_record_shapes},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("graph_send_recv_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Profiler events are owned by unique_ptr so that they are destroyed even
  // when InferMeta or the kernel exits via an exception.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "graph_send_recv_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::GeneralUnaryGradInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the infer_meta event before the compute phase starts.
  infer_shape_record_event.reset();

  using kernel_signature = void (*)(const platform::DeviceContext&,
                                    const phi::DenseTensor&,
                                    const phi::DenseTensor&,
                                    const phi::DenseTensor&,
                                    const paddle::optional<phi::DenseTensor>&,
                                    const paddle::optional<phi::DenseTensor>&,
                                    const phi::DenseTensor&,
                                    const std::string&,
                                    phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "graph_send_recv_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_src_index, *input_dst_index, input_out, input_dst_count, *input_out_grad, reduce_op, kernel_out);
  kernel_record_event.reset();

  // After a CPU fallback, pass the result through TransDataBackend with the
  // originally selected backend as the target.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API for graph_send_ue_recv. Selects the "graph_send_ue_recv_grad"
// phi kernel from the input tensors, runs GeneralBinaryGradInferMeta on the
// outputs, then invokes the kernel to fill `x_grad` and `y_grad`.
//
// `out` and `dst_count` are optional and are forwarded to the kernel as
// optionals. InferMeta receives nullptr for any output whose kernel tensor
// (as returned by SetKernelOutput) is null.
PADDLE_API void graph_send_ue_recv_grad(const Tensor& x, const Tensor& y, const Tensor& src_index, const Tensor& dst_index, const paddle::optional<Tensor>& out, const paddle::optional<Tensor>& dst_count, const Tensor& out_grad, const std::string& message_op, const std::string& reduce_op, Tensor* x_grad, Tensor* y_grad) {
  // The data type is taken from out_grad first; backend and layout (plus the
  // data type, if it is still UNDEFINED) come from the highest-priority kernel
  // key parsed from all inputs.
  DataType kernel_data_type = ParseDataType(out_grad);
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, y, src_index, dst_index, out, dst_count, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "graph_send_ue_recv_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "graph_send_ue_recv_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "graph_send_ue_recv_grad kernel: " << kernel;
  // A kernel selected as CPU fallback runs on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_y = PrepareData(y, kernel.InputAt(1), {});
  auto input_src_index = PrepareData(src_index, kernel.InputAt(2), {});
  auto input_dst_index = PrepareData(dst_index, kernel.InputAt(3), {});
  auto input_out = PrepareData(out, kernel.InputAt(4), {});
  auto input_dst_count = PrepareData(dst_count, kernel.InputAt(5), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(6), {});

  // Report input shapes to the profiler; optional inputs contribute an empty
  // shape list when they are absent.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<phi::DDim> out_record_shapes;
    if (input_out) {
      out_record_shapes.push_back((*input_out).dims());
    }
    std::vector<phi::DDim> dst_count_record_shapes;
    if (input_dst_count) {
      dst_count_record_shapes.push_back((*input_dst_count).dims());
    }
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"y", {input_y->dims()}},
        {"src_index", {input_src_index->dims()}},
        {"dst_index", {input_dst_index->dims()}},
        {"out", out_record_shapes},
        {"dst_count", dst_count_record_shapes},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("graph_send_ue_recv_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(y_grad);

  // The profiling events are owned by unique_ptr so they are destroyed even
  // if InferMeta or the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("graph_send_ue_recv_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_y), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);

  // Destroy the event as soon as InferMeta returns so it covers only that call.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const paddle::optional<phi::DenseTensor>&, const paddle::optional<phi::DenseTensor>&, const phi::DenseTensor&, const std::string&, const std::string&, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("graph_send_ue_recv_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_y, *input_src_index, *input_dst_index, input_out, input_dst_count, *input_out_grad, message_op, reduce_op, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  // After a CPU-fallback run, hand each output to TransDataBackend together
  // with the backend that was originally selected.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// Backward API for grid_sample. Selects the "grid_sample_grad" phi kernel
// from the input tensors, runs GeneralBinaryGradInferMeta on the outputs,
// then invokes the kernel to fill `x_grad` and `grid_grad`.
//
// `mode`, `padding_mode` and `align_corners` are forwarded to the kernel
// unchanged. InferMeta receives nullptr for any output whose kernel tensor
// (as returned by SetKernelOutput) is null.
PADDLE_API void grid_sample_grad(const Tensor& x, const Tensor& grid, const Tensor& out_grad, const std::string& mode, const std::string& padding_mode, bool align_corners, Tensor* x_grad, Tensor* grid_grad) {
  // The data type is taken from x first; backend and layout (plus the data
  // type, if it is still UNDEFINED) come from the highest-priority kernel key
  // parsed from all inputs.
  DataType kernel_data_type = ParseDataType(x);
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, grid, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "grid_sample_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "grid_sample_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "grid_sample_grad kernel: " << kernel;
  // A kernel selected as CPU fallback runs on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_grid = PrepareData(grid, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});

  // Report input shapes to the profiler when op-info recording is enabled.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"grid", {input_grid->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("grid_sample_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(grid_grad);

  // The profiling events are owned by unique_ptr so they are destroyed even
  // if InferMeta or the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("grid_sample_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_grid), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);

  // Destroy the event as soon as InferMeta returns so it covers only that call.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const std::string&, const std::string&, bool, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("grid_sample_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_grid, *input_out_grad, mode, padding_mode, align_corners, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  // After a CPU-fallback run, hand each output to TransDataBackend together
  // with the backend that was originally selected.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// Backward API for group_norm. Selects the "group_norm_grad" phi kernel from
// the input tensors, runs GeneralTernaryGradInferMeta on the outputs, then
// invokes the kernel to fill `x_grad`, `scale_grad` and `bias_grad`.
//
// `scale` and `bias` are optional and are forwarded to the kernel as
// optionals. InferMeta is driven by y, scale and bias, and receives nullptr
// for any output whose kernel tensor (as returned by SetKernelOutput) is null.
PADDLE_API void group_norm_grad(const Tensor& x, const paddle::optional<Tensor>& scale, const paddle::optional<Tensor>& bias, const Tensor& y, const Tensor& mean, const Tensor& variance, const Tensor& y_grad, float epsilon, int groups, const std::string& data_layout, Tensor* x_grad, Tensor* scale_grad, Tensor* bias_grad) {
  // The data type is taken from y_grad first; backend and layout (plus the
  // data type, if it is still UNDEFINED) come from the highest-priority kernel
  // key parsed from all inputs.
  DataType kernel_data_type = ParseDataType(y_grad);
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, scale, bias, y, mean, variance, y_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "group_norm_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "group_norm_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "group_norm_grad kernel: " << kernel;
  // A kernel selected as CPU fallback runs on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_scale = PrepareData(scale, kernel.InputAt(1), {});
  auto input_bias = PrepareData(bias, kernel.InputAt(2), {});
  auto input_y = PrepareData(y, kernel.InputAt(3), {});
  auto input_mean = PrepareData(mean, kernel.InputAt(4), {});
  auto input_variance = PrepareData(variance, kernel.InputAt(5), {});
  auto input_y_grad = PrepareData(y_grad, kernel.InputAt(6), {});

  // Report input shapes to the profiler; optional inputs contribute an empty
  // shape list when they are absent.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<phi::DDim> scale_record_shapes;
    if (input_scale) {
      scale_record_shapes.push_back((*input_scale).dims());
    }
    std::vector<phi::DDim> bias_record_shapes;
    if (input_bias) {
      bias_record_shapes.push_back((*input_bias).dims());
    }
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"scale", scale_record_shapes},
        {"bias", bias_record_shapes},
        {"y", {input_y->dims()}},
        {"mean", {input_mean->dims()}},
        {"variance", {input_variance->dims()}},
        {"y_grad", {input_y_grad->dims()}}};
    platform::RecordOpInfoSupplement("group_norm_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(scale_grad);
  auto kernel_out_2 = SetKernelOutput(bias_grad);

  // The profiling events are owned by unique_ptr so they are destroyed even
  // if InferMeta or the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("group_norm_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);
  phi::MetaTensor meta_out_2(kernel_out_2);

  phi::GeneralTernaryGradInferMeta(MakeMetaTensor(*input_y), MakeMetaTensor(input_scale), MakeMetaTensor(input_bias), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr, kernel_out_2 ? &meta_out_2 : nullptr);

  // Destroy the event as soon as InferMeta returns so it covers only that call.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const paddle::optional<phi::DenseTensor>&, const paddle::optional<phi::DenseTensor>&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, float, int, const std::string&, phi::DenseTensor*, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("group_norm_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, input_scale, input_bias, *input_y, *input_mean, *input_variance, *input_y_grad, epsilon, groups, data_layout, kernel_out_0, kernel_out_1, kernel_out_2);
  kernel_record_event.reset();

  // After a CPU-fallback run, hand each output to TransDataBackend together
  // with the backend that was originally selected.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
    TransDataBackend(kernel_out_2, kernel_backend, kernel_out_2);
  }
}

// Backward API for gumbel_softmax. Selects the "gumbel_softmax_grad" phi
// kernel from the input tensors, runs GumbelSoftmaxGradInferMeta on the
// output, then invokes the kernel to fill `x_grad`.
//
// `axis` is forwarded unchanged to both InferMeta and the kernel.
PADDLE_API void gumbel_softmax_grad(const Tensor& out, const Tensor& out_grad, int axis, Tensor* x_grad) {
  // Backend, layout and data type all come from the highest-priority kernel
  // key parsed from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "gumbel_softmax_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "gumbel_softmax_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "gumbel_softmax_grad kernel: " << kernel;
  // A kernel selected as CPU fallback runs on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out = PrepareData(out, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});

  // Report input shapes to the profiler when op-info recording is enabled.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out", {input_out->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("gumbel_softmax_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiling events are owned by unique_ptr so they are destroyed even
  // if InferMeta or the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("gumbel_softmax_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::GumbelSoftmaxGradInferMeta(MakeMetaTensor(*input_out), MakeMetaTensor(*input_out_grad), axis, &meta_out);

  // Destroy the event as soon as InferMeta returns so it covers only that call.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, int, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("gumbel_softmax_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_out, *input_out_grad, axis, kernel_out);
  kernel_record_event.reset();

  // After a CPU-fallback run, hand the output to TransDataBackend together
  // with the backend that was originally selected.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API for hard_shrink. Selects the "hard_shrink_grad" phi kernel
// from the input tensors, runs UnchangedInferMeta from `x` onto the output,
// then invokes the kernel to fill `x_grad`.
//
// `threshold` is forwarded to the kernel unchanged.
PADDLE_API void hard_shrink_grad(const Tensor& x, const Tensor& out_grad, float threshold, Tensor* x_grad) {
  // Backend, layout and data type all come from the highest-priority kernel
  // key parsed from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "hard_shrink_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "hard_shrink_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "hard_shrink_grad kernel: " << kernel;
  // A kernel selected as CPU fallback runs on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});

  // Report input shapes to the profiler when op-info recording is enabled.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("hard_shrink_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiling events are owned by unique_ptr so they are destroyed even
  // if InferMeta or the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("hard_shrink_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the event as soon as InferMeta returns so it covers only that call.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, float, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("hard_shrink_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, threshold, kernel_out);
  kernel_record_event.reset();

  // After a CPU-fallback run, hand the output to TransDataBackend together
  // with the backend that was originally selected.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API for hard_sigmoid. Selects the "hard_sigmoid_grad" phi kernel
// from the input tensors, runs UnchangedInferMeta from `out` onto the output,
// then invokes the kernel to fill `x_grad`.
//
// `slope` and `offset` are forwarded to the kernel unchanged.
PADDLE_API void hard_sigmoid_grad(const Tensor& out, const Tensor& out_grad, float slope, float offset, Tensor* x_grad) {
  // Backend, layout and data type all come from the highest-priority kernel
  // key parsed from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "hard_sigmoid_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "hard_sigmoid_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "hard_sigmoid_grad kernel: " << kernel;
  // A kernel selected as CPU fallback runs on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out = PrepareData(out, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});

  // Report input shapes to the profiler when op-info recording is enabled.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out", {input_out->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("hard_sigmoid_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiling events are owned by unique_ptr so they are destroyed even
  // if InferMeta or the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("hard_sigmoid_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_out), &meta_out);

  // Destroy the event as soon as InferMeta returns so it covers only that call.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, float, float, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("hard_sigmoid_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_out, *input_out_grad, slope, offset, kernel_out);
  kernel_record_event.reset();

  // After a CPU-fallback run, hand the output to TransDataBackend together
  // with the backend that was originally selected.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API for hard_swish. Selects the "hard_swish_grad" phi kernel from
// the input tensors, runs UnchangedInferMeta from `x` onto the output, then
// invokes the kernel to fill `x_grad`.
//
// `threshold`, `scale` and `offset` are forwarded to the kernel unchanged.
PADDLE_API void hard_swish_grad(const Tensor& x, const Tensor& out_grad, float threshold, float scale, float offset, Tensor* x_grad) {
  // Backend, layout and data type all come from the highest-priority kernel
  // key parsed from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "hard_swish_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "hard_swish_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "hard_swish_grad kernel: " << kernel;
  // A kernel selected as CPU fallback runs on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});

  // Report input shapes to the profiler when op-info recording is enabled.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("hard_swish_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiling events are owned by unique_ptr so they are destroyed even
  // if InferMeta or the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("hard_swish_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the event as soon as InferMeta returns so it covers only that call.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, float, float, float, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("hard_swish_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, threshold, scale, offset, kernel_out);
  kernel_record_event.reset();

  // After a CPU-fallback run, hand the output to TransDataBackend together
  // with the backend that was originally selected.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API for hierarchical_sigmoid. Selects the
// "hierarchical_sigmoid_grad" phi kernel from the input tensors, runs
// GeneralTernaryGradInferMeta on the outputs, then invokes the kernel to fill
// `x_grad`, `w_grad` and `bias_grad`.
//
// `path`, `code` and `bias` are optional and are forwarded to the kernel as
// optionals; all non-tensor attributes are forwarded unchanged. InferMeta is
// driven by x, w and bias, and receives nullptr for any output whose kernel
// tensor (as returned by SetKernelOutput) is null.
PADDLE_API void hierarchical_sigmoid_grad(const Tensor& x, const Tensor& w, const Tensor& label, const paddle::optional<Tensor>& path, const paddle::optional<Tensor>& code, const paddle::optional<Tensor>& bias, const Tensor& pre_out, const Tensor& out_grad, int num_classes, bool remote_prefetch, int trainer_id, const std::vector<int64_t>& height_sections, const std::vector<std::string>& epmap, const std::vector<std::string>& table_names, bool is_sparse, Tensor* x_grad, Tensor* w_grad, Tensor* bias_grad) {
  // Backend, layout and data type all come from the highest-priority kernel
  // key parsed from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, w, label, path, code, bias, pre_out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "hierarchical_sigmoid_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "hierarchical_sigmoid_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "hierarchical_sigmoid_grad kernel: " << kernel;
  // A kernel selected as CPU fallback runs on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_w = PrepareData(w, kernel.InputAt(1), {});
  auto input_label = PrepareData(label, kernel.InputAt(2), {});
  auto input_path = PrepareData(path, kernel.InputAt(3), {});
  auto input_code = PrepareData(code, kernel.InputAt(4), {});
  auto input_bias = PrepareData(bias, kernel.InputAt(5), {});
  auto input_pre_out = PrepareData(pre_out, kernel.InputAt(6), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(7), {});

  // Report input shapes to the profiler; optional inputs contribute an empty
  // shape list when they are absent.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<phi::DDim> path_record_shapes;
    if (input_path) {
      path_record_shapes.push_back((*input_path).dims());
    }
    std::vector<phi::DDim> code_record_shapes;
    if (input_code) {
      code_record_shapes.push_back((*input_code).dims());
    }
    std::vector<phi::DDim> bias_record_shapes;
    if (input_bias) {
      bias_record_shapes.push_back((*input_bias).dims());
    }
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"w", {input_w->dims()}},
        {"label", {input_label->dims()}},
        {"path", path_record_shapes},
        {"code", code_record_shapes},
        {"bias", bias_record_shapes},
        {"pre_out", {input_pre_out->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("hierarchical_sigmoid_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(w_grad);
  auto kernel_out_2 = SetKernelOutput(bias_grad);

  // The profiling events are owned by unique_ptr so they are destroyed even
  // if InferMeta or the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("hierarchical_sigmoid_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);
  phi::MetaTensor meta_out_2(kernel_out_2);

  phi::GeneralTernaryGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_w), MakeMetaTensor(input_bias), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr, kernel_out_2 ? &meta_out_2 : nullptr);

  // Destroy the event as soon as InferMeta returns so it covers only that call.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const paddle::optional<phi::DenseTensor>&, const paddle::optional<phi::DenseTensor>&, const paddle::optional<phi::DenseTensor>&, const phi::DenseTensor&, const phi::DenseTensor&, int, bool, int, const std::vector<int64_t>&, const std::vector<std::string>&, const std::vector<std::string>&, bool, phi::DenseTensor*, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("hierarchical_sigmoid_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_w, *input_label, input_path, input_code, input_bias, *input_pre_out, *input_out_grad, num_classes, remote_prefetch, trainer_id, height_sections, epmap, table_names, is_sparse, kernel_out_0, kernel_out_1, kernel_out_2);
  kernel_record_event.reset();

  // After a CPU-fallback run, hand each output to TransDataBackend together
  // with the backend that was originally selected.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
    TransDataBackend(kernel_out_2, kernel_backend, kernel_out_2);
  }
}

// Backward API for huber_loss. Selects the "huber_loss_grad" phi kernel from
// the input tensors, runs GeneralBinaryGradInferMeta on the outputs, then
// invokes the kernel to fill `input_grad` and `label_grad`.
//
// `delta` is forwarded to the kernel unchanged. InferMeta receives nullptr
// for any output whose kernel tensor (as returned by SetKernelOutput) is null.
PADDLE_API void huber_loss_grad(const Tensor& residual, const Tensor& out_grad, float delta, Tensor* input_grad, Tensor* label_grad) {
  // Backend, layout and data type all come from the highest-priority kernel
  // key parsed from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(residual, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "huber_loss_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "huber_loss_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "huber_loss_grad kernel: " << kernel;
  // A kernel selected as CPU fallback runs on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_residual = PrepareData(residual, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});

  // Report input shapes to the profiler when op-info recording is enabled.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"residual", {input_residual->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("huber_loss_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(input_grad);
  auto kernel_out_1 = SetKernelOutput(label_grad);

  // The profiling events are owned by unique_ptr so they are destroyed even
  // if InferMeta or the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("huber_loss_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  // NOTE(review): `residual` is passed as both InferMeta inputs; presumably
  // intentional because the forward input/label tensors are not available
  // here -- confirm against the op definition.
  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_residual), MakeMetaTensor(*input_residual), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);

  // Destroy the event as soon as InferMeta returns so it covers only that call.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, float, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("huber_loss_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_residual, *input_out_grad, delta, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  // After a CPU-fallback run, hand each output to TransDataBackend together
  // with the backend that was originally selected.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// Public entry point for imag_grad. It performs no dispatch of its own and
// simply forwards both arguments to the hand-written imag_grad_impl.
//   out_grad - input tensor, forwarded as-is.
//   x_grad   - output tensor pointer, forwarded as-is.
PADDLE_API void imag_grad(const Tensor& out_grad,
                          Tensor* x_grad) {
  imag_grad_impl(out_grad, x_grad);
}
// Forwards a call to the registered "index_add_grad" phi kernel.
//
// Steps: resolve the kernel key (backend and layout from the highest-priority
// key of the inputs, dtype from `out_grad` unless that is UNDEFINED), select
// the kernel, prepare the inputs, run IndexAddGradInferMeta on the outputs,
// invoke the kernel and, if kernel selection fell back to CPU, pass the
// outputs through TransDataBackend with the originally resolved backend.
//
// Parameters:
//   index, add_value, out_grad - kernel inputs 0, 1 and 2.
//   axis                       - forwarded unchanged to infer-meta and kernel.
//   x_grad, add_value_grad     - output tensors bound via SetKernelOutput; a
//                                null kernel output is handed to infer-meta
//                                as nullptr.
PADDLE_API void index_add_grad(const Tensor& index, const Tensor& add_value, const Tensor& out_grad, int axis, Tensor* x_grad, Tensor* add_value_grad) {
  // The data type follows out_grad; the key derived from the inputs is only a
  // fallback for it. Backend and layout always come from that key.
  DataType kernel_data_type = ParseDataType(out_grad);

  auto kernel_key_set = ParseKernelKeyByInputArgs(index, add_value, out_grad);
  const auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "index_add_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "index_add_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "index_add_grad kernel: " << kernel;
  // When the selection fell back to a CPU kernel, run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_index = PrepareData(index, kernel.InputAt(0), {});
  auto input_add_value = PrepareData(add_value, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the prepared input shapes to the profiler.
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"index", {input_index->dims()}},
        {"add_value", {input_add_value->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("index_add_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(add_value_grad);

  // Held by unique_ptr so the event object is destroyed even if infer-meta
  // throws; reset() below destroys it at the same point as before.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "index_add_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  phi::IndexAddGradInferMeta(MakeMetaTensor(*input_index), MakeMetaTensor(*input_add_value), MakeMetaTensor(*input_out_grad), axis, kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, int, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "index_add_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_index, *input_add_value, *input_out_grad, axis, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// Forwards a call to the registered "index_sample_grad" phi kernel.
//
// Steps: resolve the kernel key (backend and layout from the highest-priority
// key of the inputs, dtype from `out_grad` unless that is UNDEFINED), select
// the kernel, prepare the inputs, run UnchangedInferMeta from the prepared `x`
// onto the output, invoke the kernel and, if kernel selection fell back to
// CPU, pass the output through TransDataBackend with the originally resolved
// backend.
//
// Parameters:
//   x, index, out_grad - kernel inputs 0, 1 and 2.
//   x_grad             - output tensor bound via SetKernelOutput.
PADDLE_API void index_sample_grad(const Tensor& x, const Tensor& index, const Tensor& out_grad, Tensor* x_grad) {
  // The data type follows out_grad; the key derived from the inputs is only a
  // fallback for it. Backend and layout always come from that key.
  DataType kernel_data_type = ParseDataType(out_grad);

  auto kernel_key_set = ParseKernelKeyByInputArgs(x, index, out_grad);
  const auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "index_sample_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "index_sample_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "index_sample_grad kernel: " << kernel;
  // When the selection fell back to a CPU kernel, run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_index = PrepareData(index, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the prepared input shapes to the profiler.
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"index", {input_index->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("index_sample_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Held by unique_ptr so the event object is destroyed even if infer-meta
  // throws; reset() below destroys it at the same point as before.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "index_sample_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "index_sample_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_index, *input_out_grad, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Forwards a call to the registered "index_select_grad" phi kernel.
//
// Steps: resolve the kernel key (backend and layout from the highest-priority
// key of the inputs, dtype from `x` unless that is UNDEFINED), select the
// kernel, prepare the inputs, run UnchangedInferMeta from the prepared `x`
// onto the output, invoke the kernel and, if kernel selection fell back to
// CPU, pass the output through TransDataBackend with the originally resolved
// backend.
//
// Parameters:
//   x, index, out_grad - kernel inputs 0, 1 and 2.
//   dim                - forwarded unchanged to the kernel.
//   x_grad             - output tensor bound via SetKernelOutput.
PADDLE_API void index_select_grad(const Tensor& x, const Tensor& index, const Tensor& out_grad, int dim, Tensor* x_grad) {
  // The data type follows x; the key derived from the inputs is only a
  // fallback for it. Backend and layout always come from that key.
  DataType kernel_data_type = ParseDataType(x);

  auto kernel_key_set = ParseKernelKeyByInputArgs(x, index, out_grad);
  const auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "index_select_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "index_select_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "index_select_grad kernel: " << kernel;
  // When the selection fell back to a CPU kernel, run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_index = PrepareData(index, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the prepared input shapes to the profiler.
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"index", {input_index->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("index_select_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Held by unique_ptr so the event object is destroyed even if infer-meta
  // throws; reset() below destroys it at the same point as before.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "index_select_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, int, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "index_select_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_index, *input_out_grad, dim, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Forwards a call to the registered "instance_norm_double_grad" phi kernel.
//
// Steps: resolve the kernel key (backend and layout from the highest-priority
// key of all inputs, dtype from `x` unless that is UNDEFINED), select the
// kernel, prepare the inputs, run InstanceNormDoubleGradInferMeta on the
// outputs, invoke the kernel and, if kernel selection fell back to CPU, pass
// the outputs through TransDataBackend with the originally resolved backend.
//
// Parameters:
//   x, saved_mean, saved_variance, grad_y - required kernel inputs
//       (positions 0, 2, 3 and 4).
//   fwd_scale, grad_x_grad, grad_scale_grad, grad_bias_grad - optional kernel
//       inputs (positions 1, 5, 6 and 7); forwarded as optionals.
//   epsilon - forwarded unchanged to infer-meta and the kernel.
//   x_grad, fwd_scale_grad, grad_y_grad - output tensors bound via
//       SetKernelOutput; a null kernel output is handed to infer-meta as
//       nullptr.
PADDLE_API void instance_norm_double_grad(const Tensor& x, const paddle::optional<Tensor>& fwd_scale, const Tensor& saved_mean, const Tensor& saved_variance, const Tensor& grad_y, const paddle::optional<Tensor>& grad_x_grad, const paddle::optional<Tensor>& grad_scale_grad, const paddle::optional<Tensor>& grad_bias_grad, float epsilon, Tensor* x_grad, Tensor* fwd_scale_grad, Tensor* grad_y_grad) {
  // The data type follows x; the key derived from the inputs is only a
  // fallback for it. Backend and layout always come from that key.
  DataType kernel_data_type = ParseDataType(x);

  auto kernel_key_set = ParseKernelKeyByInputArgs(x, fwd_scale, saved_mean, saved_variance, grad_y, grad_x_grad, grad_scale_grad, grad_bias_grad);
  const auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "instance_norm_double_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "instance_norm_double_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "instance_norm_double_grad kernel: " << kernel;
  // When the selection fell back to a CPU kernel, run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_fwd_scale = PrepareData(fwd_scale, kernel.InputAt(1), {});
  auto input_saved_mean = PrepareData(saved_mean, kernel.InputAt(2), {});
  auto input_saved_variance = PrepareData(saved_variance, kernel.InputAt(3), {});
  auto input_grad_y = PrepareData(grad_y, kernel.InputAt(4), {});
  auto input_grad_x_grad = PrepareData(grad_x_grad, kernel.InputAt(5), {});
  auto input_grad_scale_grad = PrepareData(grad_scale_grad, kernel.InputAt(6), {});
  auto input_grad_bias_grad = PrepareData(grad_bias_grad, kernel.InputAt(7), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Optional inputs contribute a shape only when they are present; an
    // absent one is reported with an empty shape list.
    std::vector<phi::DDim> fwd_scale_record_shapes;
    if (input_fwd_scale) {
      fwd_scale_record_shapes.push_back((*input_fwd_scale).dims());
    }
    std::vector<phi::DDim> grad_x_grad_record_shapes;
    if (input_grad_x_grad) {
      grad_x_grad_record_shapes.push_back((*input_grad_x_grad).dims());
    }
    std::vector<phi::DDim> grad_scale_grad_record_shapes;
    if (input_grad_scale_grad) {
      grad_scale_grad_record_shapes.push_back((*input_grad_scale_grad).dims());
    }
    std::vector<phi::DDim> grad_bias_grad_record_shapes;
    if (input_grad_bias_grad) {
      grad_bias_grad_record_shapes.push_back((*input_grad_bias_grad).dims());
    }
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"fwd_scale", fwd_scale_record_shapes},
        {"saved_mean", {input_saved_mean->dims()}},
        {"saved_variance", {input_saved_variance->dims()}},
        {"grad_y", {input_grad_y->dims()}},
        {"grad_x_grad", grad_x_grad_record_shapes},
        {"grad_scale_grad", grad_scale_grad_record_shapes},
        {"grad_bias_grad", grad_bias_grad_record_shapes}};
    platform::RecordOpInfoSupplement("instance_norm_double_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(fwd_scale_grad);
  auto kernel_out_2 = SetKernelOutput(grad_y_grad);

  // Held by unique_ptr so the event object is destroyed even if infer-meta
  // throws; reset() below destroys it at the same point as before.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "instance_norm_double_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);
  phi::MetaTensor meta_out_2(kernel_out_2);

  phi::InstanceNormDoubleGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(input_fwd_scale), MakeMetaTensor(*input_saved_mean), MakeMetaTensor(*input_saved_variance), MakeMetaTensor(*input_grad_y), MakeMetaTensor(input_grad_x_grad), MakeMetaTensor(input_grad_scale_grad), MakeMetaTensor(input_grad_bias_grad), epsilon, kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr, kernel_out_2 ? &meta_out_2 : nullptr);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const paddle::optional<phi::DenseTensor>&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const paddle::optional<phi::DenseTensor>&, const paddle::optional<phi::DenseTensor>&, const paddle::optional<phi::DenseTensor>&, float, phi::DenseTensor*, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "instance_norm_double_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, input_fwd_scale, *input_saved_mean, *input_saved_variance, *input_grad_y, input_grad_x_grad, input_grad_scale_grad, input_grad_bias_grad, epsilon, kernel_out_0, kernel_out_1, kernel_out_2);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
    TransDataBackend(kernel_out_2, kernel_backend, kernel_out_2);
  }
}

// Forwards a call to the registered "instance_norm_grad" phi kernel.
//
// Steps: resolve the kernel key (backend and layout from the highest-priority
// key of all inputs, dtype from `x` unless that is UNDEFINED), select the
// kernel, prepare the inputs, run InstanceNormGradInferMeta on the outputs,
// invoke the kernel and, if kernel selection fell back to CPU, pass the
// outputs through TransDataBackend with the originally resolved backend.
//
// Parameters:
//   x, saved_mean, saved_variance, y_grad - required kernel inputs
//       (positions 0, 2, 3 and 4).
//   scale   - optional kernel input (position 1); forwarded as an optional.
//   epsilon - forwarded unchanged to infer-meta and the kernel.
//   x_grad, scale_grad, bias_grad - output tensors bound via SetKernelOutput;
//       a null kernel output is handed to infer-meta as nullptr.
PADDLE_API void instance_norm_grad(const Tensor& x, const paddle::optional<Tensor>& scale, const Tensor& saved_mean, const Tensor& saved_variance, const Tensor& y_grad, float epsilon, Tensor* x_grad, Tensor* scale_grad, Tensor* bias_grad) {
  // The data type follows x; the key derived from the inputs is only a
  // fallback for it. Backend and layout always come from that key.
  DataType kernel_data_type = ParseDataType(x);

  auto kernel_key_set = ParseKernelKeyByInputArgs(x, scale, saved_mean, saved_variance, y_grad);
  const auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "instance_norm_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "instance_norm_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "instance_norm_grad kernel: " << kernel;
  // When the selection fell back to a CPU kernel, run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_scale = PrepareData(scale, kernel.InputAt(1), {});
  auto input_saved_mean = PrepareData(saved_mean, kernel.InputAt(2), {});
  auto input_saved_variance = PrepareData(saved_variance, kernel.InputAt(3), {});
  auto input_y_grad = PrepareData(y_grad, kernel.InputAt(4), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // The optional scale contributes a shape only when it is present.
    std::vector<phi::DDim> scale_record_shapes;
    if (input_scale) {
      scale_record_shapes.push_back((*input_scale).dims());
    }
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"scale", scale_record_shapes},
        {"saved_mean", {input_saved_mean->dims()}},
        {"saved_variance", {input_saved_variance->dims()}},
        {"y_grad", {input_y_grad->dims()}}};
    platform::RecordOpInfoSupplement("instance_norm_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(scale_grad);
  auto kernel_out_2 = SetKernelOutput(bias_grad);

  // Held by unique_ptr so the event object is destroyed even if infer-meta
  // throws; reset() below destroys it at the same point as before.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "instance_norm_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);
  phi::MetaTensor meta_out_2(kernel_out_2);

  phi::InstanceNormGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(input_scale), MakeMetaTensor(*input_saved_mean), MakeMetaTensor(*input_saved_variance), MakeMetaTensor(*input_y_grad), epsilon, kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr, kernel_out_2 ? &meta_out_2 : nullptr);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const paddle::optional<phi::DenseTensor>&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, float, phi::DenseTensor*, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "instance_norm_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, input_scale, *input_saved_mean, *input_saved_variance, *input_y_grad, epsilon, kernel_out_0, kernel_out_1, kernel_out_2);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
    TransDataBackend(kernel_out_2, kernel_backend, kernel_out_2);
  }
}

// Forwards a call to the registered "inverse_grad" phi kernel.
//
// Steps: resolve the kernel key (backend, layout and dtype all come from the
// highest-priority key of the inputs), select the kernel, prepare the inputs,
// run InverseGradInferMeta on the output, invoke the kernel and, if kernel
// selection fell back to CPU, pass the output through TransDataBackend with
// the originally resolved backend.
//
// Parameters:
//   out, out_grad - kernel inputs 0 and 1.
//   x_grad        - output tensor bound via SetKernelOutput.
PADDLE_API void inverse_grad(const Tensor& out, const Tensor& out_grad, Tensor* x_grad) {
  // No explicit data type is configured for this op, so every component of
  // the kernel key is derived from the input tensors.
  auto kernel_key_set = ParseKernelKeyByInputArgs(out, out_grad);
  const auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "inverse_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "inverse_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "inverse_grad kernel: " << kernel;
  // When the selection fell back to a CPU kernel, run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out = PrepareData(out, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the prepared input shapes to the profiler.
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out", {input_out->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("inverse_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Held by unique_ptr so the event object is destroyed even if infer-meta
  // throws; reset() below destroys it at the same point as before.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "inverse_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::InverseGradInferMeta(MakeMetaTensor(*input_out), MakeMetaTensor(*input_out_grad), &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "inverse_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_out, *input_out_grad, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Forwards a call to the registered "kldiv_loss_grad" phi kernel.
//
// Steps: resolve the kernel key (backend, layout and dtype all come from the
// highest-priority key of the inputs), select the kernel, prepare the inputs,
// run UnchangedInferMeta from the prepared `x` onto the output, invoke the
// kernel and, if kernel selection fell back to CPU, pass the output through
// TransDataBackend with the originally resolved backend.
//
// Parameters:
//   x, label, out_grad - kernel inputs 0, 1 and 2.
//   reduction          - forwarded unchanged to the kernel.
//   x_grad             - output tensor bound via SetKernelOutput.
PADDLE_API void kldiv_loss_grad(const Tensor& x, const Tensor& label, const Tensor& out_grad, const std::string& reduction, Tensor* x_grad) {
  // No explicit data type is configured for this op, so every component of
  // the kernel key is derived from the input tensors.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, label, out_grad);
  const auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "kldiv_loss_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "kldiv_loss_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "kldiv_loss_grad kernel: " << kernel;
  // When the selection fell back to a CPU kernel, run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_label = PrepareData(label, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the prepared input shapes to the profiler.
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"label", {input_label->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("kldiv_loss_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Held by unique_ptr so the event object is destroyed even if infer-meta
  // throws; reset() below destroys it at the same point as before.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "kldiv_loss_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const std::string&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "kldiv_loss_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_label, *input_out_grad, reduction, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Forwards a call to the registered "kron_grad" phi kernel.
//
// Steps: resolve the kernel key (backend and layout from the highest-priority
// key of the inputs, dtype from `out_grad` unless that is UNDEFINED), select
// the kernel, prepare the inputs, run GeneralBinaryGradInferMeta from the
// prepared `x` and `y` onto the outputs, invoke the kernel and, if kernel
// selection fell back to CPU, pass the outputs through TransDataBackend with
// the originally resolved backend.
//
// Parameters:
//   x, y, out_grad - kernel inputs 0, 1 and 2.
//   x_grad, y_grad - output tensors bound via SetKernelOutput; a null kernel
//                    output is handed to infer-meta as nullptr.
PADDLE_API void kron_grad(const Tensor& x, const Tensor& y, const Tensor& out_grad, Tensor* x_grad, Tensor* y_grad) {
  // The data type follows out_grad; the key derived from the inputs is only a
  // fallback for it. Backend and layout always come from that key.
  DataType kernel_data_type = ParseDataType(out_grad);

  auto kernel_key_set = ParseKernelKeyByInputArgs(x, y, out_grad);
  const auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "kron_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "kron_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "kron_grad kernel: " << kernel;
  // When the selection fell back to a CPU kernel, run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_y = PrepareData(y, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the prepared input shapes to the profiler.
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"y", {input_y->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("kron_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(y_grad);

  // Held by unique_ptr so the event object is destroyed even if infer-meta
  // throws; reset() below destroys it at the same point as before.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "kron_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_y), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "kron_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_y, *input_out_grad, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// Dispatches the registered "kthvalue_grad" phi kernel.
//
// The kernel key (backend, layout, dtype) is parsed from x, indices and
// out_grad. Each input goes through PrepareData, the output meta is filled by
// phi::UnchangedInferMeta from x, and the kernel writes into the tensor that
// SetKernelOutput returns for x_grad. When kernel_result.has_fallback_cpu is
// set, the kernel runs with the CPU device context and the output is then
// handed to TransDataBackend together with the selected backend.
//
// Profiler events are owned by std::unique_ptr so they are destroyed even if
// infer-meta or the kernel call throws.
PADDLE_API void kthvalue_grad(const Tensor& x, const Tensor& indices, const Tensor& out_grad, int k, int axis, bool keepdim, Tensor* x_grad) {
  // Backend, layout and dtype are all undefined on entry for this API, so each
  // is taken from the highest-priority kernel key of the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, indices, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "kthvalue_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "kthvalue_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "kthvalue_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_indices = PrepareData(indices, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});

  // Report the input shapes to the profiler only when it asks for them.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"indices", {(*input_indices).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("kthvalue_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "kthvalue_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the infer_meta event before the compute event is created.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, int, int, bool, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "kthvalue_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_indices, *input_out_grad, k, axis, keepdim, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the registered "label_smooth_grad" phi kernel.
//
// The kernel key (backend, layout, dtype) is parsed from out_grad. The input
// goes through PrepareData, the output meta is filled by
// phi::UnchangedInferMeta from out_grad, and the kernel writes into the tensor
// that SetKernelOutput returns for label_grad. When
// kernel_result.has_fallback_cpu is set, the kernel runs with the CPU device
// context and the output is then handed to TransDataBackend together with the
// selected backend.
//
// Profiler events are owned by std::unique_ptr so they are destroyed even if
// infer-meta or the kernel call throws.
PADDLE_API void label_smooth_grad(const Tensor& out_grad, float epsilon, Tensor* label_grad) {
  // Backend, layout and dtype are all undefined on entry for this API, so each
  // is taken from the highest-priority kernel key of the tensor input.
  auto kernel_key_set = ParseKernelKeyByInputArgs(out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "label_smooth_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "label_smooth_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "label_smooth_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(0), {});

  // Report the input shape to the profiler only when it asks for it.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("label_smooth_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(label_grad);

  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "label_smooth_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_out_grad), &meta_out);

  // Destroy the infer_meta event before the compute event is created.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, float, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "label_smooth_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_out_grad, epsilon, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the registered "layer_norm_grad" phi kernel.
//
// The kernel dtype is taken from out_grad via ParseDataType; backend and
// layout (and the dtype, if ParseDataType yields UNDEFINED) come from the
// highest-priority kernel key of all tensor inputs. scale and bias are
// optional and are forwarded to the kernel as optionals. Output metas are
// filled by phi::LayerNormGradInferMeta; a null meta pointer is passed for any
// output whose SetKernelOutput result is null. When
// kernel_result.has_fallback_cpu is set, the kernel runs with the CPU device
// context and each output is then handed to TransDataBackend together with the
// selected backend.
//
// Profiler events are owned by std::unique_ptr so they are destroyed even if
// infer-meta or the kernel call throws.
PADDLE_API void layer_norm_grad(const Tensor& x, const paddle::optional<Tensor>& scale, const paddle::optional<Tensor>& bias, const Tensor& mean, const Tensor& variance, const Tensor& out_grad, float epsilon, int begin_norm_axis, bool is_test, Tensor* x_grad, Tensor* scale_grad, Tensor* bias_grad) {
  DataType kernel_data_type = ParseDataType(out_grad);

  // Backend and layout are undefined on entry, so the input-derived kernel key
  // is always consulted; the dtype is only overridden when still undefined.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, scale, bias, mean, variance, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "layer_norm_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "layer_norm_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "layer_norm_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_scale = PrepareData(scale, kernel.InputAt(1), {});
  auto input_bias = PrepareData(bias, kernel.InputAt(2), {});
  auto input_mean = PrepareData(mean, kernel.InputAt(3), {});
  auto input_variance = PrepareData(variance, kernel.InputAt(4), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(5), {});

  // Report the input shapes to the profiler only when it asks for them.
  // Optional inputs contribute an empty shape list when absent.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<phi::DDim> scale_record_shapes;
    if (input_scale) {
      scale_record_shapes.push_back((*input_scale).dims());
    }
    std::vector<phi::DDim> bias_record_shapes;
    if (input_bias) {
      bias_record_shapes.push_back((*input_bias).dims());
    }
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"scale", scale_record_shapes},
        {"bias", bias_record_shapes},
        {"mean", {(*input_mean).dims()}},
        {"variance", {(*input_variance).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("layer_norm_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(scale_grad);
  auto kernel_out_2 = SetKernelOutput(bias_grad);

  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "layer_norm_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);
  phi::MetaTensor meta_out_2(kernel_out_2);

  phi::LayerNormGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(input_scale), MakeMetaTensor(input_bias), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr, kernel_out_2 ? &meta_out_2 : nullptr);

  // Destroy the infer_meta event before the compute event is created.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const paddle::optional<phi::DenseTensor>&, const paddle::optional<phi::DenseTensor>&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, float, int, bool, phi::DenseTensor*, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "layer_norm_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, input_scale, input_bias, *input_mean, *input_variance, *input_out_grad, epsilon, begin_norm_axis, is_test, kernel_out_0, kernel_out_1, kernel_out_2);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
    TransDataBackend(kernel_out_2, kernel_backend, kernel_out_2);
  }
}

// Dispatches the registered "leaky_relu_double_grad" phi kernel.
//
// The kernel key (backend, layout, dtype) is parsed from x and grad_x_grad.
// Each input goes through PrepareData, the output meta is filled by
// phi::UnchangedInferMeta from grad_x_grad, and the kernel writes into the
// tensor that SetKernelOutput returns for grad_out_grad. When
// kernel_result.has_fallback_cpu is set, the kernel runs with the CPU device
// context and the output is then handed to TransDataBackend together with the
// selected backend.
//
// Profiler events are owned by std::unique_ptr so they are destroyed even if
// infer-meta or the kernel call throws.
PADDLE_API void leaky_relu_double_grad(const Tensor& x, const Tensor& grad_x_grad, float alpha, Tensor* grad_out_grad) {
  // Backend, layout and dtype are all undefined on entry for this API, so each
  // is taken from the highest-priority kernel key of the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, grad_x_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "leaky_relu_double_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "leaky_relu_double_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "leaky_relu_double_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_grad_x_grad = PrepareData(grad_x_grad, kernel.InputAt(1), {});

  // Report the input shapes to the profiler only when it asks for them.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"grad_x_grad", {(*input_grad_x_grad).dims()}}};
    platform::RecordOpInfoSupplement("leaky_relu_double_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(grad_out_grad);

  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "leaky_relu_double_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_grad_x_grad), &meta_out);

  // Destroy the infer_meta event before the compute event is created.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, float, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "leaky_relu_double_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_grad_x_grad, alpha, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the registered "leaky_relu_grad" phi kernel.
//
// The kernel key (backend, layout, dtype) is parsed from x and out_grad. Each
// input goes through PrepareData, the output meta is filled by
// phi::UnchangedInferMeta from x, and the kernel writes into the tensor that
// SetKernelOutput returns for x_grad. When kernel_result.has_fallback_cpu is
// set, the kernel runs with the CPU device context and the output is then
// handed to TransDataBackend together with the selected backend.
//
// Profiler events are owned by std::unique_ptr so they are destroyed even if
// infer-meta or the kernel call throws.
PADDLE_API void leaky_relu_grad(const Tensor& x, const Tensor& out_grad, float alpha, Tensor* x_grad) {
  // Backend, layout and dtype are all undefined on entry for this API, so each
  // is taken from the highest-priority kernel key of the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "leaky_relu_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "leaky_relu_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "leaky_relu_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});

  // Report the input shapes to the profiler only when it asks for them.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("leaky_relu_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "leaky_relu_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the infer_meta event before the compute event is created.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, float, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "leaky_relu_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, alpha, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the registered "lerp_grad" phi kernel.
//
// The kernel key (backend, layout, dtype) is parsed from x, y, weight, out and
// out_grad. Each input goes through PrepareData. Output metas are filled by
// phi::GeneralBinaryGradInferMeta from x and y; a null meta pointer is passed
// for any output whose SetKernelOutput result is null. When
// kernel_result.has_fallback_cpu is set, the kernel runs with the CPU device
// context and each output is then handed to TransDataBackend together with the
// selected backend.
//
// Profiler events are owned by std::unique_ptr so they are destroyed even if
// infer-meta or the kernel call throws.
PADDLE_API void lerp_grad(const Tensor& x, const Tensor& y, const Tensor& weight, const Tensor& out, const Tensor& out_grad, Tensor* x_grad, Tensor* y_grad) {
  // Backend, layout and dtype are all undefined on entry for this API, so each
  // is taken from the highest-priority kernel key of the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, y, weight, out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "lerp_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "lerp_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "lerp_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_y = PrepareData(y, kernel.InputAt(1), {});
  auto input_weight = PrepareData(weight, kernel.InputAt(2), {});
  auto input_out = PrepareData(out, kernel.InputAt(3), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(4), {});

  // Report the input shapes to the profiler only when it asks for them.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"y", {(*input_y).dims()}},
        {"weight", {(*input_weight).dims()}},
        {"out", {(*input_out).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("lerp_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(y_grad);

  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "lerp_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_y), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);

  // Destroy the infer_meta event before the compute event is created.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "lerp_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_y, *input_weight, *input_out, *input_out_grad, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// Dispatches the registered "linear_interp_grad" phi kernel.
//
// The kernel dtype is taken from output_grad via ParseDataType; backend and
// layout (and the dtype, if ParseDataType yields UNDEFINED) come from the
// highest-priority kernel key of all tensor inputs. out_size, size_tensor and
// scale_tensor are optional; size_tensor is converted to an optional vector of
// DenseTensor pointers before being forwarded. The output meta is filled by
// phi::UnchangedInferMeta from x, and the kernel writes into the tensor that
// SetKernelOutput returns for x_grad. When kernel_result.has_fallback_cpu is
// set, the kernel runs with the CPU device context and the output is then
// handed to TransDataBackend together with the selected backend.
//
// Profiler events are owned by std::unique_ptr so they are destroyed even if
// infer-meta or the kernel call throws.
PADDLE_API void linear_interp_grad(const Tensor& x, const paddle::optional<Tensor>& out_size, const paddle::optional<std::vector<Tensor>>& size_tensor, const paddle::optional<Tensor>& scale_tensor, const Tensor& output_grad, const std::string& data_layout, int out_d, int out_h, int out_w, const std::vector<float>& scale, const std::string& interp_method, bool align_corners, int align_mode, Tensor* x_grad) {
  DataType kernel_data_type = ParseDataType(output_grad);

  // Backend and layout are undefined on entry, so the input-derived kernel key
  // is always consulted; the dtype is only overridden when still undefined.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_size, size_tensor, scale_tensor, output_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "linear_interp_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "linear_interp_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "linear_interp_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_size = PrepareData(out_size, kernel.InputAt(1), {});
  auto input_size_tensor_vec = PrepareData(size_tensor, kernel.InputAt(2), {});

  // The kernel takes pointers to the prepared tensors, which stay owned by
  // input_size_tensor_vec for the remainder of this function.
  paddle::optional<std::vector<const phi::DenseTensor*>> input_size_tensor;
  if (input_size_tensor_vec) {
    input_size_tensor = paddle::optional<std::vector<const phi::DenseTensor*>>(input_size_tensor_vec->size());
    for (size_t i = 0; i < input_size_tensor_vec->size(); ++i) {
      input_size_tensor->at(i) = &input_size_tensor_vec->at(i);
    }
  }
  auto input_scale_tensor = PrepareData(scale_tensor, kernel.InputAt(3), {});
  auto input_output_grad = PrepareData(output_grad, kernel.InputAt(4), {});

  // Report the input shapes to the profiler only when it asks for them.
  // Optional inputs contribute an empty shape list when absent.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<phi::DDim> out_size_record_shapes;
    if (input_out_size) {
      out_size_record_shapes.push_back((*input_out_size).dims());
    }
    std::vector<phi::DDim> scale_tensor_record_shapes;
    if (input_scale_tensor) {
      scale_tensor_record_shapes.push_back((*input_scale_tensor).dims());
    }
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_size", out_size_record_shapes},
        {"scale_tensor", scale_tensor_record_shapes},
        {"output_grad", {(*input_output_grad).dims()}}};
    // size_tensor is a tensor list, so its entry (appended last) carries one
    // shape per element.
    std::vector<phi::DDim> ddims_vec;
    if (input_size_tensor) {
      ddims_vec.reserve(input_size_tensor->size());
      for (size_t i = 0; i < input_size_tensor->size(); ++i) {
        ddims_vec.emplace_back((*input_size_tensor->at(i)).dims());
      }
    }
    input_shapes.emplace_back("size_tensor", ddims_vec);
    platform::RecordOpInfoSupplement("linear_interp_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "linear_interp_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the infer_meta event before the compute event is created.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const paddle::optional<phi::DenseTensor>&, const paddle::optional<std::vector<const phi::DenseTensor*>>&, const paddle::optional<phi::DenseTensor>&, const phi::DenseTensor&, const std::string&, int, int, int, const std::vector<float>&, const std::string&, bool, int, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "linear_interp_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, input_out_size, input_size_tensor, input_scale_tensor, *input_output_grad, data_layout, out_d, out_h, out_w, scale, interp_method, align_corners, align_mode, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the registered "log10_grad" phi kernel.
//
// The kernel key (backend, layout, dtype) is parsed from x and out_grad. Each
// input goes through PrepareData, the output meta is filled by
// phi::UnchangedInferMeta from x, and the kernel writes into the tensor that
// SetKernelOutput returns for x_grad. When kernel_result.has_fallback_cpu is
// set, the kernel runs with the CPU device context and the output is then
// handed to TransDataBackend together with the selected backend.
//
// Profiler events are owned by std::unique_ptr so they are destroyed even if
// infer-meta or the kernel call throws.
PADDLE_API void log10_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {
  // Backend, layout and dtype are all undefined on entry for this API, so each
  // is taken from the highest-priority kernel key of the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "log10_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "log10_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "log10_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});

  // Report the input shapes to the profiler only when it asks for them.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("log10_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "log10_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the infer_meta event before the compute event is created.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "log10_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the registered "log1p_grad" phi kernel.
//
// The kernel key (backend, layout, dtype) is parsed from x and out_grad. Each
// input goes through PrepareData, the output meta is filled by
// phi::UnchangedInferMeta from x, and the kernel writes into the tensor that
// SetKernelOutput returns for x_grad. When kernel_result.has_fallback_cpu is
// set, the kernel runs with the CPU device context and the output is then
// handed to TransDataBackend together with the selected backend.
//
// Profiler events are owned by std::unique_ptr so they are destroyed even if
// infer-meta or the kernel call throws.
PADDLE_API void log1p_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {
  // Backend, layout and dtype are all undefined on entry for this API, so each
  // is taken from the highest-priority kernel key of the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "log1p_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "log1p_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "log1p_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});

  // Report the input shapes to the profiler only when it asks for them.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("log1p_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "log1p_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the infer_meta event before the compute event is created.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "log1p_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the registered "log2_grad" phi kernel.
//
// The kernel key (backend, layout, dtype) is parsed from x and out_grad. Each
// input goes through PrepareData, the output meta is filled by
// phi::UnchangedInferMeta from x, and the kernel writes into the tensor that
// SetKernelOutput returns for x_grad. When kernel_result.has_fallback_cpu is
// set, the kernel runs with the CPU device context and the output is then
// handed to TransDataBackend together with the selected backend.
//
// Profiler events are owned by std::unique_ptr so they are destroyed even if
// infer-meta or the kernel call throws.
PADDLE_API void log2_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {
  // Backend, layout and dtype are all undefined on entry for this API, so each
  // is taken from the highest-priority kernel key of the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "log2_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "log2_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "log2_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});

  // Report the input shapes to the profiler only when it asks for them.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("log2_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "log2_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the infer_meta event before the compute event is created.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "log2_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the phi kernel registered under "log_double_grad".
//
// Steps, in order:
//   1. Derive the kernel key (backend, layout, dtype) from `x`, `grad_out`
//      and `grad_x_grad`.
//   2. Select the kernel; when the selection reports a CPU fallback, the CPU
//      device context is used instead of the one for the derived backend.
//   3. Pass the inputs through PrepareData against the kernel's input defs.
//   4. Fill the output metas with GeneralBinaryGradInferMeta.
//   5. Invoke the kernel, then (CPU fallback only) hand both outputs to
//      TransDataBackend together with the derived backend.
//
// Parameters:
//   x, grad_out, grad_x_grad - input tensors, forwarded to the kernel in that
//                              order.
//   x_grad, grad_out_grad    - destination tensors; the kernel outputs are
//                              obtained from them through SetKernelOutput.
//
// The profiling events are owned by std::unique_ptr so they are destroyed even
// when infer-meta or the kernel throws (the raw new/delete pairs leaked them).
PADDLE_API void log_double_grad(const Tensor& x, const Tensor& grad_out, const Tensor& grad_x_grad, Tensor* x_grad, Tensor* grad_out_grad) {
  // No key component is preset for this API, so all three always come from
  // the highest-priority key parsed from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, grad_out, grad_x_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "log_double_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", " << kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "log_double_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "log_double_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_grad_out = PrepareData(grad_out, kernel.InputAt(1), {});
  auto input_grad_x_grad = PrepareData(grad_x_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the prepared input shapes to the profiler supplement.
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"grad_out", {(*input_grad_out).dims()}},
        {"grad_x_grad", {(*input_grad_x_grad).dims()}}};
    platform::RecordOpInfoSupplement("log_double_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(grad_out_grad);
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("log_double_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  // The prepared `x` is passed for both meta inputs; an output whose kernel
  // slot is null gets a null meta pointer.
  // NOTE(review): presumably intentional so both outputs take x's meta —
  // confirm against the op definition.
  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_x), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);

  // Destroy the infer_meta event here so it does not span the compute phase.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("log_double_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_grad_out, *input_grad_x_grad, kernel_out_0, kernel_out_1);
  // Destroy the compute event before any backend transfer below.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// Dispatches the phi kernel registered under "log_grad".
//
// Steps, in order:
//   1. Derive the kernel key (backend, layout, dtype) from `x` and `out_grad`.
//   2. Select the kernel; when the selection reports a CPU fallback, the CPU
//      device context is used instead of the one for the derived backend.
//   3. Pass the inputs through PrepareData against the kernel's input defs.
//   4. Fill the output meta with UnchangedInferMeta from the prepared `x`.
//   5. Invoke the kernel, then (CPU fallback only) hand the output to
//      TransDataBackend together with the derived backend.
//
// Parameters:
//   x, out_grad - input tensors, forwarded to the kernel in that order.
//   x_grad      - destination tensor; the kernel output is obtained from it
//                 through SetKernelOutput.
//
// The profiling events are owned by std::unique_ptr so they are destroyed even
// when infer-meta or the kernel throws (the raw new/delete pairs leaked them).
PADDLE_API void log_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {
  // No key component is preset for this API, so all three always come from
  // the highest-priority key parsed from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "log_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", " << kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "log_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "log_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the prepared input shapes to the profiler supplement.
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("log_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("log_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the infer_meta event here so it does not span the compute phase.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("log_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, kernel_out);
  // Destroy the compute event before any backend transfer below.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the phi kernel registered under "log_loss_grad".
//
// Steps, in order:
//   1. Derive the kernel key (backend, layout, dtype) from `input`, `label`
//      and `out_grad`.
//   2. Select the kernel; when the selection reports a CPU fallback, the CPU
//      device context is used instead of the one for the derived backend.
//   3. Pass the inputs through PrepareData against the kernel's input defs.
//   4. Fill the output meta with UnchangedInferMeta from the prepared `input`.
//   5. Invoke the kernel, then (CPU fallback only) hand the output to
//      TransDataBackend together with the derived backend.
//
// Parameters:
//   input, label, out_grad - input tensors, forwarded to the kernel in that
//                            order.
//   epsilon                - scalar attribute forwarded unchanged to the
//                            kernel.
//   input_grad             - destination tensor; the kernel output is obtained
//                            from it through SetKernelOutput.
//
// The profiling events are owned by std::unique_ptr so they are destroyed even
// when infer-meta or the kernel throws (the raw new/delete pairs leaked them).
PADDLE_API void log_loss_grad(const Tensor& input, const Tensor& label, const Tensor& out_grad, float epsilon, Tensor* input_grad) {
  // No key component is preset for this API, so all three always come from
  // the highest-priority key parsed from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(input, label, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "log_loss_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", " << kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "log_loss_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "log_loss_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_input = PrepareData(input, kernel.InputAt(0), {});
  auto input_label = PrepareData(label, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the prepared input shapes to the profiler supplement.
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"input", {(*input_input).dims()}},
        {"label", {(*input_label).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("log_loss_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(input_grad);
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("log_loss_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_input), &meta_out);

  // Destroy the infer_meta event here so it does not span the compute phase.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, float, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("log_loss_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_input, *input_label, *input_out_grad, epsilon, kernel_out);
  // Destroy the compute event before any backend transfer below.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the phi kernel registered under "log_softmax_grad".
//
// Steps, in order:
//   1. Derive the kernel key (backend, layout, dtype) from `out` and
//      `out_grad`.
//   2. Select the kernel; when the selection reports a CPU fallback, the CPU
//      device context is used instead of the one for the derived backend.
//   3. Pass the inputs through PrepareData against the kernel's input defs.
//   4. Fill the output meta with UnchangedInferMeta from the prepared `out`.
//   5. Invoke the kernel, then (CPU fallback only) hand the output to
//      TransDataBackend together with the derived backend.
//
// Parameters:
//   out, out_grad - input tensors, forwarded to the kernel in that order.
//   axis          - integer attribute forwarded unchanged to the kernel.
//   x_grad        - destination tensor; the kernel output is obtained from it
//                   through SetKernelOutput.
//
// The profiling events are owned by std::unique_ptr so they are destroyed even
// when infer-meta or the kernel throws (the raw new/delete pairs leaked them).
PADDLE_API void log_softmax_grad(const Tensor& out, const Tensor& out_grad, int axis, Tensor* x_grad) {
  // No key component is preset for this API, so all three always come from
  // the highest-priority key parsed from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "log_softmax_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", " << kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "log_softmax_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "log_softmax_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out = PrepareData(out, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the prepared input shapes to the profiler supplement.
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out", {(*input_out).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("log_softmax_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("log_softmax_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_out), &meta_out);

  // Destroy the infer_meta event here so it does not span the compute phase.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, int, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("log_softmax_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_out, *input_out_grad, axis, kernel_out);
  // Destroy the compute event before any backend transfer below.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the phi kernel registered under "logcumsumexp_grad".
//
// Steps, in order:
//   1. Derive the kernel key (backend, layout, dtype) from `x`, `out` and
//      `out_grad`.
//   2. Select the kernel; when the selection reports a CPU fallback, the CPU
//      device context is used instead of the one for the derived backend.
//   3. Pass the inputs through PrepareData against the kernel's input defs.
//   4. Fill the output meta with UnchangedInferMeta from the prepared `x`.
//   5. Invoke the kernel, then (CPU fallback only) hand the output to
//      TransDataBackend together with the derived backend.
//
// Parameters:
//   x, out, out_grad                   - input tensors, forwarded to the
//                                        kernel in that order.
//   axis, flatten, exclusive, reverse  - attributes forwarded unchanged to the
//                                        kernel.
//   x_grad                             - destination tensor; the kernel output
//                                        is obtained from it through
//                                        SetKernelOutput.
//
// The profiling events are owned by std::unique_ptr so they are destroyed even
// when infer-meta or the kernel throws (the raw new/delete pairs leaked them).
PADDLE_API void logcumsumexp_grad(const Tensor& x, const Tensor& out, const Tensor& out_grad, int axis, bool flatten, bool exclusive, bool reverse, Tensor* x_grad) {
  // No key component is preset for this API, so all three always come from
  // the highest-priority key parsed from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "logcumsumexp_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", " << kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "logcumsumexp_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "logcumsumexp_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out = PrepareData(out, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the prepared input shapes to the profiler supplement.
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out", {(*input_out).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("logcumsumexp_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("logcumsumexp_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the infer_meta event here so it does not span the compute phase.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, int, bool, bool, bool, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("logcumsumexp_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out, *input_out_grad, axis, flatten, exclusive, reverse, kernel_out);
  // Destroy the compute event before any backend transfer below.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the phi kernel registered under "logit_grad".
//
// Steps, in order:
//   1. Derive the kernel key (backend, layout, dtype) from `x` and `out_grad`.
//   2. Select the kernel; when the selection reports a CPU fallback, the CPU
//      device context is used instead of the one for the derived backend.
//   3. Pass the inputs through PrepareData against the kernel's input defs.
//   4. Fill the output meta with UnchangedInferMeta from the prepared `x`.
//   5. Invoke the kernel, then (CPU fallback only) hand the output to
//      TransDataBackend together with the derived backend.
//
// Parameters:
//   x, out_grad - input tensors, forwarded to the kernel in that order.
//   eps         - scalar attribute forwarded unchanged to the kernel.
//   x_grad      - destination tensor; the kernel output is obtained from it
//                 through SetKernelOutput.
//
// The profiling events are owned by std::unique_ptr so they are destroyed even
// when infer-meta or the kernel throws (the raw new/delete pairs leaked them).
PADDLE_API void logit_grad(const Tensor& x, const Tensor& out_grad, float eps, Tensor* x_grad) {
  // No key component is preset for this API, so all three always come from
  // the highest-priority key parsed from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "logit_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", " << kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "logit_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "logit_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the prepared input shapes to the profiler supplement.
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("logit_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("logit_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the infer_meta event here so it does not span the compute phase.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, float, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("logit_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, eps, kernel_out);
  // Destroy the compute event before any backend transfer below.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the phi kernel registered under "logsigmoid_grad".
//
// Steps, in order:
//   1. Derive the kernel key (backend, layout, dtype) from `x` and `out_grad`.
//   2. Select the kernel; when the selection reports a CPU fallback, the CPU
//      device context is used instead of the one for the derived backend.
//   3. Pass the inputs through PrepareData against the kernel's input defs.
//   4. Fill the output meta with UnchangedInferMeta from the prepared `x`.
//   5. Invoke the kernel, then (CPU fallback only) hand the output to
//      TransDataBackend together with the derived backend.
//
// Parameters:
//   x, out_grad - input tensors, forwarded to the kernel in that order.
//   x_grad      - destination tensor; the kernel output is obtained from it
//                 through SetKernelOutput.
//
// The profiling events are owned by std::unique_ptr so they are destroyed even
// when infer-meta or the kernel throws (the raw new/delete pairs leaked them).
PADDLE_API void logsigmoid_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {
  // No key component is preset for this API, so all three always come from
  // the highest-priority key parsed from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "logsigmoid_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", " << kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "logsigmoid_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "logsigmoid_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the prepared input shapes to the profiler supplement.
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("logsigmoid_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("logsigmoid_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the infer_meta event here so it does not span the compute phase.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("logsigmoid_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, kernel_out);
  // Destroy the compute event before any backend transfer below.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the phi kernel registered under "logsumexp_grad".
//
// Steps, in order:
//   1. Derive the kernel key (backend, layout, dtype) from `x`, `out` and
//      `out_grad`.
//   2. Select the kernel; when the selection reports a CPU fallback, the CPU
//      device context is used instead of the one for the derived backend.
//   3. Pass the inputs through PrepareData against the kernel's input defs.
//   4. Fill the output meta with UnchangedInferMeta from the prepared `x`.
//   5. Invoke the kernel, then (CPU fallback only) hand the output to
//      TransDataBackend together with the derived backend.
//
// Parameters:
//   x, out, out_grad          - input tensors, forwarded to the kernel in that
//                               order.
//   axis, keepdim, reduce_all - attributes forwarded unchanged to the kernel.
//   x_grad                    - destination tensor; the kernel output is
//                               obtained from it through SetKernelOutput.
//
// The profiling events are owned by std::unique_ptr so they are destroyed even
// when infer-meta or the kernel throws (the raw new/delete pairs leaked them).
PADDLE_API void logsumexp_grad(const Tensor& x, const Tensor& out, const Tensor& out_grad, const std::vector<int64_t>& axis, bool keepdim, bool reduce_all, Tensor* x_grad) {
  // No key component is preset for this API, so all three always come from
  // the highest-priority key parsed from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "logsumexp_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", " << kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "logsumexp_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "logsumexp_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out = PrepareData(out, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the prepared input shapes to the profiler supplement.
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out", {(*input_out).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("logsumexp_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("logsumexp_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the infer_meta event here so it does not span the compute phase.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const std::vector<int64_t>&, bool, bool, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("logsumexp_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out, *input_out_grad, axis, keepdim, reduce_all, kernel_out);
  // Destroy the compute event before any backend transfer below.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the phi kernel registered under "lu_grad".
//
// Steps, in order:
//   1. Derive the kernel key (backend, layout, dtype) from `x`, `out`,
//      `pivots` and `out_grad`.
//   2. Select the kernel; when the selection reports a CPU fallback, the CPU
//      device context is used instead of the one for the derived backend.
//   3. Pass the inputs through PrepareData against the kernel's input defs.
//   4. Fill the output meta with LUGradInferMeta from the prepared inputs and
//      `pivot`.
//   5. Invoke the kernel, then (CPU fallback only) hand the output to
//      TransDataBackend together with the derived backend.
//
// Parameters:
//   x, out, pivots, out_grad - input tensors, forwarded to the kernel in that
//                              order.
//   pivot                    - boolean attribute forwarded unchanged to both
//                              infer-meta and the kernel.
//   x_grad                   - destination tensor; the kernel output is
//                              obtained from it through SetKernelOutput.
//
// The profiling events are owned by std::unique_ptr so they are destroyed even
// when infer-meta or the kernel throws (the raw new/delete pairs leaked them).
PADDLE_API void lu_grad(const Tensor& x, const Tensor& out, const Tensor& pivots, const Tensor& out_grad, bool pivot, Tensor* x_grad) {
  // No key component is preset for this API, so all three always come from
  // the highest-priority key parsed from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out, pivots, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "lu_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", " << kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "lu_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "lu_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out = PrepareData(out, kernel.InputAt(1), {});
  auto input_pivots = PrepareData(pivots, kernel.InputAt(2), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(3), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the prepared input shapes to the profiler supplement.
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out", {(*input_out).dims()}},
        {"pivots", {(*input_pivots).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("lu_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("lu_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::LUGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_out), MakeMetaTensor(*input_pivots), MakeMetaTensor(*input_out_grad), pivot, &meta_out);

  // Destroy the infer_meta event here so it does not span the compute phase.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, bool, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("lu_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out, *input_pivots, *input_out_grad, pivot, kernel_out);
  // Destroy the compute event before any backend transfer below.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "lu_unpack_grad" phi kernel and writes its result through `x_grad`.
//
// Steps: derive the kernel key (backend, layout, dtype) from the tensor inputs,
// select the kernel, prepare the inputs for it, run LUUnpackGradInferMeta on
// the output's meta tensor, then invoke the kernel.
//
// Parameters:
//   x, pivots, l, u, pmat, l_grad, u_grad - input tensors, forwarded to the
//       kernel in exactly this order.
//   unpack_ludata, unpack_pivots - attributes forwarded unchanged to both the
//       infer-meta function and the kernel.
//   x_grad - output tensor; wrapped via SetKernelOutput and filled by the kernel.
//
// Kernel selection goes through SelectKernelOrThrowError, so a missing kernel
// is reported by that call rather than handled here.
PADDLE_API void lu_unpack_grad(const Tensor& x, const Tensor& pivots, const Tensor& l, const Tensor& u, const Tensor& pmat, const Tensor& l_grad, const Tensor& u_grad, bool unpack_ludata, bool unpack_pivots, Tensor* x_grad) {
  // Nothing pins any part of the kernel key for this op, so backend, layout
  // and dtype all come from the highest-priority key of the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, pivots, l, u, pmat, l_grad, u_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "lu_unpack_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "lu_unpack_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "lu_unpack_grad kernel: " << kernel;
  // When the selected kernel is a CPU fallback it must run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_pivots = PrepareData(pivots, kernel.InputAt(1), {});
  auto input_l = PrepareData(l, kernel.InputAt(2), {});
  auto input_u = PrepareData(u, kernel.InputAt(3), {});
  auto input_pmat = PrepareData(pmat, kernel.InputAt(4), {});
  auto input_l_grad = PrepareData(l_grad, kernel.InputAt(5), {});
  auto input_u_grad = PrepareData(u_grad, kernel.InputAt(6), {});

  // Report the input shapes to the profiler only when it asks for them.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"pivots", {input_pivots->dims()}},
        {"l", {input_l->dims()}},
        {"u", {input_u->dims()}},
        {"pmat", {input_pmat->dims()}},
        {"l_grad", {input_l_grad->dims()}},
        {"u_grad", {input_u_grad->dims()}}};
    platform::RecordOpInfoSupplement("lu_unpack_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler events are owned by unique_ptr so that they are closed and
  // freed even if infer-meta or the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("lu_unpack_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::LUUnpackGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_pivots), MakeMetaTensor(*input_l), MakeMetaTensor(*input_u), MakeMetaTensor(*input_pmat), MakeMetaTensor(*input_l_grad), MakeMetaTensor(*input_u_grad), unpack_ludata, unpack_pivots, &meta_out);

  // End the infer_meta event before the compute event starts.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, bool, bool, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("lu_unpack_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_pivots, *input_l, *input_u, *input_pmat, *input_l_grad, *input_u_grad, unpack_ludata, unpack_pivots, kernel_out);
  kernel_record_event.reset();

  // After a CPU fallback, hand the output to TransDataBackend together with
  // the backend that was originally requested.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "margin_cross_entropy_grad" phi kernel and writes its result
// through `logits_grad`.
//
// The kernel dtype is taken from `softmax` (ParseDataType); backend and layout
// come from the highest-priority kernel key of the tensor inputs. The inputs
// are then prepared, MarginCrossEntropyGradInferMeta is run on the output's
// meta tensor, and the kernel is invoked.
//
// Parameters:
//   logits, label, softmax, loss_grad - input tensors, forwarded to the kernel
//       in exactly this order.
//   return_softmax, ring_id, rank, nranks, margin1, margin2, margin3, scale -
//       attributes forwarded unchanged to both infer-meta and the kernel.
//   logits_grad - output tensor; wrapped via SetKernelOutput and filled by the
//       kernel.
PADDLE_API void margin_cross_entropy_grad(const Tensor& logits, const Tensor& label, const Tensor& softmax, const Tensor& loss_grad, bool return_softmax, int ring_id, int rank, int nranks, float margin1, float margin2, float margin3, float scale, Tensor* logits_grad) {
  // The dtype is pinned to `softmax`; the inputs' key is only a fallback for it.
  DataType kernel_data_type = ParseDataType(softmax);

  auto kernel_key_set = ParseKernelKeyByInputArgs(logits, label, softmax, loss_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "margin_cross_entropy_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "margin_cross_entropy_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "margin_cross_entropy_grad kernel: " << kernel;
  // When the selected kernel is a CPU fallback it must run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_logits = PrepareData(logits, kernel.InputAt(0), {});
  auto input_label = PrepareData(label, kernel.InputAt(1), {});
  auto input_softmax = PrepareData(softmax, kernel.InputAt(2), {});
  auto input_loss_grad = PrepareData(loss_grad, kernel.InputAt(3), {});

  // Report the input shapes to the profiler only when it asks for them.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"logits", {input_logits->dims()}},
        {"label", {input_label->dims()}},
        {"softmax", {input_softmax->dims()}},
        {"loss_grad", {input_loss_grad->dims()}}};
    platform::RecordOpInfoSupplement("margin_cross_entropy_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(logits_grad);

  // The profiler events are owned by unique_ptr so that they are closed and
  // freed even if infer-meta or the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("margin_cross_entropy_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::MarginCrossEntropyGradInferMeta(MakeMetaTensor(*input_logits), MakeMetaTensor(*input_label), MakeMetaTensor(*input_softmax), MakeMetaTensor(*input_loss_grad), return_softmax, ring_id, rank, nranks, margin1, margin2, margin3, scale, &meta_out);

  // End the infer_meta event before the compute event starts.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, bool, int, int, int, float, float, float, float, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("margin_cross_entropy_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_logits, *input_label, *input_softmax, *input_loss_grad, return_softmax, ring_id, rank, nranks, margin1, margin2, margin3, scale, kernel_out);
  kernel_record_event.reset();

  // After a CPU fallback, hand the output to TransDataBackend together with
  // the backend that was originally requested.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "masked_select_grad" phi kernel and writes its result through
// `x_grad`.
//
// The kernel dtype is taken from `x` (ParseDataType); backend and layout come
// from the highest-priority kernel key of the tensor inputs. The output meta
// is set by UnchangedInferMeta from the prepared `x`.
//
// Parameters:
//   x, mask, out_grad - input tensors, forwarded to the kernel in this order.
//   x_grad - output tensor; wrapped via SetKernelOutput and filled by the kernel.
PADDLE_API void masked_select_grad(const Tensor& x, const Tensor& mask, const Tensor& out_grad, Tensor* x_grad) {
  // The dtype is pinned to `x`; the inputs' key is only a fallback for it.
  DataType kernel_data_type = ParseDataType(x);

  auto kernel_key_set = ParseKernelKeyByInputArgs(x, mask, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "masked_select_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "masked_select_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "masked_select_grad kernel: " << kernel;
  // When the selected kernel is a CPU fallback it must run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_mask = PrepareData(mask, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});

  // Report the input shapes to the profiler only when it asks for them.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"mask", {input_mask->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("masked_select_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler events are owned by unique_ptr so that they are closed and
  // freed even if infer-meta or the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("masked_select_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // End the infer_meta event before the compute event starts.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("masked_select_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_mask, *input_out_grad, kernel_out);
  kernel_record_event.reset();

  // After a CPU fallback, hand the output to TransDataBackend together with
  // the backend that was originally requested.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "matmul_double_grad" phi kernel and writes its results through
// `x_grad`, `y_grad` and `grad_out_grad`.
//
// Steps: derive the kernel key (backend, layout, dtype) from the tensor inputs,
// select the kernel, prepare the inputs, run GeneralTernaryGradInferMeta on the
// outputs' meta tensors, then invoke the kernel.
//
// Parameters:
//   x, y, grad_out - required input tensors.
//   grad_x_grad, grad_y_grad - optional input tensors; passed to the kernel as
//       optionals and only contribute a profiler shape when present.
//   transpose_x, transpose_y - attributes forwarded unchanged to the kernel.
//   x_grad, y_grad, grad_out_grad - output tensors, each wrapped via
//       SetKernelOutput. Infer-meta receives nullptr for any output whose
//       wrapped kernel output is null.
PADDLE_API void matmul_double_grad(const Tensor& x, const Tensor& y, const Tensor& grad_out, const paddle::optional<Tensor>& grad_x_grad, const paddle::optional<Tensor>& grad_y_grad, bool transpose_x, bool transpose_y, Tensor* x_grad, Tensor* y_grad, Tensor* grad_out_grad) {
  // Nothing pins any part of the kernel key for this op, so backend, layout
  // and dtype all come from the highest-priority key of the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, y, grad_out, grad_x_grad, grad_y_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "matmul_double_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "matmul_double_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "matmul_double_grad kernel: " << kernel;
  // When the selected kernel is a CPU fallback it must run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_y = PrepareData(y, kernel.InputAt(1), {});
  auto input_grad_out = PrepareData(grad_out, kernel.InputAt(2), {});
  auto input_grad_x_grad = PrepareData(grad_x_grad, kernel.InputAt(3), {});
  auto input_grad_y_grad = PrepareData(grad_y_grad, kernel.InputAt(4), {});

  // Report the input shapes to the profiler only when it asks for them.
  // Optional inputs contribute an empty shape list when absent.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<phi::DDim> grad_x_grad_record_shapes;
    if (input_grad_x_grad) {
      grad_x_grad_record_shapes.push_back((*input_grad_x_grad).dims());
    }
    std::vector<phi::DDim> grad_y_grad_record_shapes;
    if (input_grad_y_grad) {
      grad_y_grad_record_shapes.push_back((*input_grad_y_grad).dims());
    }
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"y", {input_y->dims()}},
        {"grad_out", {input_grad_out->dims()}},
        {"grad_x_grad", grad_x_grad_record_shapes},
        {"grad_y_grad", grad_y_grad_record_shapes}};
    platform::RecordOpInfoSupplement("matmul_double_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(y_grad);
  auto kernel_out_2 = SetKernelOutput(grad_out_grad);

  // The profiler events are owned by unique_ptr so that they are closed and
  // freed even if infer-meta or the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("matmul_double_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);
  phi::MetaTensor meta_out_2(kernel_out_2);

  phi::GeneralTernaryGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_y), MakeMetaTensor(*input_grad_out), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr, kernel_out_2 ? &meta_out_2 : nullptr);

  // End the infer_meta event before the compute event starts.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const paddle::optional<phi::DenseTensor>&, const paddle::optional<phi::DenseTensor>&, bool, bool, phi::DenseTensor*, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("matmul_double_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_y, *input_grad_out, input_grad_x_grad, input_grad_y_grad, transpose_x, transpose_y, kernel_out_0, kernel_out_1, kernel_out_2);
  kernel_record_event.reset();

  // After a CPU fallback, hand each output to TransDataBackend together with
  // the backend that was originally requested.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
    TransDataBackend(kernel_out_2, kernel_backend, kernel_out_2);
  }
}

// Dispatches the "matmul_grad" phi kernel and writes its results through
// `x_grad` and `y_grad`.
//
// Steps: derive the kernel key (backend, layout, dtype) from the tensor inputs,
// select the kernel, prepare the inputs, run GeneralBinaryGradInferMeta on the
// outputs' meta tensors, then invoke the kernel.
//
// Parameters:
//   x, y, out_grad - input tensors, forwarded to the kernel in this order.
//   transpose_x, transpose_y - attributes forwarded unchanged to the kernel.
//   x_grad, y_grad - output tensors, each wrapped via SetKernelOutput.
//       Infer-meta receives nullptr for any output whose wrapped kernel output
//       is null.
PADDLE_API void matmul_grad(const Tensor& x, const Tensor& y, const Tensor& out_grad, bool transpose_x, bool transpose_y, Tensor* x_grad, Tensor* y_grad) {
  // Nothing pins any part of the kernel key for this op, so backend, layout
  // and dtype all come from the highest-priority key of the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, y, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "matmul_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "matmul_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "matmul_grad kernel: " << kernel;
  // When the selected kernel is a CPU fallback it must run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_y = PrepareData(y, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});

  // Report the input shapes to the profiler only when it asks for them.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"y", {input_y->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("matmul_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(y_grad);

  // The profiler events are owned by unique_ptr so that they are closed and
  // freed even if infer-meta or the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("matmul_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_y), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);

  // End the infer_meta event before the compute event starts.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, bool, bool, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("matmul_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_y, *input_out_grad, transpose_x, transpose_y, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  // After a CPU fallback, hand each output to TransDataBackend together with
  // the backend that was originally requested.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// Dispatches the "matmul_triple_grad" phi kernel and writes its results through
// the five output tensors.
//
// Steps: derive the kernel key (backend, layout, dtype) from the tensor inputs,
// select the kernel, prepare the inputs, run GeneralQuinaryGradInferMeta on the
// outputs' meta tensors, then invoke the kernel.
//
// Parameters:
//   x, y, fwd_grad_out, fwd_grad_grad_x, fwd_grad_grad_y - required inputs.
//   grad_x_grad, grad_y_grad, grad_grad_out_grad - optional inputs; passed to
//       the kernel as optionals and only contribute a profiler shape when
//       present.
//   transpose_x, transpose_y - attributes forwarded unchanged to the kernel.
//   x_grad, y_grad, fwd_grad_out_grad, fwd_grad_grad_x_grad,
//   fwd_grad_grad_y_grad - output tensors, each wrapped via SetKernelOutput.
//       Infer-meta receives nullptr for any output whose wrapped kernel output
//       is null.
PADDLE_API void matmul_triple_grad(const Tensor& x, const Tensor& y, const Tensor& fwd_grad_out, const Tensor& fwd_grad_grad_x, const Tensor& fwd_grad_grad_y, const paddle::optional<Tensor>& grad_x_grad, const paddle::optional<Tensor>& grad_y_grad, const paddle::optional<Tensor>& grad_grad_out_grad, bool transpose_x, bool transpose_y, Tensor* x_grad, Tensor* y_grad, Tensor* fwd_grad_out_grad, Tensor* fwd_grad_grad_x_grad, Tensor* fwd_grad_grad_y_grad) {
  // Nothing pins any part of the kernel key for this op, so backend, layout
  // and dtype all come from the highest-priority key of the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, y, fwd_grad_out, fwd_grad_grad_x, fwd_grad_grad_y, grad_x_grad, grad_y_grad, grad_grad_out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "matmul_triple_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "matmul_triple_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "matmul_triple_grad kernel: " << kernel;
  // When the selected kernel is a CPU fallback it must run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_y = PrepareData(y, kernel.InputAt(1), {});
  auto input_fwd_grad_out = PrepareData(fwd_grad_out, kernel.InputAt(2), {});
  auto input_fwd_grad_grad_x = PrepareData(fwd_grad_grad_x, kernel.InputAt(3), {});
  auto input_fwd_grad_grad_y = PrepareData(fwd_grad_grad_y, kernel.InputAt(4), {});
  auto input_grad_x_grad = PrepareData(grad_x_grad, kernel.InputAt(5), {});
  auto input_grad_y_grad = PrepareData(grad_y_grad, kernel.InputAt(6), {});
  auto input_grad_grad_out_grad = PrepareData(grad_grad_out_grad, kernel.InputAt(7), {});

  // Report the input shapes to the profiler only when it asks for them.
  // Optional inputs contribute an empty shape list when absent.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<phi::DDim> grad_x_grad_record_shapes;
    if (input_grad_x_grad) {
      grad_x_grad_record_shapes.push_back((*input_grad_x_grad).dims());
    }
    std::vector<phi::DDim> grad_y_grad_record_shapes;
    if (input_grad_y_grad) {
      grad_y_grad_record_shapes.push_back((*input_grad_y_grad).dims());
    }
    std::vector<phi::DDim> grad_grad_out_grad_record_shapes;
    if (input_grad_grad_out_grad) {
      grad_grad_out_grad_record_shapes.push_back((*input_grad_grad_out_grad).dims());
    }
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"y", {input_y->dims()}},
        {"fwd_grad_out", {input_fwd_grad_out->dims()}},
        {"fwd_grad_grad_x", {input_fwd_grad_grad_x->dims()}},
        {"fwd_grad_grad_y", {input_fwd_grad_grad_y->dims()}},
        {"grad_x_grad", grad_x_grad_record_shapes},
        {"grad_y_grad", grad_y_grad_record_shapes},
        {"grad_grad_out_grad", grad_grad_out_grad_record_shapes}};
    platform::RecordOpInfoSupplement("matmul_triple_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(y_grad);
  auto kernel_out_2 = SetKernelOutput(fwd_grad_out_grad);
  auto kernel_out_3 = SetKernelOutput(fwd_grad_grad_x_grad);
  auto kernel_out_4 = SetKernelOutput(fwd_grad_grad_y_grad);

  // The profiler events are owned by unique_ptr so that they are closed and
  // freed even if infer-meta or the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("matmul_triple_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);
  phi::MetaTensor meta_out_2(kernel_out_2);
  phi::MetaTensor meta_out_3(kernel_out_3);
  phi::MetaTensor meta_out_4(kernel_out_4);

  phi::GeneralQuinaryGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_y), MakeMetaTensor(*input_fwd_grad_out), MakeMetaTensor(*input_fwd_grad_grad_x), MakeMetaTensor(*input_fwd_grad_grad_y), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr, kernel_out_2 ? &meta_out_2 : nullptr, kernel_out_3 ? &meta_out_3 : nullptr, kernel_out_4 ? &meta_out_4 : nullptr);

  // End the infer_meta event before the compute event starts.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const paddle::optional<phi::DenseTensor>&, const paddle::optional<phi::DenseTensor>&, const paddle::optional<phi::DenseTensor>&, bool, bool, phi::DenseTensor*, phi::DenseTensor*, phi::DenseTensor*, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("matmul_triple_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_y, *input_fwd_grad_out, *input_fwd_grad_grad_x, *input_fwd_grad_grad_y, input_grad_x_grad, input_grad_y_grad, input_grad_grad_out_grad, transpose_x, transpose_y, kernel_out_0, kernel_out_1, kernel_out_2, kernel_out_3, kernel_out_4);
  kernel_record_event.reset();

  // After a CPU fallback, hand each output to TransDataBackend together with
  // the backend that was originally requested.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
    TransDataBackend(kernel_out_2, kernel_backend, kernel_out_2);
    TransDataBackend(kernel_out_3, kernel_backend, kernel_out_3);
    TransDataBackend(kernel_out_4, kernel_backend, kernel_out_4);
  }
}

// Dispatches the "matrix_power_grad" phi kernel and writes its result through
// `x_grad`.
//
// Steps: derive the kernel key (backend, layout, dtype) from the tensor inputs,
// select the kernel, prepare the inputs, set the output meta from the prepared
// `x` via UnchangedInferMeta, then invoke the kernel.
//
// Parameters:
//   x, out, out_grad - input tensors, forwarded to the kernel in this order.
//   n - integer attribute forwarded unchanged to the kernel.
//   x_grad - output tensor; wrapped via SetKernelOutput and filled by the kernel.
PADDLE_API void matrix_power_grad(const Tensor& x, const Tensor& out, const Tensor& out_grad, int n, Tensor* x_grad) {
  // Nothing pins any part of the kernel key for this op, so backend, layout
  // and dtype all come from the highest-priority key of the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "matrix_power_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "matrix_power_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "matrix_power_grad kernel: " << kernel;
  // When the selected kernel is a CPU fallback it must run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out = PrepareData(out, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});

  // Report the input shapes to the profiler only when it asks for them.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"out", {input_out->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("matrix_power_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler events are owned by unique_ptr so that they are closed and
  // freed even if infer-meta or the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("matrix_power_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // End the infer_meta event before the compute event starts.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, int, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("matrix_power_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out, *input_out_grad, n, kernel_out);
  kernel_record_event.reset();

  // After a CPU fallback, hand the output to TransDataBackend together with
  // the backend that was originally requested.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "max_grad" phi kernel and writes its result through `x_grad`.
//
// Steps: derive the kernel key (backend, layout, dtype) from the tensor inputs,
// select the kernel, prepare the inputs, set the output meta from the prepared
// `x` via UnchangedInferMeta, then invoke the kernel.
//
// Parameters:
//   x, out, out_grad - input tensors, forwarded to the kernel in this order.
//   dims - passed to the kernel wrapped in a phi::IntArray.
//   keep_dim, reduce_all - attributes forwarded unchanged to the kernel.
//   x_grad - output tensor; wrapped via SetKernelOutput and filled by the kernel.
PADDLE_API void max_grad(const Tensor& x, const Tensor& out, const Tensor& out_grad, const IntArray& dims, bool keep_dim, bool reduce_all, Tensor* x_grad) {
  // Nothing pins any part of the kernel key for this op, so backend, layout
  // and dtype all come from the highest-priority key of the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "max_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "max_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "max_grad kernel: " << kernel;
  // When the selected kernel is a CPU fallback it must run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out = PrepareData(out, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});

  // Report the input shapes to the profiler only when it asks for them.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"out", {input_out->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("max_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler events are owned by unique_ptr so that they are closed and
  // freed even if infer-meta or the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("max_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // End the infer_meta event before the compute event starts.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::IntArray&, bool, bool, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("max_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out, *input_out_grad, phi::IntArray(dims), keep_dim, reduce_all, kernel_out);
  kernel_record_event.reset();

  // After a CPU fallback, hand the output to TransDataBackend together with
  // the backend that was originally requested.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "max_pool2d_with_index_grad" phi kernel and writes its result
// through `x_grad`.
//
// Steps: derive the kernel key (backend, layout, dtype) from the tensor inputs,
// select the kernel, prepare the inputs, run MaxPoolWithIndexGradInferMeta on
// the output's meta tensor, then invoke the kernel.
//
// Parameters:
//   x, mask, out_grad - input tensors, forwarded to the kernel in this order.
//   kernel_size, strides, paddings, global_pooling, adaptive - attributes
//       forwarded unchanged to both infer-meta and the kernel.
//   x_grad - output tensor; wrapped via SetKernelOutput and filled by the kernel.
PADDLE_API void max_pool2d_with_index_grad(const Tensor& x, const Tensor& mask, const Tensor& out_grad, const std::vector<int>& kernel_size, const std::vector<int>& strides, const std::vector<int>& paddings, bool global_pooling, bool adaptive, Tensor* x_grad) {
  // Nothing pins any part of the kernel key for this op, so backend, layout
  // and dtype all come from the highest-priority key of the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, mask, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "max_pool2d_with_index_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "max_pool2d_with_index_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "max_pool2d_with_index_grad kernel: " << kernel;
  // When the selected kernel is a CPU fallback it must run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_mask = PrepareData(mask, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});

  // Report the input shapes to the profiler only when it asks for them.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"mask", {input_mask->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("max_pool2d_with_index_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler events are owned by unique_ptr so that they are closed and
  // freed even if infer-meta or the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("max_pool2d_with_index_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::MaxPoolWithIndexGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_mask), MakeMetaTensor(*input_out_grad), kernel_size, strides, paddings, global_pooling, adaptive, &meta_out);

  // End the infer_meta event before the compute event starts.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const std::vector<int>&, const std::vector<int>&, const std::vector<int>&, bool, bool, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("max_pool2d_with_index_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_mask, *input_out_grad, kernel_size, strides, paddings, global_pooling, adaptive, kernel_out);
  kernel_record_event.reset();

  // After a CPU fallback, hand the output to TransDataBackend together with
  // the backend that was originally requested.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

/// Dispatches the `max_pool3d_with_index_grad` phi kernel.
///
/// The kernel key (backend, layout, dtype) is taken from the highest-priority
/// key parsed from `x`, `mask` and `out_grad`. The inputs are prepared for the
/// selected kernel, the output meta is filled in by
/// `phi::MaxPoolWithIndexGradInferMeta`, and the kernel is then invoked.
///
/// @param x, mask, out_grad  Tensor inputs, passed to the kernel in this order.
/// @param kernel_size, strides, paddings, global_pooling, adaptive
///        Attributes forwarded unchanged to infer-meta and to the kernel.
/// @param x_grad  Destination tensor; wrapped by `SetKernelOutput` and written
///        by the kernel.
PADDLE_API void max_pool3d_with_index_grad(const Tensor& x, const Tensor& mask, const Tensor& out_grad, const std::vector<int>& kernel_size, const std::vector<int>& strides, const std::vector<int>& paddings, bool global_pooling, bool adaptive, Tensor* x_grad) {
  // No key component is fixed for this op, so all three always come from the
  // key parsed out of the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, mask, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "max_pool3d_with_index_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "max_pool3d_with_index_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "max_pool3d_with_index_grad kernel: " << kernel;
  // When the factory fell back to a CPU kernel, use the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_mask = PrepareData(mask, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the dims of every prepared input to the profiler supplement.
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"mask", {(*input_mask).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("max_pool3d_with_index_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler event is owned by a unique_ptr so that it is destroyed even
  // if infer-meta throws (the previous raw new/delete pair leaked it).
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("max_pool3d_with_index_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::MaxPoolWithIndexGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_mask), MakeMetaTensor(*input_out_grad), kernel_size, strides, paddings, global_pooling, adaptive, &meta_out);

  // Destroy the infer-meta event before the compute event is created.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const std::vector<int>&, const std::vector<int>&, const std::vector<int>&, bool, bool, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: destroyed on every exit path.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("max_pool3d_with_index_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_mask, *input_out_grad, kernel_size, strides, paddings, global_pooling, adaptive, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran as a CPU fallback; hand the output to TransDataBackend
    // together with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

/// Dispatches the `maximum_grad` phi kernel.
///
/// The kernel key (backend, layout, dtype) is taken from the highest-priority
/// key parsed from `x`, `y` and `out_grad`. The inputs are prepared for the
/// selected kernel, output metas are filled in by
/// `phi::GeneralBinaryGradInferMeta`, and the kernel is then invoked.
///
/// @param x, y, out_grad  Tensor inputs, passed to the kernel in this order.
/// @param axis  Attribute forwarded unchanged to the kernel.
/// @param x_grad, y_grad  Destination tensors wrapped by `SetKernelOutput`.
///        A null wrapped output is passed to infer-meta as a null meta.
PADDLE_API void maximum_grad(const Tensor& x, const Tensor& y, const Tensor& out_grad, int axis, Tensor* x_grad, Tensor* y_grad) {
  // No key component is fixed for this op, so all three always come from the
  // key parsed out of the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, y, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "maximum_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "maximum_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "maximum_grad kernel: " << kernel;
  // When the factory fell back to a CPU kernel, use the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_y = PrepareData(y, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the dims of every prepared input to the profiler supplement.
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"y", {(*input_y).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("maximum_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(y_grad);

  // The profiler event is owned by a unique_ptr so that it is destroyed even
  // if infer-meta throws (the previous raw new/delete pair leaked it).
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("maximum_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  // An absent output is signalled to infer-meta with a null meta pointer.
  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_y), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);

  // Destroy the infer-meta event before the compute event is created.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, int, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: destroyed on every exit path.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("maximum_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_y, *input_out_grad, axis, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran as a CPU fallback; hand both outputs to TransDataBackend
    // together with the originally selected backend.
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

/// Dispatches the `maxout_grad` phi kernel.
///
/// The kernel key (backend, layout, dtype) is taken from the highest-priority
/// key parsed from `x`, `out` and `out_grad`. The inputs are prepared for the
/// selected kernel, the output meta is filled in by
/// `phi::GeneralUnaryGradInferMeta` from `x`, and the kernel is then invoked.
///
/// @param x, out, out_grad  Tensor inputs, passed to the kernel in this order.
/// @param groups, axis  Attributes forwarded unchanged to the kernel.
/// @param x_grad  Destination tensor; wrapped by `SetKernelOutput` and written
///        by the kernel.
PADDLE_API void maxout_grad(const Tensor& x, const Tensor& out, const Tensor& out_grad, int groups, int axis, Tensor* x_grad) {
  // No key component is fixed for this op, so all three always come from the
  // key parsed out of the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "maxout_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "maxout_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "maxout_grad kernel: " << kernel;
  // When the factory fell back to a CPU kernel, use the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out = PrepareData(out, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the dims of every prepared input to the profiler supplement.
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out", {(*input_out).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("maxout_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler event is owned by a unique_ptr so that it is destroyed even
  // if infer-meta throws (the previous raw new/delete pair leaked it).
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("maxout_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::GeneralUnaryGradInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the infer-meta event before the compute event is created.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, int, int, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: destroyed on every exit path.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("maxout_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out, *input_out_grad, groups, axis, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran as a CPU fallback; hand the output to TransDataBackend
    // together with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

/// Dispatches the `mean_all_grad` phi kernel.
///
/// The kernel key (backend, layout, dtype) is taken from the highest-priority
/// key parsed from `x` and `out_grad`. The inputs are prepared for the selected
/// kernel, the output meta is filled in by `phi::UnchangedInferMeta` from `x`,
/// and the kernel is then invoked.
///
/// @param x, out_grad  Tensor inputs, passed to the kernel in this order.
/// @param x_grad  Destination tensor; wrapped by `SetKernelOutput` and written
///        by the kernel.
PADDLE_API void mean_all_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {
  // No key component is fixed for this op, so all three always come from the
  // key parsed out of the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "mean_all_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "mean_all_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "mean_all_grad kernel: " << kernel;
  // When the factory fell back to a CPU kernel, use the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the dims of every prepared input to the profiler supplement.
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("mean_all_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler event is owned by a unique_ptr so that it is destroyed even
  // if infer-meta throws (the previous raw new/delete pair leaked it).
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("mean_all_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the infer-meta event before the compute event is created.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: destroyed on every exit path.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("mean_all_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran as a CPU fallback; hand the output to TransDataBackend
    // together with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

/// Dispatches the `mean_grad` phi kernel.
///
/// The kernel key (backend, layout, dtype) is taken from the highest-priority
/// key parsed from `x` and `out_grad`. The inputs are prepared for the selected
/// kernel, the output meta is filled in by `phi::UnchangedInferMeta` from `x`,
/// and the kernel is then invoked.
///
/// @param x, out_grad  Tensor inputs, passed to the kernel in this order.
/// @param dims  Passed to the kernel as a `phi::IntArray` built from it.
/// @param keep_dim, reduce_all  Attributes forwarded unchanged to the kernel.
/// @param x_grad  Destination tensor; wrapped by `SetKernelOutput` and written
///        by the kernel.
PADDLE_API void mean_grad(const Tensor& x, const Tensor& out_grad, const IntArray& dims, bool keep_dim, bool reduce_all, Tensor* x_grad) {
  // No key component is fixed for this op, so all three always come from the
  // key parsed out of the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "mean_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "mean_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "mean_grad kernel: " << kernel;
  // When the factory fell back to a CPU kernel, use the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the dims of every prepared input to the profiler supplement.
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("mean_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler event is owned by a unique_ptr so that it is destroyed even
  // if infer-meta throws (the previous raw new/delete pair leaked it).
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("mean_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the infer-meta event before the compute event is created.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::IntArray&, bool, bool, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: destroyed on every exit path.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("mean_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, phi::IntArray(dims), keep_dim, reduce_all, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran as a CPU fallback; hand the output to TransDataBackend
    // together with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

/// Dispatches the `meshgrid_grad` phi kernel.
///
/// The kernel key (backend, layout, dtype) is taken from the highest-priority
/// key parsed from `inputs` and `outputs_grad`. Both tensor lists are prepared
/// for the selected kernel and exposed to it as vectors of pointers, output
/// metas are filled in by `phi::MeshgridGradInferMeta`, and the kernel is then
/// invoked.
///
/// @param inputs, outputs_grad  Tensor lists, passed to the kernel in this
///        order.
/// @param inputs_grad  Destination tensors; the list is wrapped by
///        `SetKernelOutput`. A null wrapped entry is passed to infer-meta as a
///        null meta.
PADDLE_API void meshgrid_grad(const std::vector<Tensor>& inputs, const std::vector<Tensor>& outputs_grad, std::vector<Tensor*> inputs_grad) {
  // No key component is fixed for this op, so all three always come from the
  // key parsed out of the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(inputs, outputs_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "meshgrid_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "meshgrid_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "meshgrid_grad kernel: " << kernel;
  // When the factory fell back to a CPU kernel, use the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  // The kernel takes vectors of pointers, so build pointer views over the
  // prepared tensor vectors (which keep owning the tensors).
  auto input_inputs_vec = PrepareData(inputs, kernel.InputAt(0), {});
  std::vector<const phi::DenseTensor*> input_inputs(input_inputs_vec->size());
  for (size_t i = 0; i < input_inputs.size(); ++i) {
    input_inputs[i] = &input_inputs_vec->at(i);
  }
  auto input_outputs_grad_vec = PrepareData(outputs_grad, kernel.InputAt(1), {});
  std::vector<const phi::DenseTensor*> input_outputs_grad(input_outputs_grad_vec->size());
  for (size_t i = 0; i < input_outputs_grad.size(); ++i) {
    input_outputs_grad[i] = &input_outputs_grad_vec->at(i);
  }

  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the dims of every prepared input to the profiler supplement;
    // `ddims_vec` is a scratch buffer reused for both tensor lists.
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes;
    std::vector<phi::DDim> ddims_vec;
    ddims_vec.reserve(input_inputs.size());
    for (size_t i = 0; i < input_inputs.size(); ++i) {
      ddims_vec.emplace_back((*input_inputs[i]).dims());
    }
    input_shapes.emplace_back("inputs", ddims_vec);
    ddims_vec.clear();
    ddims_vec.reserve(input_outputs_grad.size());
    for (size_t i = 0; i < input_outputs_grad.size(); ++i) {
      ddims_vec.emplace_back((*input_outputs_grad[i]).dims());
    }
    input_shapes.emplace_back("outputs_grad", ddims_vec);
    platform::RecordOpInfoSupplement("meshgrid_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(&inputs_grad);

  // The profiler event is owned by a unique_ptr so that it is destroyed even
  // if infer-meta throws (the previous raw new/delete pair leaked it).
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("meshgrid_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }

  // Infer-meta also works on pointer vectors; the *_meta_vec objects own the
  // MetaTensors the pointers refer to.
  auto inputs_meta_vec = MakeMetaTensor(input_inputs);
  std::vector<const phi::MetaTensor*> inputs_metas(inputs_meta_vec.size());
  for (size_t i = 0; i < inputs_meta_vec.size(); ++i) {
    inputs_metas[i] = &inputs_meta_vec[i];
  }

  auto outputs_grad_meta_vec = MakeMetaTensor(input_outputs_grad);
  std::vector<const phi::MetaTensor*> outputs_grad_metas(outputs_grad_meta_vec.size());
  for (size_t i = 0; i < outputs_grad_meta_vec.size(); ++i) {
    outputs_grad_metas[i] = &outputs_grad_meta_vec[i];
  }

  auto kernel_out_meta_vec = MakeMetaTensor(kernel_out);
  std::vector<phi::MetaTensor*> kernel_out_metas(kernel_out_meta_vec.size());
  for (size_t i = 0; i < kernel_out_meta_vec.size(); ++i) {
    // An absent output is signalled to infer-meta with a null meta pointer.
    kernel_out_metas[i] = kernel_out[i] ? &kernel_out_meta_vec[i] : nullptr;
  }
  phi::MeshgridGradInferMeta(inputs_metas, outputs_grad_metas, kernel_out_metas);

  // Destroy the infer-meta event before the compute event is created.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const std::vector<const phi::DenseTensor*>&, const std::vector<const phi::DenseTensor*>&, std::vector<phi::DenseTensor*>&);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: destroyed on every exit path.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("meshgrid_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, input_inputs, input_outputs_grad, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran as a CPU fallback; hand the outputs to TransDataBackend
    // together with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

/// Dispatches the `min_grad` phi kernel.
///
/// The kernel key (backend, layout, dtype) is taken from the highest-priority
/// key parsed from `x`, `out` and `out_grad`. The inputs are prepared for the
/// selected kernel, the output meta is filled in by `phi::UnchangedInferMeta`
/// from `x`, and the kernel is then invoked.
///
/// @param x, out, out_grad  Tensor inputs, passed to the kernel in this order.
/// @param dims  Passed to the kernel as a `phi::IntArray` built from it.
/// @param keep_dim, reduce_all  Attributes forwarded unchanged to the kernel.
/// @param x_grad  Destination tensor; wrapped by `SetKernelOutput` and written
///        by the kernel.
PADDLE_API void min_grad(const Tensor& x, const Tensor& out, const Tensor& out_grad, const IntArray& dims, bool keep_dim, bool reduce_all, Tensor* x_grad) {
  // No key component is fixed for this op, so all three always come from the
  // key parsed out of the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "min_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "min_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "min_grad kernel: " << kernel;
  // When the factory fell back to a CPU kernel, use the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out = PrepareData(out, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the dims of every prepared input to the profiler supplement.
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out", {(*input_out).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("min_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler event is owned by a unique_ptr so that it is destroyed even
  // if infer-meta throws (the previous raw new/delete pair leaked it).
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("min_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the infer-meta event before the compute event is created.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::IntArray&, bool, bool, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: destroyed on every exit path.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("min_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out, *input_out_grad, phi::IntArray(dims), keep_dim, reduce_all, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran as a CPU fallback; hand the output to TransDataBackend
    // together with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

/// Dispatches the `minimum_grad` phi kernel.
///
/// The kernel key (backend, layout, dtype) is taken from the highest-priority
/// key parsed from `x`, `y` and `out_grad`. The inputs are prepared for the
/// selected kernel, output metas are filled in by
/// `phi::GeneralBinaryGradInferMeta`, and the kernel is then invoked.
///
/// @param x, y, out_grad  Tensor inputs, passed to the kernel in this order.
/// @param axis  Attribute forwarded unchanged to the kernel.
/// @param x_grad, y_grad  Destination tensors wrapped by `SetKernelOutput`.
///        A null wrapped output is passed to infer-meta as a null meta.
PADDLE_API void minimum_grad(const Tensor& x, const Tensor& y, const Tensor& out_grad, int axis, Tensor* x_grad, Tensor* y_grad) {
  // No key component is fixed for this op, so all three always come from the
  // key parsed out of the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, y, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "minimum_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "minimum_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "minimum_grad kernel: " << kernel;
  // When the factory fell back to a CPU kernel, use the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_y = PrepareData(y, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the dims of every prepared input to the profiler supplement.
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"y", {(*input_y).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("minimum_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(y_grad);

  // The profiler event is owned by a unique_ptr so that it is destroyed even
  // if infer-meta throws (the previous raw new/delete pair leaked it).
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("minimum_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  // An absent output is signalled to infer-meta with a null meta pointer.
  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_y), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);

  // Destroy the infer-meta event before the compute event is created.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, int, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: destroyed on every exit path.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("minimum_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_y, *input_out_grad, axis, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran as a CPU fallback; hand both outputs to TransDataBackend
    // together with the originally selected backend.
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

/// Dispatches the `mish_grad` phi kernel.
///
/// The kernel key (backend, layout, dtype) is taken from the highest-priority
/// key parsed from `x` and `out_grad`. The inputs are prepared for the selected
/// kernel, the output meta is filled in by `phi::UnchangedInferMeta` from `x`,
/// and the kernel is then invoked.
///
/// @param x, out_grad  Tensor inputs, passed to the kernel in this order.
/// @param threshold  Attribute forwarded unchanged to the kernel.
/// @param x_grad  Destination tensor; wrapped by `SetKernelOutput` and written
///        by the kernel.
PADDLE_API void mish_grad(const Tensor& x, const Tensor& out_grad, float threshold, Tensor* x_grad) {
  // No key component is fixed for this op, so all three always come from the
  // key parsed out of the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "mish_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "mish_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "mish_grad kernel: " << kernel;
  // When the factory fell back to a CPU kernel, use the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the dims of every prepared input to the profiler supplement.
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("mish_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler event is owned by a unique_ptr so that it is destroyed even
  // if infer-meta throws (the previous raw new/delete pair leaked it).
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("mish_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the infer-meta event before the compute event is created.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, float, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: destroyed on every exit path.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("mish_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, threshold, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran as a CPU fallback; hand the output to TransDataBackend
    // together with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API entry point that dispatches the "mode_grad" phi kernel.
//
// Params:
//   x, indices, out_grad - input tensors; they determine the kernel key and
//                          are forwarded (after PrepareData) to the kernel.
//   axis, keepdim        - attributes forwarded unchanged to the kernel.
//   x_grad               - output tensor; its meta is inferred from `x` via
//                          phi::UnchangedInferMeta before the kernel runs.
//
// SelectKernelOrThrowError may throw when no matching kernel is registered.
// The profiler events are owned by std::unique_ptr so they are destroyed even
// if infer-meta or the kernel throws.
PADDLE_API void mode_grad(const Tensor& x, const Tensor& indices, const Tensor& out_grad, int axis, bool keepdim, Tensor* x_grad) {
  // Nothing is preset for this API, so backend, layout and dtype all come
  // from the highest-priority kernel key derived from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, indices, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "mode_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "mode_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "mode_grad kernel: " << kernel;
  // Use the CPU device context when kernel selection reported a CPU fallback.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_indices = PrepareData(indices, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  // Report input shapes to the profiler only when supplement tracing is on.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"indices", {(*input_indices).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("mode_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("mode_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the infer_meta event before the compute event is created.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, int, bool, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("mode_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_indices, *input_out_grad, axis, keepdim, kernel_out);
  // Destroy the compute event right after the kernel returns.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // NOTE(review): presumably moves the CPU-computed output back to
    // `kernel_backend` - confirm against TransDataBackend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API entry point that dispatches the "multi_dot_grad" phi kernel.
//
// Params:
//   x        - list of input tensors; prepared and passed to the kernel as a
//              vector of pointers.
//   out_grad - input tensor forwarded (after PrepareData) to the kernel.
//   x_grad   - list of output tensors; entries whose kernel output is null
//              get a null MetaTensor pointer during infer-meta.
//
// SelectKernelOrThrowError may throw when no matching kernel is registered.
// The profiler events are owned by std::unique_ptr so they are destroyed even
// if infer-meta or the kernel throws.
PADDLE_API void multi_dot_grad(const std::vector<Tensor>& x, const Tensor& out_grad, std::vector<Tensor*> x_grad) {
  // Nothing is preset for this API, so backend, layout and dtype all come
  // from the highest-priority kernel key derived from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "multi_dot_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "multi_dot_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "multi_dot_grad kernel: " << kernel;
  // Use the CPU device context when kernel selection reported a CPU fallback.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x_vec = PrepareData(x, kernel.InputAt(0), {});
  // The kernel consumes the list as pointers into the prepared vector, which
  // must therefore stay alive until the kernel call has finished.
  std::vector<const phi::DenseTensor*> input_x(input_x_vec->size());
  for (size_t i = 0; i < input_x.size(); ++i) {
    input_x[i] = &input_x_vec->at(i);
  }
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  // Report input shapes to the profiler only when supplement tracing is on.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out_grad", {(*input_out_grad).dims()}}};
    std::vector<phi::DDim> ddims_vec;
    ddims_vec.reserve(input_x.size());
    for (const phi::DenseTensor* tensor : input_x) {
      ddims_vec.emplace_back(tensor->dims());
    }
    input_shapes.emplace_back("x", ddims_vec);
    platform::RecordOpInfoSupplement("multi_dot_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(&x_grad);
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("multi_dot_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }

  // InferMeta takes pointer lists, so keep the MetaTensor objects in vectors
  // and hand out their addresses.
  auto x_meta_vec = MakeMetaTensor(input_x);
  std::vector<const phi::MetaTensor*> x_metas(x_meta_vec.size());
  for (size_t i = 0; i < x_meta_vec.size(); ++i) {
    x_metas[i] = &x_meta_vec[i];
  }

  auto kernel_out_meta_vec = MakeMetaTensor(kernel_out);
  std::vector<phi::MetaTensor*> kernel_out_metas(kernel_out_meta_vec.size());
  for (size_t i = 0; i < kernel_out_meta_vec.size(); ++i) {
    // A null kernel output is reported to InferMeta as a null meta pointer.
    kernel_out_metas[i] = kernel_out[i] ? &kernel_out_meta_vec[i] : nullptr;
  }
  phi::MultiDotGradInferMeta(x_metas, MakeMetaTensor(*input_out_grad), kernel_out_metas);

  // Destroy the infer_meta event before the compute event is created.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const std::vector<const phi::DenseTensor*>&, const phi::DenseTensor&, std::vector<phi::DenseTensor*>&);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("multi_dot_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, input_x, *input_out_grad, kernel_out);
  // Destroy the compute event right after the kernel returns.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // NOTE(review): presumably moves the CPU-computed outputs back to
    // `kernel_backend` - confirm against TransDataBackend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API entry point that dispatches the "multiplex_grad" phi kernel.
//
// Params:
//   ins      - list of forward inputs; used here only to derive the kernel
//              key, it is not passed to infer-meta or to the kernel.
//   ids      - input tensor forwarded (after PrepareData) to the kernel.
//   out_grad - input tensor forwarded (after PrepareData) to the kernel.
//   ins_grad - list of output tensors; entries whose kernel output is null
//              get a null MetaTensor pointer during infer-meta.
//
// SelectKernelOrThrowError may throw when no matching kernel is registered.
// The profiler events are owned by std::unique_ptr so they are destroyed even
// if infer-meta or the kernel throws.
PADDLE_API void multiplex_grad(const std::vector<Tensor>& ins, const Tensor& ids, const Tensor& out_grad, std::vector<Tensor*> ins_grad) {
  // Nothing is preset for this API, so backend, layout and dtype all come
  // from the highest-priority kernel key derived from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(ins, ids, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "multiplex_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "multiplex_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "multiplex_grad kernel: " << kernel;
  // Use the CPU device context when kernel selection reported a CPU fallback.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_ids = PrepareData(ids, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  // Report input shapes to the profiler only when supplement tracing is on.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"ids", {(*input_ids).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("multiplex_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(&ins_grad);
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("multiplex_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }

  // InferMeta takes a pointer list, so keep the MetaTensor objects in a
  // vector and hand out their addresses.
  auto kernel_out_meta_vec = MakeMetaTensor(kernel_out);
  std::vector<phi::MetaTensor*> kernel_out_metas(kernel_out_meta_vec.size());
  for (size_t i = 0; i < kernel_out_meta_vec.size(); ++i) {
    // A null kernel output is reported to InferMeta as a null meta pointer.
    kernel_out_metas[i] = kernel_out[i] ? &kernel_out_meta_vec[i] : nullptr;
  }
  phi::MultiplexGradInferMeta(MakeMetaTensor(*input_ids), MakeMetaTensor(*input_out_grad), kernel_out_metas);

  // Destroy the infer_meta event before the compute event is created.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, std::vector<phi::DenseTensor*>&);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("multiplex_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_ids, *input_out_grad, kernel_out);
  // Destroy the compute event right after the kernel returns.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // NOTE(review): presumably moves the CPU-computed outputs back to
    // `kernel_backend` - confirm against TransDataBackend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API entry point that dispatches the "multiply_double_grad" phi
// kernel.
//
// Params:
//   x, y, grad_out           - required input tensors.
//   grad_x_grad, grad_y_grad - optional input tensors; passed to the kernel
//                              as optionals and recorded in the profiler
//                              shape list only when present.
//   axis                     - attribute forwarded unchanged to the kernel.
//   x_grad, y_grad, grad_out_grad
//                            - output tensors; an output whose kernel output
//                              is null is passed to infer-meta as nullptr.
//
// SelectKernelOrThrowError may throw when no matching kernel is registered.
// The profiler events are owned by std::unique_ptr so they are destroyed even
// if infer-meta or the kernel throws.
PADDLE_API void multiply_double_grad(const Tensor& x, const Tensor& y, const Tensor& grad_out, const paddle::optional<Tensor>& grad_x_grad, const paddle::optional<Tensor>& grad_y_grad, int axis, Tensor* x_grad, Tensor* y_grad, Tensor* grad_out_grad) {
  // Nothing is preset for this API, so backend, layout and dtype all come
  // from the highest-priority kernel key derived from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, y, grad_out, grad_x_grad, grad_y_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "multiply_double_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "multiply_double_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "multiply_double_grad kernel: " << kernel;
  // Use the CPU device context when kernel selection reported a CPU fallback.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_y = PrepareData(y, kernel.InputAt(1), {});
  auto input_grad_out = PrepareData(grad_out, kernel.InputAt(2), {});
  auto input_grad_x_grad = PrepareData(grad_x_grad, kernel.InputAt(3), {});
  auto input_grad_y_grad = PrepareData(grad_y_grad, kernel.InputAt(4), {});
  // Report input shapes to the profiler only when supplement tracing is on.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Optional inputs contribute an empty shape list when absent.
    std::vector<phi::DDim> grad_x_grad_record_shapes;
    if (input_grad_x_grad) {
      grad_x_grad_record_shapes.push_back((*input_grad_x_grad).dims());
    }
    std::vector<phi::DDim> grad_y_grad_record_shapes;
    if (input_grad_y_grad) {
      grad_y_grad_record_shapes.push_back((*input_grad_y_grad).dims());
    }
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"y", {(*input_y).dims()}},
        {"grad_out", {(*input_grad_out).dims()}},
        {"grad_x_grad", grad_x_grad_record_shapes},
        {"grad_y_grad", grad_y_grad_record_shapes}};
    platform::RecordOpInfoSupplement("multiply_double_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(y_grad);
  auto kernel_out_2 = SetKernelOutput(grad_out_grad);
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("multiply_double_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);
  phi::MetaTensor meta_out_2(kernel_out_2);

  // Outputs with a null kernel output are passed to InferMeta as nullptr.
  phi::GeneralTernaryGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_y), MakeMetaTensor(*input_grad_out), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr, kernel_out_2 ? &meta_out_2 : nullptr);

  // Destroy the infer_meta event before the compute event is created.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const paddle::optional<phi::DenseTensor>&, const paddle::optional<phi::DenseTensor>&, int, phi::DenseTensor*, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("multiply_double_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_y, *input_grad_out, input_grad_x_grad, input_grad_y_grad, axis, kernel_out_0, kernel_out_1, kernel_out_2);
  // Destroy the compute event right after the kernel returns.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // NOTE(review): presumably moves the CPU-computed outputs back to
    // `kernel_backend` - confirm against TransDataBackend.
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
    TransDataBackend(kernel_out_2, kernel_backend, kernel_out_2);
  }
}

// Backward API entry point that dispatches the "multiply_grad" phi kernel.
//
// Params:
//   x, y, out_grad - input tensors; they determine the kernel key and are
//                    forwarded (after PrepareData) to the kernel.
//   axis           - attribute forwarded unchanged to the kernel.
//   x_grad, y_grad - output tensors; an output whose kernel output is null
//                    is passed to infer-meta as nullptr.
//
// SelectKernelOrThrowError may throw when no matching kernel is registered.
// The profiler events are owned by std::unique_ptr so they are destroyed even
// if infer-meta or the kernel throws.
PADDLE_API void multiply_grad(const Tensor& x, const Tensor& y, const Tensor& out_grad, int axis, Tensor* x_grad, Tensor* y_grad) {
  // Nothing is preset for this API, so backend, layout and dtype all come
  // from the highest-priority kernel key derived from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, y, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "multiply_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "multiply_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "multiply_grad kernel: " << kernel;
  // Use the CPU device context when kernel selection reported a CPU fallback.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_y = PrepareData(y, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  // Report input shapes to the profiler only when supplement tracing is on.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"y", {(*input_y).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("multiply_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(y_grad);
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("multiply_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  // Outputs with a null kernel output are passed to InferMeta as nullptr.
  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_y), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);

  // Destroy the infer_meta event before the compute event is created.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, int, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("multiply_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_y, *input_out_grad, axis, kernel_out_0, kernel_out_1);
  // Destroy the compute event right after the kernel returns.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // NOTE(review): presumably moves the CPU-computed outputs back to
    // `kernel_backend` - confirm against TransDataBackend.
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// Backward API entry point that dispatches the "multiply_triple_grad" phi
// kernel.
//
// Params:
//   x, y, fwd_grad_out, grad_x_grad, grad_y_grad
//       - required input tensors.
//   fwd_grad_grad_x, fwd_grad_grad_y, grad_grad_out_grad
//       - optional input tensors; passed to the kernel as optionals and
//         recorded in the profiler shape list only when present.
//   axis
//       - attribute forwarded unchanged to the kernel.
//   x_grad, y_grad, fwd_grad_out_grad, fwd_grad_grad_x_grad,
//   fwd_grad_grad_y_grad
//       - output tensors; an output whose kernel output is null is passed to
//         infer-meta as nullptr.
//
// SelectKernelOrThrowError may throw when no matching kernel is registered.
// The profiler events are owned by std::unique_ptr so they are destroyed even
// if infer-meta or the kernel throws.
PADDLE_API void multiply_triple_grad(const Tensor& x, const Tensor& y, const Tensor& fwd_grad_out, const paddle::optional<Tensor>& fwd_grad_grad_x, const paddle::optional<Tensor>& fwd_grad_grad_y, const Tensor& grad_x_grad, const Tensor& grad_y_grad, const paddle::optional<Tensor>& grad_grad_out_grad, int axis, Tensor* x_grad, Tensor* y_grad, Tensor* fwd_grad_out_grad, Tensor* fwd_grad_grad_x_grad, Tensor* fwd_grad_grad_y_grad) {
  // Nothing is preset for this API, so backend, layout and dtype all come
  // from the highest-priority kernel key derived from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, y, fwd_grad_out, fwd_grad_grad_x, fwd_grad_grad_y, grad_x_grad, grad_y_grad, grad_grad_out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "multiply_triple_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "multiply_triple_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "multiply_triple_grad kernel: " << kernel;
  // Use the CPU device context when kernel selection reported a CPU fallback.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_y = PrepareData(y, kernel.InputAt(1), {});
  auto input_fwd_grad_out = PrepareData(fwd_grad_out, kernel.InputAt(2), {});
  auto input_fwd_grad_grad_x = PrepareData(fwd_grad_grad_x, kernel.InputAt(3), {});
  auto input_fwd_grad_grad_y = PrepareData(fwd_grad_grad_y, kernel.InputAt(4), {});
  auto input_grad_x_grad = PrepareData(grad_x_grad, kernel.InputAt(5), {});
  auto input_grad_y_grad = PrepareData(grad_y_grad, kernel.InputAt(6), {});
  auto input_grad_grad_out_grad = PrepareData(grad_grad_out_grad, kernel.InputAt(7), {});
  // Report input shapes to the profiler only when supplement tracing is on.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Optional inputs contribute an empty shape list when absent.
    std::vector<phi::DDim> fwd_grad_grad_x_record_shapes;
    if (input_fwd_grad_grad_x) {
      fwd_grad_grad_x_record_shapes.push_back((*input_fwd_grad_grad_x).dims());
    }
    std::vector<phi::DDim> fwd_grad_grad_y_record_shapes;
    if (input_fwd_grad_grad_y) {
      fwd_grad_grad_y_record_shapes.push_back((*input_fwd_grad_grad_y).dims());
    }
    std::vector<phi::DDim> grad_grad_out_grad_record_shapes;
    if (input_grad_grad_out_grad) {
      grad_grad_out_grad_record_shapes.push_back((*input_grad_grad_out_grad).dims());
    }
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"y", {(*input_y).dims()}},
        {"fwd_grad_out", {(*input_fwd_grad_out).dims()}},
        {"fwd_grad_grad_x", fwd_grad_grad_x_record_shapes},
        {"fwd_grad_grad_y", fwd_grad_grad_y_record_shapes},
        {"grad_x_grad", {(*input_grad_x_grad).dims()}},
        {"grad_y_grad", {(*input_grad_y_grad).dims()}},
        {"grad_grad_out_grad", grad_grad_out_grad_record_shapes}};
    platform::RecordOpInfoSupplement("multiply_triple_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(y_grad);
  auto kernel_out_2 = SetKernelOutput(fwd_grad_out_grad);
  auto kernel_out_3 = SetKernelOutput(fwd_grad_grad_x_grad);
  auto kernel_out_4 = SetKernelOutput(fwd_grad_grad_y_grad);
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("multiply_triple_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);
  phi::MetaTensor meta_out_2(kernel_out_2);
  phi::MetaTensor meta_out_3(kernel_out_3);
  phi::MetaTensor meta_out_4(kernel_out_4);

  // The 4th and 5th meta inputs reuse x and y rather than the optional
  // fwd_grad_grad_x / fwd_grad_grad_y inputs. Outputs with a null kernel
  // output are passed to InferMeta as nullptr.
  phi::GeneralQuinaryGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_y), MakeMetaTensor(*input_fwd_grad_out), MakeMetaTensor(*input_x), MakeMetaTensor(*input_y), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr, kernel_out_2 ? &meta_out_2 : nullptr, kernel_out_3 ? &meta_out_3 : nullptr, kernel_out_4 ? &meta_out_4 : nullptr);

  // Destroy the infer_meta event before the compute event is created.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const paddle::optional<phi::DenseTensor>&, const paddle::optional<phi::DenseTensor>&, const phi::DenseTensor&, const phi::DenseTensor&, const paddle::optional<phi::DenseTensor>&, int, phi::DenseTensor*, phi::DenseTensor*, phi::DenseTensor*, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("multiply_triple_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_y, *input_fwd_grad_out, input_fwd_grad_grad_x, input_fwd_grad_grad_y, *input_grad_x_grad, *input_grad_y_grad, input_grad_grad_out_grad, axis, kernel_out_0, kernel_out_1, kernel_out_2, kernel_out_3, kernel_out_4);
  // Destroy the compute event right after the kernel returns.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // NOTE(review): presumably moves the CPU-computed outputs back to
    // `kernel_backend` - confirm against TransDataBackend.
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
    TransDataBackend(kernel_out_2, kernel_backend, kernel_out_2);
    TransDataBackend(kernel_out_3, kernel_backend, kernel_out_3);
    TransDataBackend(kernel_out_4, kernel_backend, kernel_out_4);
  }
}

// Backward API entry point that dispatches the "nearest_interp_grad" phi
// kernel.
//
// Params:
//   x, output_grad - required input tensors.
//   out_size, scale_tensor
//                  - optional input tensors; passed to the kernel as
//                    optionals and recorded in the profiler shape list only
//                    when present.
//   size_tensor    - optional list of tensors; when present it is passed to
//                    the kernel as a vector of pointers.
//   data_layout, out_d, out_h, out_w, scale, interp_method, align_corners,
//   align_mode     - attributes forwarded unchanged to the kernel.
//   x_grad         - output tensor; its meta is inferred from `x` via
//                    phi::UnchangedInferMeta before the kernel runs.
//
// SelectKernelOrThrowError may throw when no matching kernel is registered.
// The profiler events are owned by std::unique_ptr so they are destroyed even
// if infer-meta or the kernel throws.
PADDLE_API void nearest_interp_grad(const Tensor& x, const paddle::optional<Tensor>& out_size, const paddle::optional<std::vector<Tensor>>& size_tensor, const paddle::optional<Tensor>& scale_tensor, const Tensor& output_grad, const std::string& data_layout, int out_d, int out_h, int out_w, const std::vector<float>& scale, const std::string& interp_method, bool align_corners, int align_mode, Tensor* x_grad) {
  // The dtype is taken from `output_grad` first; backend and layout are not
  // preset, so they always come from the highest-priority kernel key.
  DataType kernel_data_type = ParseDataType(output_grad);

  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_size, size_tensor, scale_tensor, output_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    // Fall back to the key's dtype when `output_grad` did not yield one.
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "nearest_interp_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "nearest_interp_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "nearest_interp_grad kernel: " << kernel;
  // Use the CPU device context when kernel selection reported a CPU fallback.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_size = PrepareData(out_size, kernel.InputAt(1), {});
  auto input_size_tensor_vec = PrepareData(size_tensor, kernel.InputAt(2), {});
  // The kernel consumes the optional list as pointers into the prepared
  // vector, which must therefore stay alive until the kernel call finishes.
  paddle::optional<std::vector<const phi::DenseTensor*>> input_size_tensor;
  if (input_size_tensor_vec){
    input_size_tensor = paddle::optional<std::vector<const phi::DenseTensor*>>(input_size_tensor_vec->size());
    for (size_t i = 0; i < input_size_tensor_vec->size(); ++i) {
      input_size_tensor->at(i) = &input_size_tensor_vec->at(i);
    }
  }
  auto input_scale_tensor = PrepareData(scale_tensor, kernel.InputAt(3), {});
  auto input_output_grad = PrepareData(output_grad, kernel.InputAt(4), {});
  // Report input shapes to the profiler only when supplement tracing is on.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Optional inputs contribute an empty shape list when absent.
    std::vector<phi::DDim> out_size_record_shapes;
    if (input_out_size) {
      out_size_record_shapes.push_back((*input_out_size).dims());
    }
    std::vector<phi::DDim> scale_tensor_record_shapes;
    if (input_scale_tensor) {
      scale_tensor_record_shapes.push_back((*input_scale_tensor).dims());
    }
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_size", out_size_record_shapes},
        {"scale_tensor", scale_tensor_record_shapes},
        {"output_grad", {(*input_output_grad).dims()}}};
    std::vector<phi::DDim> ddims_vec;
    if (input_size_tensor) {
      ddims_vec.reserve(input_size_tensor->size());
      for (size_t i = 0; i < input_size_tensor->size(); ++i) {
        ddims_vec.emplace_back((*input_size_tensor->at(i)).dims());
      }
    }
    input_shapes.emplace_back("size_tensor", ddims_vec);
    platform::RecordOpInfoSupplement("nearest_interp_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("nearest_interp_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the infer_meta event before the compute event is created.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const paddle::optional<phi::DenseTensor>&, const paddle::optional<std::vector<const phi::DenseTensor*>>&, const paddle::optional<phi::DenseTensor>&, const phi::DenseTensor&, const std::string&, int, int, int, const std::vector<float>&, const std::string&, bool, int, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("nearest_interp_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, input_out_size, input_size_tensor, input_scale_tensor, *input_output_grad, data_layout, out_d, out_h, out_w, scale, interp_method, align_corners, align_mode, kernel_out);
  // Destroy the compute event right after the kernel returns.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // NOTE(review): presumably moves the CPU-computed output back to
    // `kernel_backend` - confirm against TransDataBackend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API entry for nll_loss.
//
// Selects the "nll_loss_grad" phi kernel, prepares the inputs for it, runs
// NllLossGradInferMeta on the output and then invokes the kernel, writing the
// result into `input_grad`.
//
//   input, label, total_weight, out_grad : tensors forwarded to the kernel.
//   weight       : optional tensor; its shape is only recorded when present.
//   ignore_index : forwarded unchanged to infer-meta and the kernel.
//   reduction    : forwarded unchanged to infer-meta and the kernel.
//   input_grad   : output tensor handed to SetKernelOutput.
//
// The profiler events are owned by std::unique_ptr so they are destroyed even
// when infer-meta or the kernel throws; on the normal path they are released
// at exactly the same points as before.
PADDLE_API void nll_loss_grad(const Tensor& input, const Tensor& label, const paddle::optional<Tensor>& weight, const Tensor& total_weight, const Tensor& out_grad, int64_t ignore_index, const std::string& reduction, Tensor* input_grad) {
  // The kernel data type is taken from `input`; the parsed kernel key is only
  // a fallback when that yields UNDEFINED.
  const DataType parsed_data_type = ParseDataType(input);

  // Backend and layout always come from the highest-priority kernel key of
  // the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(input, label, weight, total_weight, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type =
      (parsed_data_type == DataType::UNDEFINED) ? kernel_key.dtype() : parsed_data_type;

  VLOG(6) << "nll_loss_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "nll_loss_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "nll_loss_grad kernel: " << kernel;
  // When the selection fell back to a CPU kernel, run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_input = PrepareData(input, kernel.InputAt(0), {});
  auto input_label = PrepareData(label, kernel.InputAt(1), {});
  auto input_weight = PrepareData(weight, kernel.InputAt(2), {});
  auto input_total_weight = PrepareData(total_weight, kernel.InputAt(3), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(4), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // `weight` is optional: record an empty shape list when it is absent.
    std::vector<phi::DDim> weight_record_shapes;
    if (input_weight) {
      weight_record_shapes.push_back((*input_weight).dims());
    }
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"input", {(*input_input).dims()}},
        {"label", {(*input_label).dims()}},
        {"weight", weight_record_shapes},
        {"total_weight", {(*input_total_weight).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("nll_loss_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(input_grad);
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("nll_loss_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::NllLossGradInferMeta(MakeMetaTensor(*input_input), MakeMetaTensor(*input_label), MakeMetaTensor(input_weight), MakeMetaTensor(*input_total_weight), MakeMetaTensor(*input_out_grad), ignore_index, reduction, &meta_out);

  // Destroy the infer_meta event here so it does not span the kernel launch.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const paddle::optional<phi::DenseTensor>&, const phi::DenseTensor&, const phi::DenseTensor&, int64_t, const std::string&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("nll_loss_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_input, *input_label, input_weight, *input_total_weight, *input_out_grad, ignore_index, reduction, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU context; pass the output through
    // TransDataBackend together with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API entry for norm.
//
// Selects the "norm_grad" phi kernel from the kernel key of (x, norm,
// out_grad), prepares the inputs, applies UnchangedInferMeta from `x` to the
// output and invokes the kernel, writing the result into `x_grad`.
//
//   x, norm, out_grad       : tensors forwarded to the kernel.
//   axis, epsilon, is_test  : forwarded unchanged to the kernel.
//   x_grad                  : output tensor handed to SetKernelOutput.
//
// The profiler events are owned by std::unique_ptr so they are destroyed even
// when infer-meta or the kernel throws; on the normal path they are released
// at exactly the same points as before.
PADDLE_API void norm_grad(const Tensor& x, const Tensor& norm, const Tensor& out_grad, int axis, float epsilon, bool is_test, Tensor* x_grad) {
  // Backend, layout and data type all come from the highest-priority kernel
  // key parsed from the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, norm, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "norm_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "norm_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "norm_grad kernel: " << kernel;
  // When the selection fell back to a CPU kernel, run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_norm = PrepareData(norm, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"norm", {(*input_norm).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("norm_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("norm_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the infer_meta event here so it does not span the kernel launch.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, int, float, bool, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("norm_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_norm, *input_out_grad, axis, epsilon, is_test, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU context; pass the output through
    // TransDataBackend together with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API entry for overlap_add.
//
// Selects the "overlap_add_grad" phi kernel, prepares the inputs, runs
// OverlapAddGradInferMeta on the output and invokes the kernel, writing the
// result into `x_grad`.
//
//   x, out_grad       : tensors forwarded to the kernel.
//   hop_length, axis  : forwarded unchanged to infer-meta and the kernel.
//   x_grad            : output tensor handed to SetKernelOutput.
//
// The profiler events are owned by std::unique_ptr so they are destroyed even
// when infer-meta or the kernel throws; on the normal path they are released
// at exactly the same points as before.
PADDLE_API void overlap_add_grad(const Tensor& x, const Tensor& out_grad, int hop_length, int axis, Tensor* x_grad) {
  // The kernel data type is taken from `x`; the parsed kernel key is only a
  // fallback when that yields UNDEFINED.
  const DataType parsed_data_type = ParseDataType(x);

  // Backend and layout always come from the highest-priority kernel key of
  // the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type =
      (parsed_data_type == DataType::UNDEFINED) ? kernel_key.dtype() : parsed_data_type;

  VLOG(6) << "overlap_add_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "overlap_add_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "overlap_add_grad kernel: " << kernel;
  // When the selection fell back to a CPU kernel, run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("overlap_add_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("overlap_add_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::OverlapAddGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_out_grad), hop_length, axis, &meta_out);

  // Destroy the infer_meta event here so it does not span the kernel launch.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, int, int, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("overlap_add_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, hop_length, axis, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU context; pass the output through
    // TransDataBackend together with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API entry for p_norm.
//
// Selects the "p_norm_grad" phi kernel from the kernel key of (x, out,
// out_grad), prepares the inputs, applies UnchangedInferMeta from `x` to the
// output and invokes the kernel, writing the result into `x_grad`.
//
//   x, out, out_grad                          : tensors forwarded to the kernel.
//   porder, axis, epsilon, keepdim, asvector  : forwarded unchanged to the kernel.
//   x_grad                                    : output tensor handed to SetKernelOutput.
//
// The profiler events are owned by std::unique_ptr so they are destroyed even
// when infer-meta or the kernel throws; on the normal path they are released
// at exactly the same points as before.
PADDLE_API void p_norm_grad(const Tensor& x, const Tensor& out, const Tensor& out_grad, float porder, int axis, float epsilon, bool keepdim, bool asvector, Tensor* x_grad) {
  // Backend, layout and data type all come from the highest-priority kernel
  // key parsed from the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "p_norm_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "p_norm_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "p_norm_grad kernel: " << kernel;
  // When the selection fell back to a CPU kernel, run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out = PrepareData(out, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out", {(*input_out).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("p_norm_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("p_norm_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the infer_meta event here so it does not span the kernel launch.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, float, int, float, bool, bool, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("p_norm_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out, *input_out_grad, porder, axis, epsilon, keepdim, asvector, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU context; pass the output through
    // TransDataBackend together with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Double-backward API entry for pad3d.
//
// This dispatches to the forward "pad3d" kernel (not a dedicated double-grad
// kernel): `grad_x_grad` is run through Pad3dInferMeta and the "pad3d" kernel
// with the same attributes, and the result is written into `grad_out_grad`.
//
//   grad_x_grad                              : tensor forwarded to the kernel.
//   paddings, mode, pad_value, data_format   : forwarded unchanged to infer-meta
//                                              and the kernel.
//   grad_out_grad                            : output tensor handed to SetKernelOutput.
//
// The profiler events are owned by std::unique_ptr so they are destroyed even
// when infer-meta or the kernel throws; on the normal path they are released
// at exactly the same points as before.
PADDLE_API void pad3d_double_grad(const Tensor& grad_x_grad, const IntArray& paddings, const std::string& mode, float pad_value, const std::string& data_format, Tensor* grad_out_grad) {
  // Backend, layout and data type all come from the highest-priority kernel
  // key parsed from the tensor input.
  auto kernel_key_set = ParseKernelKeyByInputArgs(grad_x_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "pad3d_double_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "pad3d", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "pad3d kernel: " << kernel;
  // When the selection fell back to a CPU kernel, run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_grad_x_grad = PrepareData(grad_x_grad, kernel.InputAt(0), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"grad_x_grad", {(*input_grad_x_grad).dims()}}};
    platform::RecordOpInfoSupplement("pad3d_double_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(grad_out_grad);
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("pad3d_double_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::Pad3dInferMeta(MakeMetaTensor(*input_grad_x_grad), paddings, mode, pad_value, data_format, &meta_out);

  // Destroy the infer_meta event here so it does not span the kernel launch.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::IntArray&, const std::string&, float, const std::string&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("pad3d_double_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_grad_x_grad, phi::IntArray(paddings), mode, pad_value, data_format, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU context; pass the output through
    // TransDataBackend together with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API entry for pad3d.
//
// Selects the "pad3d_grad" phi kernel from the kernel key of (x, out_grad),
// prepares the inputs, applies UnchangedInferMeta from `x` to the output and
// invokes the kernel, writing the result into `x_grad`.
//
//   x, out_grad                              : tensors forwarded to the kernel.
//   paddings, mode, pad_value, data_format   : forwarded unchanged to the kernel.
//   x_grad                                   : output tensor handed to SetKernelOutput.
//
// The profiler events are owned by std::unique_ptr so they are destroyed even
// when infer-meta or the kernel throws; on the normal path they are released
// at exactly the same points as before.
PADDLE_API void pad3d_grad(const Tensor& x, const Tensor& out_grad, const IntArray& paddings, const std::string& mode, float pad_value, const std::string& data_format, Tensor* x_grad) {
  // Backend, layout and data type all come from the highest-priority kernel
  // key parsed from the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "pad3d_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "pad3d_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "pad3d_grad kernel: " << kernel;
  // When the selection fell back to a CPU kernel, run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("pad3d_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("pad3d_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the infer_meta event here so it does not span the kernel launch.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::IntArray&, const std::string&, float, const std::string&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("pad3d_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, phi::IntArray(paddings), mode, pad_value, data_format, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU context; pass the output through
    // TransDataBackend together with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Double-backward API entry for pad.
//
// This dispatches to the forward "pad" kernel (not a dedicated double-grad
// kernel): `grad_x_grad` is run through PadInferMeta and the "pad" kernel with
// the same attributes, and the result is written into `grad_out_grad`.
//
//   grad_x_grad          : tensor forwarded to the kernel.
//   paddings, pad_value  : forwarded unchanged to infer-meta and the kernel.
//   grad_out_grad        : output tensor handed to SetKernelOutput.
//
// The profiler events are owned by std::unique_ptr so they are destroyed even
// when infer-meta or the kernel throws; on the normal path they are released
// at exactly the same points as before.
PADDLE_API void pad_double_grad(const Tensor& grad_x_grad, const std::vector<int>& paddings, const Scalar& pad_value, Tensor* grad_out_grad) {
  // Backend, layout and data type all come from the highest-priority kernel
  // key parsed from the tensor input.
  auto kernel_key_set = ParseKernelKeyByInputArgs(grad_x_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "pad_double_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "pad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "pad kernel: " << kernel;
  // When the selection fell back to a CPU kernel, run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_grad_x_grad = PrepareData(grad_x_grad, kernel.InputAt(0), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"grad_x_grad", {(*input_grad_x_grad).dims()}}};
    platform::RecordOpInfoSupplement("pad_double_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(grad_out_grad);
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("pad_double_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::PadInferMeta(MakeMetaTensor(*input_grad_x_grad), paddings, pad_value, &meta_out);

  // Destroy the infer_meta event here so it does not span the kernel launch.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const std::vector<int>&, const phi::Scalar&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("pad_double_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_grad_x_grad, paddings, phi::Scalar(pad_value), kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU context; pass the output through
    // TransDataBackend together with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API entry for pad.
//
// Selects the "pad_grad" phi kernel from the kernel key of (x, out_grad) and
// invokes it on `out_grad`, writing the result into `x_grad`.
//
// `x` still contributes to kernel-key selection, but it is neither prepared
// nor passed to the kernel: only `x.impl()` is used, as the source for
// UnchangedInferMeta on the output. `out_grad` is therefore kernel input 0.
//
//   x                    : used for kernel-key parsing and infer-meta only.
//   out_grad             : tensor forwarded to the kernel.
//   paddings, pad_value  : forwarded unchanged to the kernel.
//   x_grad               : output tensor handed to SetKernelOutput.
//
// The profiler events are owned by std::unique_ptr so they are destroyed even
// when infer-meta or the kernel throws; on the normal path they are released
// at exactly the same points as before.
PADDLE_API void pad_grad(const Tensor& x, const Tensor& out_grad, const std::vector<int>& paddings, const Scalar& pad_value, Tensor* x_grad) {
  // Backend, layout and data type all come from the highest-priority kernel
  // key parsed from the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "pad_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "pad_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "pad_grad kernel: " << kernel;
  // When the selection fell back to a CPU kernel, run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  // `x` is taken as-is (no PrepareData); it only feeds infer-meta below.
  auto input_x = x.impl();
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(0), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("pad_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("pad_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the infer_meta event here so it does not span the kernel launch.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const std::vector<int>&, const phi::Scalar&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("pad_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_out_grad, paddings, phi::Scalar(pad_value), kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU context; pass the output through
    // TransDataBackend together with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API entry for pixel_shuffle.
//
// Selects the "pixel_shuffle_grad" phi kernel from the kernel key of
// `out_grad`, prepares the input, runs PixelShuffleGradInferMeta on the output
// and invokes the kernel, writing the result into `x_grad`.
//
//   out_grad                     : tensor forwarded to the kernel.
//   upscale_factor, data_format  : forwarded unchanged to infer-meta and the kernel.
//   x_grad                       : output tensor handed to SetKernelOutput.
//
// The profiler events are owned by std::unique_ptr so they are destroyed even
// when infer-meta or the kernel throws; on the normal path they are released
// at exactly the same points as before.
PADDLE_API void pixel_shuffle_grad(const Tensor& out_grad, int upscale_factor, const std::string& data_format, Tensor* x_grad) {
  // Backend, layout and data type all come from the highest-priority kernel
  // key parsed from the tensor input.
  auto kernel_key_set = ParseKernelKeyByInputArgs(out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "pixel_shuffle_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "pixel_shuffle_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "pixel_shuffle_grad kernel: " << kernel;
  // When the selection fell back to a CPU kernel, run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(0), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("pixel_shuffle_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("pixel_shuffle_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::PixelShuffleGradInferMeta(MakeMetaTensor(*input_out_grad), upscale_factor, data_format, &meta_out);

  // Destroy the infer_meta event here so it does not span the kernel launch.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, int, const std::string&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("pixel_shuffle_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_out_grad, upscale_factor, data_format, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU context; pass the output through
    // TransDataBackend together with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Double-backward API entry for pool2d.
//
// Selects the "pool2d_double_grad" phi kernel from the kernel key of
// `grad_x_grad` (passing `use_gpudnn` to the kernel selection), prepares the
// input, runs Pool2DInferMeta on the output and invokes the kernel, writing
// the result into `grad_out_grad`.
//
//   grad_x_grad   : tensor forwarded to the kernel.
//   kernel_size, strides, paddings, ceil_mode, exclusive, data_format,
//   pooling_type, global_pooling, adaptive, padding_algorithm
//                 : forwarded unchanged to infer-meta and the kernel.
//   use_gpudnn    : only used as the extra argument to SelectKernelOrThrowError;
//                   it is not passed to the kernel itself.
//   grad_out_grad : output tensor handed to SetKernelOutput.
//
// The profiler events are owned by std::unique_ptr so they are destroyed even
// when infer-meta or the kernel throws; on the normal path they are released
// at exactly the same points as before.
PADDLE_API void pool2d_double_grad(const Tensor& grad_x_grad, const IntArray& kernel_size, const std::vector<int>& strides, const std::vector<int>& paddings, bool ceil_mode, bool exclusive, const std::string& data_format, const std::string& pooling_type, bool global_pooling, bool adaptive, const std::string& padding_algorithm, bool use_gpudnn, Tensor* grad_out_grad) {
  // Backend, layout and data type all come from the highest-priority kernel
  // key parsed from the tensor input.
  auto kernel_key_set = ParseKernelKeyByInputArgs(grad_x_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "pool2d_double_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "pool2d_double_grad", {kernel_backend, kernel_layout, kernel_data_type}, use_gpudnn);
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "pool2d_double_grad kernel: " << kernel;
  // When the selection fell back to a CPU kernel, run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_grad_x_grad = PrepareData(grad_x_grad, kernel.InputAt(0), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"grad_x_grad", {(*input_grad_x_grad).dims()}}};
    platform::RecordOpInfoSupplement("pool2d_double_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(grad_out_grad);
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("pool2d_double_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::Pool2DInferMeta(MakeMetaTensor(*input_grad_x_grad), kernel_size, strides, paddings, ceil_mode, exclusive, data_format, pooling_type, global_pooling, adaptive, padding_algorithm, &meta_out);

  // Destroy the infer_meta event here so it does not span the kernel launch.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::IntArray&, const std::vector<int>&, const std::vector<int>&, bool, bool, const std::string&, const std::string&, bool, bool, const std::string&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("pool2d_double_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_grad_x_grad, phi::IntArray(kernel_size), strides, paddings, ceil_mode, exclusive, data_format, pooling_type, global_pooling, adaptive, padding_algorithm, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU context; pass the output through
    // TransDataBackend together with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "pool2d_grad" phi kernel and writes its result into x_grad.
//
// Flow: resolve the kernel key (backend / layout / dtype) from the tensor
// inputs, select the kernel (use_gpudnn is forwarded to the selection call),
// prepare the inputs for that kernel, set the output meta from x via
// UnchangedInferMeta, and invoke the kernel. When the selected kernel is a
// CPU fallback, the output is handed to TransDataBackend together with the
// originally resolved backend.
PADDLE_API void pool2d_grad(const Tensor& x, const Tensor& out, const Tensor& out_grad, const IntArray& kernel_size, const std::vector<int>& strides, const std::vector<int>& paddings, bool ceil_mode, bool exclusive, const std::string& data_format, const std::string& pooling_type, bool global_pooling, bool adaptive, const std::string& padding_algorithm, bool use_gpudnn, Tensor* x_grad) {
  // Nothing pre-seeds the key in this API, so every component comes from the
  // highest-priority key of the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "pool2d_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "pool2d_grad", {kernel_backend, kernel_layout, kernel_data_type}, use_gpudnn);
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "pool2d_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out = PrepareData(out, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out", {(*input_out).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("pool2d_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  // Held by unique_ptr so the event object is destroyed even if InferMeta
  // throws; reset() below ends it at the same point the original delete did.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "pool2d_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::IntArray&, const std::vector<int>&, const std::vector<int>&, bool, bool, const std::string&, const std::string&, bool, bool, const std::string&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "pool2d_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out, *input_out_grad, phi::IntArray(kernel_size), strides, paddings, ceil_mode, exclusive, data_format, pooling_type, global_pooling, adaptive, padding_algorithm, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "pool3d_grad" phi kernel and writes its result into x_grad.
//
// Flow: resolve the kernel key (backend / layout / dtype) from the tensor
// inputs, select the kernel (use_gpudnn is forwarded to the selection call),
// prepare the inputs for that kernel, set the output meta from x via
// UnchangedInferMeta, and invoke the kernel. When the selected kernel is a
// CPU fallback, the output is handed to TransDataBackend together with the
// originally resolved backend.
PADDLE_API void pool3d_grad(const Tensor& x, const Tensor& out, const Tensor& out_grad, const std::vector<int>& kernel_size, const std::vector<int>& strides, const std::vector<int>& paddings, bool ceil_mode, bool exclusive, const std::string& data_format, const std::string& pooling_type, bool global_pooling, bool adaptive, const std::string& padding_algorithm, bool use_gpudnn, Tensor* x_grad) {
  // Nothing pre-seeds the key in this API, so every component comes from the
  // highest-priority key of the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "pool3d_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "pool3d_grad", {kernel_backend, kernel_layout, kernel_data_type}, use_gpudnn);
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "pool3d_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out = PrepareData(out, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out", {(*input_out).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("pool3d_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  // Held by unique_ptr so the event object is destroyed even if InferMeta
  // throws; reset() below ends it at the same point the original delete did.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "pool3d_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const std::vector<int>&, const std::vector<int>&, const std::vector<int>&, bool, bool, const std::string&, const std::string&, bool, bool, const std::string&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "pool3d_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out, *input_out_grad, kernel_size, strides, paddings, ceil_mode, exclusive, data_format, pooling_type, global_pooling, adaptive, padding_algorithm, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "pow_grad" phi kernel and writes its result into x_grad.
//
// Flow: resolve the kernel key (backend / layout / dtype) from x and out_grad,
// select the kernel, prepare the inputs for it, set the output meta from x via
// UnchangedInferMeta, and invoke the kernel with the scalar s converted to
// phi::Scalar. When the selected kernel is a CPU fallback, the output is
// handed to TransDataBackend together with the originally resolved backend.
PADDLE_API void pow_grad(const Tensor& x, const Tensor& out_grad, const Scalar& s, Tensor* x_grad) {
  // Nothing pre-seeds the key in this API, so every component comes from the
  // highest-priority key of the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "pow_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "pow_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "pow_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("pow_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  // Held by unique_ptr so the event object is destroyed even if InferMeta
  // throws; reset() below ends it at the same point the original delete did.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "pow_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::Scalar&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "pow_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, phi::Scalar(s), kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "prelu_grad" phi kernel and writes its results into x_grad
// and alpha_grad.
//
// Flow: resolve the kernel key (backend / layout / dtype) from the tensor
// inputs, select the kernel, prepare the inputs for it, run
// GeneralBinaryGradInferMeta with x and alpha (an output's meta is passed as
// nullptr when its kernel output pointer is null), and invoke the kernel.
// When the selected kernel is a CPU fallback, both outputs are handed to
// TransDataBackend together with the originally resolved backend.
PADDLE_API void prelu_grad(const Tensor& x, const Tensor& alpha, const Tensor& out_grad, const std::string& data_format, const std::string& mode, Tensor* x_grad, Tensor* alpha_grad) {
  // Nothing pre-seeds the key in this API, so every component comes from the
  // highest-priority key of the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, alpha, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "prelu_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "prelu_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "prelu_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_alpha = PrepareData(alpha, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"alpha", {(*input_alpha).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("prelu_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(alpha_grad);
  // Held by unique_ptr so the event object is destroyed even if InferMeta
  // throws; reset() below ends it at the same point the original delete did.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "prelu_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_alpha), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const std::string&, const std::string&, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "prelu_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_alpha, *input_out_grad, data_format, mode, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// Dispatches the "psroi_pool_grad" phi kernel and writes its result into
// x_grad.
//
// Flow: take the kernel dtype from x (falling back to the inputs' key only if
// that yields UNDEFINED), take backend and layout from the highest-priority
// key of the inputs, select the kernel, prepare the inputs (boxes_num is
// optional), run GeneralUnaryGradInferMeta with x, and invoke the kernel.
// When the selected kernel is a CPU fallback, the output is handed to
// TransDataBackend together with the originally resolved backend.
PADDLE_API void psroi_pool_grad(const Tensor& x, const Tensor& boxes, const paddle::optional<Tensor>& boxes_num, const Tensor& out_grad, int pooled_height, int pooled_width, int output_channels, float spatial_scale, Tensor* x_grad) {
  // dtype is seeded from x first; backend and layout are never pre-seeded,
  // so they always come from the inputs' key.
  DataType kernel_data_type = ParseDataType(x);

  auto kernel_key_set = ParseKernelKeyByInputArgs(x, boxes, boxes_num, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "psroi_pool_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "psroi_pool_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "psroi_pool_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_boxes = PrepareData(boxes, kernel.InputAt(1), {});
  auto input_boxes_num = PrepareData(boxes_num, kernel.InputAt(2), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(3), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // The optional input contributes a shape only when it is present.
    std::vector<phi::DDim> boxes_num_record_shapes;
    if (input_boxes_num) {
      boxes_num_record_shapes.push_back((*input_boxes_num).dims());
    }
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"boxes", {(*input_boxes).dims()}},
        {"boxes_num", boxes_num_record_shapes},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("psroi_pool_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  // Held by unique_ptr so the event object is destroyed even if InferMeta
  // throws; reset() below ends it at the same point the original delete did.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "psroi_pool_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::GeneralUnaryGradInferMeta(MakeMetaTensor(*input_x), &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const paddle::optional<phi::DenseTensor>&, const phi::DenseTensor&, int, int, int, float, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "psroi_pool_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_boxes, input_boxes_num, *input_out_grad, pooled_height, pooled_width, output_channels, spatial_scale, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "put_along_axis_grad" phi kernel and writes its results into
// x_grad and value_grad.
//
// Flow: resolve the kernel key (backend / layout / dtype) from the tensor
// inputs, select the kernel, prepare the inputs for it, run
// GeneralBinaryGradInferMeta with x and index (an output's meta is passed as
// nullptr when its kernel output pointer is null), and invoke the kernel.
// When the selected kernel is a CPU fallback, both outputs are handed to
// TransDataBackend together with the originally resolved backend.
PADDLE_API void put_along_axis_grad(const Tensor& x, const Tensor& index, const Tensor& out_grad, int axis, const std::string& reduce, Tensor* x_grad, Tensor* value_grad) {
  // Nothing pre-seeds the key in this API, so every component comes from the
  // highest-priority key of the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, index, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "put_along_axis_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "put_along_axis_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "put_along_axis_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_index = PrepareData(index, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"index", {(*input_index).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("put_along_axis_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(value_grad);
  // Held by unique_ptr so the event object is destroyed even if InferMeta
  // throws; reset() below ends it at the same point the original delete did.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "put_along_axis_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_index), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, int, const std::string&, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "put_along_axis_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_index, *input_out_grad, axis, reduce, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// Dispatches the "qr_grad" phi kernel and writes its result into x_grad.
//
// Flow: resolve the kernel key (backend / layout / dtype) from the five tensor
// inputs, select the kernel, prepare the inputs for it, set the output meta
// from x via UnchangedInferMeta, and invoke the kernel. When the selected
// kernel is a CPU fallback, the output is handed to TransDataBackend together
// with the originally resolved backend.
PADDLE_API void qr_grad(const Tensor& x, const Tensor& q, const Tensor& r, const Tensor& q_grad, const Tensor& r_grad, const std::string& mode, Tensor* x_grad) {
  // Nothing pre-seeds the key in this API, so every component comes from the
  // highest-priority key of the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, q, r, q_grad, r_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "qr_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "qr_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "qr_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_q = PrepareData(q, kernel.InputAt(1), {});
  auto input_r = PrepareData(r, kernel.InputAt(2), {});
  auto input_q_grad = PrepareData(q_grad, kernel.InputAt(3), {});
  auto input_r_grad = PrepareData(r_grad, kernel.InputAt(4), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"q", {(*input_q).dims()}},
        {"r", {(*input_r).dims()}},
        {"q_grad", {(*input_q_grad).dims()}},
        {"r_grad", {(*input_r_grad).dims()}}};
    platform::RecordOpInfoSupplement("qr_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  // Held by unique_ptr so the event object is destroyed even if InferMeta
  // throws; reset() below ends it at the same point the original delete did.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "qr_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const std::string&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "qr_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_q, *input_r, *input_q_grad, *input_r_grad, mode, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Public entry point for real_grad. Unlike the neighbouring generated
// wrappers it performs no kernel selection here: both arguments are forwarded
// unchanged to real_grad_impl (presumably declared in api_custom_impl.h,
// which this file includes — confirm).
PADDLE_API void real_grad(const Tensor& out_grad, Tensor* x_grad) {
  real_grad_impl(out_grad, x_grad);
}
// Dispatches the "reciprocal_grad" phi kernel and writes its result into
// x_grad.
//
// Flow: resolve the kernel key (backend / layout / dtype) from out and
// out_grad, select the kernel, prepare the inputs for it, set the output meta
// from out via UnchangedInferMeta, and invoke the kernel. When the selected
// kernel is a CPU fallback, the output is handed to TransDataBackend together
// with the originally resolved backend.
PADDLE_API void reciprocal_grad(const Tensor& out, const Tensor& out_grad, Tensor* x_grad) {
  // Nothing pre-seeds the key in this API, so every component comes from the
  // highest-priority key of the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "reciprocal_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "reciprocal_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "reciprocal_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out = PrepareData(out, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out", {(*input_out).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("reciprocal_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  // Held by unique_ptr so the event object is destroyed even if InferMeta
  // throws; reset() below ends it at the same point the original delete did.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "reciprocal_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_out), &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "reciprocal_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_out, *input_out_grad, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the phi kernel registered as "prod_grad" (the API-level name
// used in logs and profiler events is "reduce_prod_grad") and writes its
// result into x_grad.
//
// Flow: resolve the kernel key (backend / layout / dtype) from the tensor
// inputs, select the kernel, prepare the inputs for it, set the output meta
// from x via UnchangedInferMeta, and invoke the kernel with dims converted to
// phi::IntArray. When the selected kernel is a CPU fallback, the output is
// handed to TransDataBackend together with the originally resolved backend.
PADDLE_API void reduce_prod_grad(const Tensor& x, const Tensor& out, const Tensor& out_grad, const IntArray& dims, bool keep_dim, bool reduce_all, Tensor* x_grad) {
  // Nothing pre-seeds the key in this API, so every component comes from the
  // highest-priority key of the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "reduce_prod_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "prod_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "prod_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out = PrepareData(out, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out", {(*input_out).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("reduce_prod_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  // Held by unique_ptr so the event object is destroyed even if InferMeta
  // throws; reset() below ends it at the same point the original delete did.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "reduce_prod_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::IntArray&, bool, bool, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "reduce_prod_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out, *input_out_grad, phi::IntArray(dims), keep_dim, reduce_all, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "relu6_grad" phi kernel and writes its result into x_grad.
//
// Flow: resolve the kernel key (backend / layout / dtype) from out and
// out_grad, select the kernel, prepare the inputs for it, set the output meta
// from out via UnchangedInferMeta, and invoke the kernel with threshold
// passed through. When the selected kernel is a CPU fallback, the output is
// handed to TransDataBackend together with the originally resolved backend.
PADDLE_API void relu6_grad(const Tensor& out, const Tensor& out_grad, float threshold, Tensor* x_grad) {
  // Nothing pre-seeds the key in this API, so every component comes from the
  // highest-priority key of the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "relu6_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "relu6_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "relu6_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out = PrepareData(out, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out", {(*input_out).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("relu6_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  // Held by unique_ptr so the event object is destroyed even if InferMeta
  // throws; reset() below ends it at the same point the original delete did.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "relu6_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_out), &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, float, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "relu6_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_out, *input_out_grad, threshold, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatch wrapper for the "relu_double_grad" phi kernel.
//
// Flow: derive the kernel key (backend / layout / dtype) from `out` and
// `grad_x_grad`, select the kernel, prepare both inputs, set the metadata of
// `grad_out_grad` from `out` via phi::UnchangedInferMeta, then invoke the
// kernel, which writes its result into `grad_out_grad`.
//
// The profiler events are owned by std::unique_ptr so that they are deleted
// even when an exception propagates out of infer-meta or the kernel call.
PADDLE_API void relu_double_grad(const Tensor& out, const Tensor& grad_x_grad, Tensor* grad_out_grad) {
  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  // Any part of the kernel key that is still undefined is taken from the
  // highest-priority key parsed out of the tensor inputs.
  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(out, grad_x_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "relu_double_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "relu_double_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "relu_double_grad kernel: " << kernel;
  // When the selection fell back to CPU, run on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out = PrepareData(out, kernel.InputAt(0), {});
  auto input_grad_x_grad = PrepareData(grad_x_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out", {(*input_out).dims()}},
        {"grad_x_grad", {(*input_grad_x_grad).dims()}}};
    platform::RecordOpInfoSupplement("relu_double_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(grad_out_grad);

  // --- infer meta (optionally traced) ---
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("relu_double_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_out), &meta_out);

  // Delete the event right after infer-meta, as the original code did.
  infer_shape_record_event.reset();

  // --- kernel invocation (optionally traced) ---
  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("relu_double_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_out, *input_grad_x_grad, kernel_out);
  kernel_record_event.reset();

  // After a CPU fallback, hand the output to TransDataBackend together with
  // the backend that was originally requested.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatch wrapper for the "relu_grad" phi kernel.
//
// Flow: derive the kernel key (backend / layout / dtype) from `out` and
// `out_grad`, select the kernel, prepare both inputs, set the metadata of
// `x_grad` from `out` via phi::UnchangedInferMeta, then invoke the kernel,
// which writes its result into `x_grad`.
//
// The profiler events are owned by std::unique_ptr so that they are deleted
// even when an exception propagates out of infer-meta or the kernel call.
PADDLE_API void relu_grad(const Tensor& out, const Tensor& out_grad, Tensor* x_grad) {
  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  // Any part of the kernel key that is still undefined is taken from the
  // highest-priority key parsed out of the tensor inputs.
  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(out, out_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "relu_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "relu_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "relu_grad kernel: " << kernel;
  // When the selection fell back to CPU, run on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out = PrepareData(out, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out", {(*input_out).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("relu_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // --- infer meta (optionally traced) ---
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("relu_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_out), &meta_out);

  // Delete the event right after infer-meta, as the original code did.
  infer_shape_record_event.reset();

  // --- kernel invocation (optionally traced) ---
  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("relu_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_out, *input_out_grad, kernel_out);
  kernel_record_event.reset();

  // After a CPU fallback, hand the output to TransDataBackend together with
  // the backend that was originally requested.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatch wrapper for the "renorm_grad" phi kernel.
//
// Flow: derive the kernel key (backend / layout / dtype) from `x` and
// `out_grad`, select the kernel, prepare both inputs, set the metadata of
// `x_grad` from `out_grad` via phi::UnchangedInferMeta, then invoke the
// kernel with the attributes `p`, `axis` and `max_norm`; the kernel writes
// its result into `x_grad`.
//
// The profiler events are owned by std::unique_ptr so that they are deleted
// even when an exception propagates out of infer-meta or the kernel call.
PADDLE_API void renorm_grad(const Tensor& x, const Tensor& out_grad, float p, int axis, float max_norm, Tensor* x_grad) {
  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  // Any part of the kernel key that is still undefined is taken from the
  // highest-priority key parsed out of the tensor inputs.
  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "renorm_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "renorm_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "renorm_grad kernel: " << kernel;
  // When the selection fell back to CPU, run on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("renorm_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // --- infer meta (optionally traced) ---
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("renorm_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  // Note: the output metadata is derived from `out_grad`, not from `x`.
  phi::UnchangedInferMeta(MakeMetaTensor(*input_out_grad), &meta_out);

  // Delete the event right after infer-meta, as the original code did.
  infer_shape_record_event.reset();

  // --- kernel invocation (optionally traced) ---
  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, float, int, float, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("renorm_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, p, axis, max_norm, kernel_out);
  kernel_record_event.reset();

  // After a CPU fallback, hand the output to TransDataBackend together with
  // the backend that was originally requested.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatch wrapper for the "repeat_interleave_grad" phi kernel.
//
// Flow: derive the kernel key (backend / layout / dtype) from `x` and
// `out_grad`, select the kernel, prepare both inputs, set the metadata of
// `x_grad` from `x` via phi::UnchangedInferMeta, then invoke the kernel with
// the attributes `repeats` and `dim`; the kernel writes its result into
// `x_grad`.
//
// The profiler events are owned by std::unique_ptr so that they are deleted
// even when an exception propagates out of infer-meta or the kernel call.
PADDLE_API void repeat_interleave_grad(const Tensor& x, const Tensor& out_grad, int repeats, int dim, Tensor* x_grad) {
  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  // Any part of the kernel key that is still undefined is taken from the
  // highest-priority key parsed out of the tensor inputs.
  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "repeat_interleave_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "repeat_interleave_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "repeat_interleave_grad kernel: " << kernel;
  // When the selection fell back to CPU, run on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("repeat_interleave_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // --- infer meta (optionally traced) ---
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("repeat_interleave_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Delete the event right after infer-meta, as the original code did.
  infer_shape_record_event.reset();

  // --- kernel invocation (optionally traced) ---
  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, int, int, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("repeat_interleave_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, repeats, dim, kernel_out);
  kernel_record_event.reset();

  // After a CPU fallback, hand the output to TransDataBackend together with
  // the backend that was originally requested.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatch wrapper for the "repeat_interleave_with_tensor_index_grad" phi
// kernel.
//
// Flow: take the kernel dtype from `x`, fill the remaining kernel-key fields
// from `x`, `repeats` and `out_grad`, select the kernel, prepare the three
// tensor inputs, set the metadata of `x_grad` from `x` via
// phi::UnchangedInferMeta, then invoke the kernel with the attribute `dim`;
// the kernel writes its result into `x_grad`.
//
// The profiler events are owned by std::unique_ptr so that they are deleted
// even when an exception propagates out of infer-meta or the kernel call.
PADDLE_API void repeat_interleave_with_tensor_index_grad(const Tensor& x, const Tensor& repeats, const Tensor& out_grad, int dim, Tensor* x_grad) {
  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  // The dtype is read from `x` first; it is only overridden below if
  // ParseDataType left it undefined.
  kernel_data_type = ParseDataType(x);

  // Any part of the kernel key that is still undefined is taken from the
  // highest-priority key parsed out of the tensor inputs.
  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(x, repeats, out_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "repeat_interleave_with_tensor_index_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "repeat_interleave_with_tensor_index_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "repeat_interleave_with_tensor_index_grad kernel: " << kernel;
  // When the selection fell back to CPU, run on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_repeats = PrepareData(repeats, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"repeats", {(*input_repeats).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("repeat_interleave_with_tensor_index_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // --- infer meta (optionally traced) ---
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("repeat_interleave_with_tensor_index_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Delete the event right after infer-meta, as the original code did.
  infer_shape_record_event.reset();

  // --- kernel invocation (optionally traced) ---
  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, int, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("repeat_interleave_with_tensor_index_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_repeats, *input_out_grad, dim, kernel_out);
  kernel_record_event.reset();

  // After a CPU fallback, hand the output to TransDataBackend together with
  // the backend that was originally requested.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatch wrapper for the "reshape_double_grad" phi kernel.
//
// Flow: derive the kernel key (backend / layout / dtype) from `grad_out` and
// `grad_x_grad`, select the kernel, prepare both inputs, set the metadata of
// `grad_out_grad` from `grad_out` via phi::UnchangedInferMeta, then invoke
// the kernel, which writes its result into `grad_out_grad`.
//
// The profiler events are owned by std::unique_ptr so that they are deleted
// even when an exception propagates out of infer-meta or the kernel call.
PADDLE_API void reshape_double_grad(const Tensor& grad_out, const Tensor& grad_x_grad, Tensor* grad_out_grad) {
  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  // Any part of the kernel key that is still undefined is taken from the
  // highest-priority key parsed out of the tensor inputs.
  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(grad_out, grad_x_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "reshape_double_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "reshape_double_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "reshape_double_grad kernel: " << kernel;
  // When the selection fell back to CPU, run on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_grad_out = PrepareData(grad_out, kernel.InputAt(0), {});
  auto input_grad_x_grad = PrepareData(grad_x_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"grad_out", {(*input_grad_out).dims()}},
        {"grad_x_grad", {(*input_grad_x_grad).dims()}}};
    platform::RecordOpInfoSupplement("reshape_double_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(grad_out_grad);

  // --- infer meta (optionally traced) ---
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("reshape_double_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_grad_out), &meta_out);

  // Delete the event right after infer-meta, as the original code did.
  infer_shape_record_event.reset();

  // --- kernel invocation (optionally traced) ---
  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("reshape_double_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_grad_out, *input_grad_x_grad, kernel_out);
  kernel_record_event.reset();

  // After a CPU fallback, hand the output to TransDataBackend together with
  // the backend that was originally requested.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatch wrapper for the "reshape_grad" phi kernel.
//
// Flow: take backend, layout and dtype from `out_grad` (falling back to the
// key parsed from `xshape` and `out_grad` for any field left undefined),
// select the kernel, prepare `out_grad`, set the metadata of `x_grad` from
// `xshape` via phi::KernelWithXShapeInferMeta, then invoke the kernel.
// `xshape` is used for infer-meta only; the kernel itself receives just
// `out_grad` and writes its result into `x_grad`.
//
// The profiler events are owned by std::unique_ptr so that they are deleted
// even when an exception propagates out of infer-meta or the kernel call.
PADDLE_API void reshape_grad(const Tensor& xshape, const Tensor& out_grad, Tensor* x_grad) {
  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  // All three kernel-key fields are read from `out_grad` first.
  kernel_backend = ParseBackend(out_grad);

  kernel_layout = ParseLayout(out_grad);

  kernel_data_type = ParseDataType(out_grad);

  // Any part of the kernel key that is still undefined is taken from the
  // highest-priority key parsed out of the tensor inputs.
  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(xshape, out_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "reshape_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "reshape_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "reshape_grad kernel: " << kernel;
  // When the selection fell back to CPU, run on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  // `xshape` is not routed through PrepareData; its impl is used directly.
  auto input_xshape = xshape.impl();
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(0), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("reshape_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // --- infer meta (optionally traced) ---
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("reshape_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::KernelWithXShapeInferMeta(MakeMetaTensor(*input_xshape), &meta_out);

  // Delete the event right after infer-meta, as the original code did.
  infer_shape_record_event.reset();

  // --- kernel invocation (optionally traced) ---
  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("reshape_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_out_grad, kernel_out);
  kernel_record_event.reset();

  // After a CPU fallback, hand the output to TransDataBackend together with
  // the backend that was originally requested.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatch wrapper for reverse_array_grad. Note that the kernel selected
// here is registered under the name "reverse", not "reverse_array_grad".
//
// Flow: derive the kernel key (backend / layout / dtype) from the `out_grad`
// tensor list, select the kernel, prepare the inputs, run
// phi::ReverseArrayInferMeta on the input/output meta lists, then invoke the
// kernel with `axis`; the kernel writes into the tensors behind `x_grad`.
//
// The profiler events are owned by std::unique_ptr so that they are deleted
// even when an exception propagates out of infer-meta or the kernel call.
PADDLE_API void reverse_array_grad(const std::vector<Tensor>& out_grad, const IntArray& axis, std::vector<Tensor*> x_grad) {
  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  // Any part of the kernel key that is still undefined is taken from the
  // highest-priority key parsed out of the tensor inputs.
  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(out_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "reverse_array_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "reverse", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "reverse kernel: " << kernel;
  // When the selection fell back to CPU, run on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  // The kernel takes a vector of pointers; build it over the prepared
  // tensors, which stay owned by input_out_grad_vec.
  auto input_out_grad_vec = PrepareData(out_grad, kernel.InputAt(0), {});
  std::vector<const phi::DenseTensor*> input_out_grad(input_out_grad_vec->size());
  for (size_t i = 0; i < input_out_grad.size(); ++i) {
    input_out_grad[i] = &input_out_grad_vec->at(i);
  }
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes;
    std::vector<phi::DDim> ddims_vec;
    ddims_vec.clear();
    ddims_vec.reserve(input_out_grad.size());
    for (size_t i = 0; i < input_out_grad.size(); ++i) {
      ddims_vec.emplace_back((*input_out_grad[i]).dims());
    }
    input_shapes.emplace_back("out_grad", ddims_vec);
    platform::RecordOpInfoSupplement("reverse_array_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(&x_grad);

  // --- infer meta (optionally traced) ---
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("reverse_array_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }

  auto out_grad_meta_vec = MakeMetaTensor(input_out_grad);
  std::vector<const phi::MetaTensor*> out_grad_metas(out_grad_meta_vec.size());
  for (size_t i = 0; i < out_grad_meta_vec.size(); ++i) {
    out_grad_metas[i] = &out_grad_meta_vec[i];
  }

  // Outputs whose kernel_out entry is null get a null meta pointer as well.
  auto kernel_out_meta_vec = MakeMetaTensor(kernel_out);
  std::vector<phi::MetaTensor*> kernel_out_metas(kernel_out_meta_vec.size());
  for (size_t i = 0; i < kernel_out_meta_vec.size(); ++i) {
    kernel_out_metas[i] = kernel_out[i] ? &kernel_out_meta_vec[i] : nullptr;
  }
  phi::ReverseArrayInferMeta(out_grad_metas, axis, kernel_out_metas);

  // Delete the event right after infer-meta, as the original code did.
  infer_shape_record_event.reset();

  // --- kernel invocation (optionally traced) ---
  using kernel_signature = void(*)(const platform::DeviceContext&, const std::vector<const phi::DenseTensor*>&, const phi::IntArray&, std::vector<phi::DenseTensor*>&);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("reverse_array_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, input_out_grad, phi::IntArray(axis), kernel_out);
  kernel_record_event.reset();

  // After a CPU fallback, hand the outputs to TransDataBackend together with
  // the backend that was originally requested.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatch wrapper for the "rnn_grad" phi kernel.
//
// Flow: take the kernel dtype from `out_grad`, fill the remaining kernel-key
// fields from all tensor inputs, select the kernel, prepare every input
// (tensor lists are turned into vectors of pointers), run
// phi::RnnGradInferMeta using `x`, `pre_state` and `weight_list`, then invoke
// the kernel. The kernel writes into `x_grad`, `pre_state_grad` and
// `weight_list_grad`.
//
// `sequence_length` is optional; its shape is only recorded for profiling
// when the prepared value is present.
//
// The profiler events are owned by std::unique_ptr so that they are deleted
// even when an exception propagates out of infer-meta or the kernel call.
PADDLE_API void rnn_grad(const Tensor& x, const std::vector<Tensor>& pre_state, const std::vector<Tensor>& weight_list, const paddle::optional<Tensor>& sequence_length, const Tensor& out, const Tensor& dropout_state_out, const Tensor& reserve, const Tensor& out_grad, const std::vector<Tensor>& state_grad, float dropout_prob, bool is_bidirec, int input_size, int hidden_size, int num_layers, const std::string& mode, int seed, bool is_test, Tensor* x_grad, std::vector<Tensor*> pre_state_grad, std::vector<Tensor*> weight_list_grad) {
  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  // The dtype is read from `out_grad` first; it is only overridden below if
  // ParseDataType left it undefined.
  kernel_data_type = ParseDataType(out_grad);

  // Any part of the kernel key that is still undefined is taken from the
  // highest-priority key parsed out of the tensor inputs.
  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(x, pre_state, weight_list, sequence_length, out, dropout_state_out, reserve, out_grad, state_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "rnn_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "rnn_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "rnn_grad kernel: " << kernel;
  // When the selection fell back to CPU, run on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  // --- input preparation ---
  // For list inputs the kernel takes vectors of pointers; they point into
  // the corresponding *_vec results, which keep ownership.
  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_pre_state_vec = PrepareData(pre_state, kernel.InputAt(1), {});
  std::vector<const phi::DenseTensor*> input_pre_state(input_pre_state_vec->size());
  for (size_t i = 0; i < input_pre_state.size(); ++i) {
    input_pre_state[i] = &input_pre_state_vec->at(i);
  }
  auto input_weight_list_vec = PrepareData(weight_list, kernel.InputAt(2), {});
  std::vector<const phi::DenseTensor*> input_weight_list(input_weight_list_vec->size());
  for (size_t i = 0; i < input_weight_list.size(); ++i) {
    input_weight_list[i] = &input_weight_list_vec->at(i);
  }
  auto input_sequence_length = PrepareData(sequence_length, kernel.InputAt(3), {});
  auto input_out = PrepareData(out, kernel.InputAt(4), {});
  auto input_dropout_state_out = PrepareData(dropout_state_out, kernel.InputAt(5), {});
  auto input_reserve = PrepareData(reserve, kernel.InputAt(6), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(7), {});
  auto input_state_grad_vec = PrepareData(state_grad, kernel.InputAt(8), {});
  std::vector<const phi::DenseTensor*> input_state_grad(input_state_grad_vec->size());
  for (size_t i = 0; i < input_state_grad.size(); ++i) {
    input_state_grad[i] = &input_state_grad_vec->at(i);
  }

  // --- optional profiling of input shapes ---
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<phi::DDim> sequence_length_record_shapes;
    if (input_sequence_length) {
      sequence_length_record_shapes.push_back((*input_sequence_length).dims());
    }
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"sequence_length", sequence_length_record_shapes},
        {"out", {(*input_out).dims()}},
        {"dropout_state_out", {(*input_dropout_state_out).dims()}},
        {"reserve", {(*input_reserve).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    // ddims_vec is a scratch buffer reused for each tensor-list input.
    std::vector<phi::DDim> ddims_vec;
    ddims_vec.clear();
    ddims_vec.reserve(input_pre_state.size());
    for (size_t i = 0; i < input_pre_state.size(); ++i) {
      ddims_vec.emplace_back((*input_pre_state[i]).dims());
    }
    input_shapes.emplace_back("pre_state", ddims_vec);
    ddims_vec.clear();
    ddims_vec.reserve(input_weight_list.size());
    for (size_t i = 0; i < input_weight_list.size(); ++i) {
      ddims_vec.emplace_back((*input_weight_list[i]).dims());
    }
    input_shapes.emplace_back("weight_list", ddims_vec);
    ddims_vec.clear();
    ddims_vec.reserve(input_state_grad.size());
    for (size_t i = 0; i < input_state_grad.size(); ++i) {
      ddims_vec.emplace_back((*input_state_grad[i]).dims());
    }
    input_shapes.emplace_back("state_grad", ddims_vec);
    platform::RecordOpInfoSupplement("rnn_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(&pre_state_grad);
  auto kernel_out_2 = SetKernelOutput(&weight_list_grad);

  // --- infer meta (optionally traced) ---
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("rnn_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }

  auto pre_state_meta_vec = MakeMetaTensor(input_pre_state);
  std::vector<const phi::MetaTensor*> pre_state_metas(pre_state_meta_vec.size());
  for (size_t i = 0; i < pre_state_meta_vec.size(); ++i) {
    pre_state_metas[i] = &pre_state_meta_vec[i];
  }

  auto weight_list_meta_vec = MakeMetaTensor(input_weight_list);
  std::vector<const phi::MetaTensor*> weight_list_metas(weight_list_meta_vec.size());
  for (size_t i = 0; i < weight_list_meta_vec.size(); ++i) {
    weight_list_metas[i] = &weight_list_meta_vec[i];
  }
  phi::MetaTensor meta_out_0(kernel_out_0);

  // Outputs whose kernel_out entry is null get a null meta pointer as well.
  auto kernel_out_1_meta_vec = MakeMetaTensor(kernel_out_1);
  std::vector<phi::MetaTensor*> kernel_out_1_metas(kernel_out_1_meta_vec.size());
  for (size_t i = 0; i < kernel_out_1_meta_vec.size(); ++i) {
    kernel_out_1_metas[i] = kernel_out_1[i] ? &kernel_out_1_meta_vec[i] : nullptr;
  }
  auto kernel_out_2_meta_vec = MakeMetaTensor(kernel_out_2);
  std::vector<phi::MetaTensor*> kernel_out_2_metas(kernel_out_2_meta_vec.size());
  for (size_t i = 0; i < kernel_out_2_meta_vec.size(); ++i) {
    kernel_out_2_metas[i] = kernel_out_2[i] ? &kernel_out_2_meta_vec[i] : nullptr;
  }
  phi::RnnGradInferMeta(MakeMetaTensor(*input_x), pre_state_metas, weight_list_metas, kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1_metas, kernel_out_2_metas);

  // Delete the event right after infer-meta, as the original code did.
  infer_shape_record_event.reset();

  // --- kernel invocation (optionally traced) ---
  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const std::vector<const phi::DenseTensor*>&, const std::vector<const phi::DenseTensor*>&, const paddle::optional<phi::DenseTensor>&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const std::vector<const phi::DenseTensor*>&, float, bool, int, int, int, const std::string&, int, bool, phi::DenseTensor*, std::vector<phi::DenseTensor*>&, std::vector<phi::DenseTensor*>&);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("rnn_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, input_pre_state, input_weight_list, input_sequence_length, *input_out, *input_dropout_state_out, *input_reserve, *input_out_grad, input_state_grad, dropout_prob, is_bidirec, input_size, hidden_size, num_layers, mode, seed, is_test, kernel_out_0, kernel_out_1, kernel_out_2);
  kernel_record_event.reset();

  // After a CPU fallback, hand every output to TransDataBackend together
  // with the backend that was originally requested.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
    TransDataBackend(kernel_out_2, kernel_backend, kernel_out_2);
  }
}

// Dispatches the "roi_align_grad" kernel and writes its result into `x_grad`.
//
// Flow: build the kernel key (data type from `boxes`, backend/layout from the
// highest-priority key of the tensor inputs), select the kernel, prepare the
// inputs, infer the output meta from `x`, launch the kernel and, if the
// selected kernel is a CPU fallback, transfer the output to `kernel_backend`.
//
// `boxes_num` is optional; the remaining scalar arguments are forwarded to the
// kernel unchanged.
PADDLE_API void roi_align_grad(const Tensor& x, const Tensor& boxes, const paddle::optional<Tensor>& boxes_num, const Tensor& out_grad, int pooled_height, int pooled_width, float spatial_scale, int sampling_ratio, bool aligned, Tensor* x_grad) {
  // The data type is taken from `boxes`; the parsed kernel key only supplies it
  // when `boxes` reports an undefined type. Backend and layout always come
  // from the parsed kernel key.
  DataType kernel_data_type = ParseDataType(boxes);
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, boxes, boxes_num, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "roi_align_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "roi_align_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "roi_align_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_boxes = PrepareData(boxes, kernel.InputAt(1), {});
  auto input_boxes_num = PrepareData(boxes_num, kernel.InputAt(2), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(3), {});

  // Report the input shapes to the profiler when supplement tracing is on.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // An absent optional input is recorded with an empty shape list.
    std::vector<phi::DDim> boxes_num_record_shapes;
    if (input_boxes_num) {
      boxes_num_record_shapes.push_back((*input_boxes_num).dims());
    }
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"boxes", {(*input_boxes).dims()}},
        {"boxes_num", boxes_num_record_shapes},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("roi_align_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler events are owned by unique_ptr so they are destroyed (and
  // therefore closed) even if InferMeta or the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("roi_align_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // End the infer_meta event before the compute event starts.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const paddle::optional<phi::DenseTensor>&, const phi::DenseTensor&, int, int, float, int, bool, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("roi_align_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_boxes, input_boxes_num, *input_out_grad, pooled_height, pooled_width, spatial_scale, sampling_ratio, aligned, kernel_out);
  kernel_record_event.reset();

  // The kernel ran on the CPU fallback: transfer the output to the backend
  // that was originally selected.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "roi_pool_grad" kernel and writes its result into `x_grad`.
//
// Flow: build the kernel key (data type from `x`, backend/layout from the
// highest-priority key of the tensor inputs), select the kernel, prepare the
// inputs, infer the output meta from `x`, launch the kernel and, if the
// selected kernel is a CPU fallback, transfer the output to `kernel_backend`.
//
// `boxes_num` is optional; the remaining scalar arguments are forwarded to the
// kernel unchanged.
PADDLE_API void roi_pool_grad(const Tensor& x, const Tensor& boxes, const paddle::optional<Tensor>& boxes_num, const Tensor& arg_max, const Tensor& out_grad, int pooled_height, int pooled_width, float spatial_scale, Tensor* x_grad) {
  // The data type is taken from `x`; the parsed kernel key only supplies it
  // when `x` reports an undefined type. Backend and layout always come from
  // the parsed kernel key.
  DataType kernel_data_type = ParseDataType(x);
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, boxes, boxes_num, arg_max, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "roi_pool_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "roi_pool_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "roi_pool_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_boxes = PrepareData(boxes, kernel.InputAt(1), {});
  auto input_boxes_num = PrepareData(boxes_num, kernel.InputAt(2), {});
  auto input_arg_max = PrepareData(arg_max, kernel.InputAt(3), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(4), {});

  // Report the input shapes to the profiler when supplement tracing is on.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // An absent optional input is recorded with an empty shape list.
    std::vector<phi::DDim> boxes_num_record_shapes;
    if (input_boxes_num) {
      boxes_num_record_shapes.push_back((*input_boxes_num).dims());
    }
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"boxes", {(*input_boxes).dims()}},
        {"boxes_num", boxes_num_record_shapes},
        {"arg_max", {(*input_arg_max).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("roi_pool_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler events are owned by unique_ptr so they are destroyed (and
  // therefore closed) even if InferMeta or the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("roi_pool_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // End the infer_meta event before the compute event starts.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const paddle::optional<phi::DenseTensor>&, const phi::DenseTensor&, const phi::DenseTensor&, int, int, float, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("roi_pool_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_boxes, input_boxes_num, *input_arg_max, *input_out_grad, pooled_height, pooled_width, spatial_scale, kernel_out);
  kernel_record_event.reset();

  // The kernel ran on the CPU fallback: transfer the output to the backend
  // that was originally selected.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "roll_grad" kernel and writes its result into `x_grad`.
//
// Flow: build the kernel key (data type from `x`, backend/layout from the
// highest-priority key of the tensor inputs), select the kernel, prepare the
// inputs, infer the output meta from `x`, launch the kernel and, if the
// selected kernel is a CPU fallback, transfer the output to `kernel_backend`.
//
// `shifts` and `axis` are forwarded to the kernel.
PADDLE_API void roll_grad(const Tensor& x, const Tensor& out_grad, const IntArray& shifts, const std::vector<int64_t>& axis, Tensor* x_grad) {
  // The data type is taken from `x`; the parsed kernel key only supplies it
  // when `x` reports an undefined type. Backend and layout always come from
  // the parsed kernel key.
  DataType kernel_data_type = ParseDataType(x);
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "roll_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "roll_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "roll_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});

  // Report the input shapes to the profiler when supplement tracing is on.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("roll_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler events are owned by unique_ptr so they are destroyed (and
  // therefore closed) even if InferMeta or the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("roll_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // End the infer_meta event before the compute event starts.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::IntArray&, const std::vector<int64_t>&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("roll_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, phi::IntArray(shifts), axis, kernel_out);
  kernel_record_event.reset();

  // The kernel ran on the CPU fallback: transfer the output to the backend
  // that was originally selected.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "round_grad" kernel and writes its result into `x_grad`.
//
// Flow: take backend, layout and data type from the highest-priority kernel
// key of `out_grad`, select the kernel, prepare the input, infer the output
// meta from `out_grad`, launch the kernel and, if the selected kernel is a CPU
// fallback, transfer the output to `kernel_backend`.
PADDLE_API void round_grad(const Tensor& out_grad, Tensor* x_grad) {
  // Nothing is preset for this op, so the whole key comes from the input.
  auto kernel_key_set = ParseKernelKeyByInputArgs(out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "round_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "round_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "round_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(0), {});

  // Report the input shapes to the profiler when supplement tracing is on.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("round_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler events are owned by unique_ptr so they are destroyed (and
  // therefore closed) even if InferMeta or the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("round_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_out_grad), &meta_out);

  // End the infer_meta event before the compute event starts.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("round_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_out_grad, kernel_out);
  kernel_record_event.reset();

  // The kernel ran on the CPU fallback: transfer the output to the backend
  // that was originally selected.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "rsqrt_double_grad" kernel and writes its two results into
// `out_grad` and `grad_out_grad`.
//
// Flow: take backend, layout and data type from the highest-priority kernel
// key of the inputs, select the kernel, prepare the inputs, infer both output
// metas, launch the kernel and, if the selected kernel is a CPU fallback,
// transfer the outputs to `kernel_backend`.
PADDLE_API void rsqrt_double_grad(const Tensor& out, const Tensor& grad_x, const Tensor& grad_x_grad, Tensor* out_grad, Tensor* grad_out_grad) {
  // Nothing is preset for this op, so the whole key comes from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(out, grad_x, grad_x_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "rsqrt_double_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "rsqrt_double_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "rsqrt_double_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out = PrepareData(out, kernel.InputAt(0), {});
  auto input_grad_x = PrepareData(grad_x, kernel.InputAt(1), {});
  auto input_grad_x_grad = PrepareData(grad_x_grad, kernel.InputAt(2), {});

  // Report the input shapes to the profiler when supplement tracing is on.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out", {(*input_out).dims()}},
        {"grad_x", {(*input_grad_x).dims()}},
        {"grad_x_grad", {(*input_grad_x_grad).dims()}}};
    platform::RecordOpInfoSupplement("rsqrt_double_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(out_grad);
  auto kernel_out_1 = SetKernelOutput(grad_out_grad);

  // The profiler events are owned by unique_ptr so they are destroyed (and
  // therefore closed) even if InferMeta or the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("rsqrt_double_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  // Both meta inputs are built from `out`; an output whose kernel pointer is
  // null is passed to InferMeta as nullptr.
  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_out), MakeMetaTensor(*input_out), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);

  // End the infer_meta event before the compute event starts.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("rsqrt_double_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_out, *input_grad_x, *input_grad_x_grad, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  // The kernel ran on the CPU fallback: transfer the outputs to the backend
  // that was originally selected.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// Dispatches the "rsqrt_grad" kernel and writes its result into `x_grad`.
//
// Flow: take backend, layout and data type from the highest-priority kernel
// key of the inputs, select the kernel, prepare the inputs, infer the output
// meta from `out`, launch the kernel and, if the selected kernel is a CPU
// fallback, transfer the output to `kernel_backend`.
PADDLE_API void rsqrt_grad(const Tensor& out, const Tensor& out_grad, Tensor* x_grad) {
  // Nothing is preset for this op, so the whole key comes from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "rsqrt_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "rsqrt_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "rsqrt_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out = PrepareData(out, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});

  // Report the input shapes to the profiler when supplement tracing is on.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out", {(*input_out).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("rsqrt_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler events are owned by unique_ptr so they are destroyed (and
  // therefore closed) even if InferMeta or the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("rsqrt_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_out), &meta_out);

  // End the infer_meta event before the compute event starts.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("rsqrt_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_out, *input_out_grad, kernel_out);
  kernel_record_event.reset();

  // The kernel ran on the CPU fallback: transfer the output to the backend
  // that was originally selected.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "scatter_grad" kernel and writes its two results into
// `x_grad` and `updates_grad`.
//
// Flow: take backend, layout and data type from the highest-priority kernel
// key of the inputs, select the kernel, prepare the inputs, infer both output
// metas, launch the kernel and, if the selected kernel is a CPU fallback,
// transfer the outputs to `kernel_backend`.
//
// `overwrite` is forwarded to both InferMeta and the kernel.
PADDLE_API void scatter_grad(const Tensor& index, const Tensor& updates, const Tensor& out_grad, bool overwrite, Tensor* x_grad, Tensor* updates_grad) {
  // Nothing is preset for this op, so the whole key comes from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(index, updates, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "scatter_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "scatter_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "scatter_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_index = PrepareData(index, kernel.InputAt(0), {});
  auto input_updates = PrepareData(updates, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});

  // Report the input shapes to the profiler when supplement tracing is on.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"index", {(*input_index).dims()}},
        {"updates", {(*input_updates).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("scatter_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(updates_grad);

  // The profiler events are owned by unique_ptr so they are destroyed (and
  // therefore closed) even if InferMeta or the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("scatter_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  // An output whose kernel pointer is null is passed to InferMeta as nullptr.
  phi::ScatterGradInferMeta(MakeMetaTensor(*input_index), MakeMetaTensor(*input_updates), MakeMetaTensor(*input_out_grad), overwrite, kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);

  // End the infer_meta event before the compute event starts.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, bool, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("scatter_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_index, *input_updates, *input_out_grad, overwrite, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  // The kernel ran on the CPU fallback: transfer the outputs to the backend
  // that was originally selected.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// Dispatches the "scatter_nd_add_grad" kernel and writes its two results into
// `x_grad` and `updates_grad`.
//
// Flow: take backend, layout and data type from the highest-priority kernel
// key of the inputs, select the kernel, prepare the inputs, infer both output
// metas, launch the kernel and, if the selected kernel is a CPU fallback,
// transfer the outputs to `kernel_backend`.
PADDLE_API void scatter_nd_add_grad(const Tensor& index, const Tensor& updates, const Tensor& out_grad, Tensor* x_grad, Tensor* updates_grad) {
  // Nothing is preset for this op, so the whole key comes from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(index, updates, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "scatter_nd_add_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "scatter_nd_add_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "scatter_nd_add_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_index = PrepareData(index, kernel.InputAt(0), {});
  auto input_updates = PrepareData(updates, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});

  // Report the input shapes to the profiler when supplement tracing is on.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"index", {(*input_index).dims()}},
        {"updates", {(*input_updates).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("scatter_nd_add_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(updates_grad);

  // The profiler events are owned by unique_ptr so they are destroyed (and
  // therefore closed) even if InferMeta or the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("scatter_nd_add_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  // An output whose kernel pointer is null is passed to InferMeta as nullptr.
  phi::ScatterNdAddGradInferMeta(MakeMetaTensor(*input_index), MakeMetaTensor(*input_updates), MakeMetaTensor(*input_out_grad), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);

  // End the infer_meta event before the compute event starts.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("scatter_nd_add_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_index, *input_updates, *input_out_grad, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  // The kernel ran on the CPU fallback: transfer the outputs to the backend
  // that was originally selected.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// Dispatches the "segment_pool_grad" kernel and writes its result into
// `x_grad`.
//
// Flow: build the kernel key (data type from `x`, backend/layout from the
// highest-priority key of the tensor inputs), select the kernel, prepare the
// inputs, infer the output meta from `x`, launch the kernel and, if the
// selected kernel is a CPU fallback, transfer the output to `kernel_backend`.
//
// `summed_ids` is optional; `pooltype` is forwarded to the kernel unchanged.
PADDLE_API void segment_pool_grad(const Tensor& x, const Tensor& segment_ids, const Tensor& out, const paddle::optional<Tensor>& summed_ids, const Tensor& out_grad, const std::string& pooltype, Tensor* x_grad) {
  // The data type is taken from `x`; the parsed kernel key only supplies it
  // when `x` reports an undefined type. Backend and layout always come from
  // the parsed kernel key.
  DataType kernel_data_type = ParseDataType(x);
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, segment_ids, out, summed_ids, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "segment_pool_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "segment_pool_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "segment_pool_grad kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_segment_ids = PrepareData(segment_ids, kernel.InputAt(1), {});
  auto input_out = PrepareData(out, kernel.InputAt(2), {});
  auto input_summed_ids = PrepareData(summed_ids, kernel.InputAt(3), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(4), {});

  // Report the input shapes to the profiler when supplement tracing is on.
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // An absent optional input is recorded with an empty shape list.
    std::vector<phi::DDim> summed_ids_record_shapes;
    if (input_summed_ids) {
      summed_ids_record_shapes.push_back((*input_summed_ids).dims());
    }
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"segment_ids", {(*input_segment_ids).dims()}},
        {"out", {(*input_out).dims()}},
        {"summed_ids", summed_ids_record_shapes},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("segment_pool_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler events are owned by unique_ptr so they are destroyed (and
  // therefore closed) even if InferMeta or the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("segment_pool_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // End the infer_meta event before the compute event starts.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const paddle::optional<phi::DenseTensor>&, const phi::DenseTensor&, const std::string&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("segment_pool_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_segment_ids, *input_out, input_summed_ids, *input_out_grad, pooltype, kernel_out);
  kernel_record_event.reset();

  // The kernel ran on the CPU fallback: transfer the output to the backend
  // that was originally selected.
  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

/// @brief Dispatches the "selu_grad" kernel for the given inputs.
///
/// The kernel key (backend, layout, dtype) is the highest-priority key parsed
/// from `out` and `out_grad`. Output meta is inferred with
/// phi::UnchangedInferMeta from the prepared `out` tensor.
///
/// @param out       Tensor passed to the kernel as input 0.
/// @param out_grad  Tensor passed to the kernel as input 1.
/// @param scale     Forwarded unchanged to the kernel.
/// @param alpha     Forwarded unchanged to the kernel.
/// @param x_grad    Output tensor; its kernel output handle comes from
///                  SetKernelOutput and is passed to the kernel.
PADDLE_API void selu_grad(const Tensor& out, const Tensor& out_grad, float scale, float alpha, Tensor* x_grad) {
  // Nothing is pinned for this API, so backend, layout and dtype all come
  // from the highest-priority kernel key derived from the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "selu_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "
          << kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "selu_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "selu_grad kernel: " << kernel;
  // A CPU fallback kernel must run on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out = PrepareData(out, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out", {input_out->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("selu_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Owned by unique_ptr so the profiler event is closed even if infer-meta
  // throws; reset() below ends it at the same point as before.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "selu_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_out), &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void (*)(const platform::DeviceContext&,
                                    const phi::DenseTensor&,
                                    const phi::DenseTensor&,
                                    float,
                                    float,
                                    phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "selu_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_out, *input_out_grad, scale, alpha, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran as a CPU fallback; hand the output to TransDataBackend
    // with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

/// @brief Dispatches the "sigmoid_cross_entropy_with_logits_grad" kernel.
///
/// The kernel key (backend, layout, dtype) is the highest-priority key parsed
/// from `x`, `label` and `out_grad`. Output meta is inferred with
/// phi::UnchangedInferMeta from the prepared `x` tensor.
///
/// @param x             Tensor passed to the kernel as input 0.
/// @param label         Tensor passed to the kernel as input 1.
/// @param out_grad      Tensor passed to the kernel as input 2.
/// @param normalize     Forwarded unchanged to the kernel.
/// @param ignore_index  Forwarded unchanged to the kernel.
/// @param x_grad        Output tensor; its kernel output handle comes from
///                      SetKernelOutput and is passed to the kernel.
PADDLE_API void sigmoid_cross_entropy_with_logits_grad(const Tensor& x, const Tensor& label, const Tensor& out_grad, bool normalize, int ignore_index, Tensor* x_grad) {
  // Nothing is pinned for this API, so backend, layout and dtype all come
  // from the highest-priority kernel key derived from the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, label, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "sigmoid_cross_entropy_with_logits_grad API kernel key: [" << kernel_backend << ", "
          << kernel_layout << ", " << kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "sigmoid_cross_entropy_with_logits_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "sigmoid_cross_entropy_with_logits_grad kernel: " << kernel;
  // A CPU fallback kernel must run on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_label = PrepareData(label, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"label", {input_label->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("sigmoid_cross_entropy_with_logits_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Owned by unique_ptr so the profiler event is closed even if infer-meta
  // throws; reset() below ends it at the same point as before.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "sigmoid_cross_entropy_with_logits_grad infer_meta",
        paddle::platform::TracerEventType::OperatorInner,
        1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void (*)(const platform::DeviceContext&,
                                    const phi::DenseTensor&,
                                    const phi::DenseTensor&,
                                    const phi::DenseTensor&,
                                    bool,
                                    int,
                                    phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "sigmoid_cross_entropy_with_logits_grad compute",
        paddle::platform::TracerEventType::OperatorInner,
        1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_label, *input_out_grad, normalize, ignore_index, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran as a CPU fallback; hand the output to TransDataBackend
    // with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

/// @brief Dispatches the "sigmoid_double_grad" kernel.
///
/// The kernel key (backend, layout, dtype) is the highest-priority key parsed
/// from `out`, `fwd_grad_out` and `grad_x_grad`. Output meta is inferred with
/// phi::GeneralBinaryGradInferMeta from the prepared `out` and `fwd_grad_out`
/// tensors; an output whose kernel handle is null gets a null meta pointer.
///
/// @param out                Tensor passed to the kernel as input 0.
/// @param fwd_grad_out       Tensor passed to the kernel as input 1.
/// @param grad_x_grad        Tensor passed to the kernel as input 2.
/// @param out_grad           First output tensor (kernel output 0).
/// @param fwd_grad_out_grad  Second output tensor (kernel output 1).
PADDLE_API void sigmoid_double_grad(const Tensor& out, const Tensor& fwd_grad_out, const Tensor& grad_x_grad, Tensor* out_grad, Tensor* fwd_grad_out_grad) {
  // Nothing is pinned for this API, so backend, layout and dtype all come
  // from the highest-priority kernel key derived from the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(out, fwd_grad_out, grad_x_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "sigmoid_double_grad API kernel key: [" << kernel_backend << ", " << kernel_layout
          << ", " << kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "sigmoid_double_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "sigmoid_double_grad kernel: " << kernel;
  // A CPU fallback kernel must run on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out = PrepareData(out, kernel.InputAt(0), {});
  auto input_fwd_grad_out = PrepareData(fwd_grad_out, kernel.InputAt(1), {});
  auto input_grad_x_grad = PrepareData(grad_x_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out", {input_out->dims()}},
        {"fwd_grad_out", {input_fwd_grad_out->dims()}},
        {"grad_x_grad", {input_grad_x_grad->dims()}}};
    platform::RecordOpInfoSupplement("sigmoid_double_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(out_grad);
  auto kernel_out_1 = SetKernelOutput(fwd_grad_out_grad);

  // Owned by unique_ptr so the profiler event is closed even if infer-meta
  // throws; reset() below ends it at the same point as before.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "sigmoid_double_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  // Outputs without a kernel handle are passed to infer-meta as nullptr.
  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_out),
                                  MakeMetaTensor(*input_fwd_grad_out),
                                  kernel_out_0 ? &meta_out_0 : nullptr,
                                  kernel_out_1 ? &meta_out_1 : nullptr);

  infer_shape_record_event.reset();

  using kernel_signature = void (*)(const platform::DeviceContext&,
                                    const phi::DenseTensor&,
                                    const phi::DenseTensor&,
                                    const phi::DenseTensor&,
                                    phi::DenseTensor*,
                                    phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "sigmoid_double_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_out, *input_fwd_grad_out, *input_grad_x_grad, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran as a CPU fallback; hand each output to TransDataBackend
    // with the originally selected backend.
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

/// @brief Dispatches the "sigmoid_grad" kernel for the given inputs.
///
/// The kernel key (backend, layout, dtype) is the highest-priority key parsed
/// from `out` and `out_grad`. Output meta is inferred with
/// phi::UnchangedInferMeta from the prepared `out` tensor.
///
/// @param out       Tensor passed to the kernel as input 0.
/// @param out_grad  Tensor passed to the kernel as input 1.
/// @param x_grad    Output tensor; its kernel output handle comes from
///                  SetKernelOutput and is passed to the kernel.
PADDLE_API void sigmoid_grad(const Tensor& out, const Tensor& out_grad, Tensor* x_grad) {
  // Nothing is pinned for this API, so backend, layout and dtype all come
  // from the highest-priority kernel key derived from the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "sigmoid_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "
          << kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "sigmoid_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "sigmoid_grad kernel: " << kernel;
  // A CPU fallback kernel must run on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out = PrepareData(out, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out", {input_out->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("sigmoid_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Owned by unique_ptr so the profiler event is closed even if infer-meta
  // throws; reset() below ends it at the same point as before.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "sigmoid_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_out), &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void (*)(const platform::DeviceContext&,
                                    const phi::DenseTensor&,
                                    const phi::DenseTensor&,
                                    phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "sigmoid_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_out, *input_out_grad, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran as a CPU fallback; hand the output to TransDataBackend
    // with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

/// @brief Dispatches the "sigmoid_triple_grad" kernel.
///
/// The kernel key (backend, layout, dtype) is the highest-priority key parsed
/// from all five inputs, including the optional one. Output meta is inferred
/// with phi::GeneralTernaryGradInferMeta from the prepared `out`,
/// `fwd_grad_out` and `grad_grad_x` tensors; an output whose kernel handle is
/// null gets a null meta pointer.
///
/// @param out                 Tensor passed to the kernel as input 0.
/// @param fwd_grad_out        Tensor passed to the kernel as input 1.
/// @param grad_grad_x         Tensor passed to the kernel as input 2.
/// @param grad_out_grad       Tensor passed to the kernel as input 3.
/// @param grad_grad_out_grad  Optional tensor passed to the kernel as input 4.
/// @param out_grad            First output tensor (kernel output 0).
/// @param fwd_grad_out_grad   Second output tensor (kernel output 1).
/// @param grad_grad_x_grad    Third output tensor (kernel output 2).
PADDLE_API void sigmoid_triple_grad(const Tensor& out, const Tensor& fwd_grad_out, const Tensor& grad_grad_x, const Tensor& grad_out_grad, const paddle::optional<Tensor>& grad_grad_out_grad, Tensor* out_grad, Tensor* fwd_grad_out_grad, Tensor* grad_grad_x_grad) {
  // Nothing is pinned for this API, so backend, layout and dtype all come
  // from the highest-priority kernel key derived from the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(
      out, fwd_grad_out, grad_grad_x, grad_out_grad, grad_grad_out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "sigmoid_triple_grad API kernel key: [" << kernel_backend << ", " << kernel_layout
          << ", " << kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "sigmoid_triple_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "sigmoid_triple_grad kernel: " << kernel;
  // A CPU fallback kernel must run on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out = PrepareData(out, kernel.InputAt(0), {});
  auto input_fwd_grad_out = PrepareData(fwd_grad_out, kernel.InputAt(1), {});
  auto input_grad_grad_x = PrepareData(grad_grad_x, kernel.InputAt(2), {});
  auto input_grad_out_grad = PrepareData(grad_out_grad, kernel.InputAt(3), {});
  auto input_grad_grad_out_grad = PrepareData(grad_grad_out_grad, kernel.InputAt(4), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // The optional input contributes a shape only when it is present.
    std::vector<phi::DDim> grad_grad_out_grad_record_shapes;
    if (input_grad_grad_out_grad) {
      grad_grad_out_grad_record_shapes.push_back((*input_grad_grad_out_grad).dims());
    }
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out", {input_out->dims()}},
        {"fwd_grad_out", {input_fwd_grad_out->dims()}},
        {"grad_grad_x", {input_grad_grad_x->dims()}},
        {"grad_out_grad", {input_grad_out_grad->dims()}},
        {"grad_grad_out_grad", grad_grad_out_grad_record_shapes}};
    platform::RecordOpInfoSupplement("sigmoid_triple_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(out_grad);
  auto kernel_out_1 = SetKernelOutput(fwd_grad_out_grad);
  auto kernel_out_2 = SetKernelOutput(grad_grad_x_grad);

  // Owned by unique_ptr so the profiler event is closed even if infer-meta
  // throws; reset() below ends it at the same point as before.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "sigmoid_triple_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);
  phi::MetaTensor meta_out_2(kernel_out_2);

  // Outputs without a kernel handle are passed to infer-meta as nullptr.
  phi::GeneralTernaryGradInferMeta(MakeMetaTensor(*input_out),
                                   MakeMetaTensor(*input_fwd_grad_out),
                                   MakeMetaTensor(*input_grad_grad_x),
                                   kernel_out_0 ? &meta_out_0 : nullptr,
                                   kernel_out_1 ? &meta_out_1 : nullptr,
                                   kernel_out_2 ? &meta_out_2 : nullptr);

  infer_shape_record_event.reset();

  using kernel_signature = void (*)(const platform::DeviceContext&,
                                    const phi::DenseTensor&,
                                    const phi::DenseTensor&,
                                    const phi::DenseTensor&,
                                    const phi::DenseTensor&,
                                    const paddle::optional<phi::DenseTensor>&,
                                    phi::DenseTensor*,
                                    phi::DenseTensor*,
                                    phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "sigmoid_triple_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_out, *input_fwd_grad_out, *input_grad_grad_x, *input_grad_out_grad, input_grad_grad_out_grad, kernel_out_0, kernel_out_1, kernel_out_2);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran as a CPU fallback; hand each output to TransDataBackend
    // with the originally selected backend.
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
    TransDataBackend(kernel_out_2, kernel_backend, kernel_out_2);
  }
}

/// @brief Dispatches the "silu_grad" kernel for the given inputs.
///
/// The kernel key (backend, layout, dtype) is the highest-priority key parsed
/// from `x` and `out_grad`. Output meta is inferred with
/// phi::UnchangedInferMeta from the prepared `x` tensor.
///
/// @param x         Tensor passed to the kernel as input 0.
/// @param out_grad  Tensor passed to the kernel as input 1.
/// @param x_grad    Output tensor; its kernel output handle comes from
///                  SetKernelOutput and is passed to the kernel.
PADDLE_API void silu_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {
  // Nothing is pinned for this API, so backend, layout and dtype all come
  // from the highest-priority kernel key derived from the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "silu_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "
          << kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "silu_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "silu_grad kernel: " << kernel;
  // A CPU fallback kernel must run on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("silu_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Owned by unique_ptr so the profiler event is closed even if infer-meta
  // throws; reset() below ends it at the same point as before.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "silu_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void (*)(const platform::DeviceContext&,
                                    const phi::DenseTensor&,
                                    const phi::DenseTensor&,
                                    phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "silu_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran as a CPU fallback; hand the output to TransDataBackend
    // with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

/// @brief Dispatches the "sin_grad" kernel for the given inputs.
///
/// The kernel key (backend, layout, dtype) is the highest-priority key parsed
/// from `x` and `out_grad`. Output meta is inferred with
/// phi::UnchangedInferMeta from the prepared `x` tensor.
///
/// @param x         Tensor passed to the kernel as input 0.
/// @param out_grad  Tensor passed to the kernel as input 1.
/// @param x_grad    Output tensor; its kernel output handle comes from
///                  SetKernelOutput and is passed to the kernel.
PADDLE_API void sin_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {
  // Nothing is pinned for this API, so backend, layout and dtype all come
  // from the highest-priority kernel key derived from the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "sin_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "
          << kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "sin_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "sin_grad kernel: " << kernel;
  // A CPU fallback kernel must run on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("sin_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Owned by unique_ptr so the profiler event is closed even if infer-meta
  // throws; reset() below ends it at the same point as before.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "sin_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void (*)(const platform::DeviceContext&,
                                    const phi::DenseTensor&,
                                    const phi::DenseTensor&,
                                    phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "sin_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran as a CPU fallback; hand the output to TransDataBackend
    // with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

/// @brief Dispatches the "sinh_grad" kernel for the given inputs.
///
/// The kernel key (backend, layout, dtype) is the highest-priority key parsed
/// from `x` and `out_grad`. Output meta is inferred with
/// phi::UnchangedInferMeta from the prepared `x` tensor.
///
/// @param x         Tensor passed to the kernel as input 0.
/// @param out_grad  Tensor passed to the kernel as input 1.
/// @param x_grad    Output tensor; its kernel output handle comes from
///                  SetKernelOutput and is passed to the kernel.
PADDLE_API void sinh_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {
  // Nothing is pinned for this API, so backend, layout and dtype all come
  // from the highest-priority kernel key derived from the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "sinh_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "
          << kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "sinh_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "sinh_grad kernel: " << kernel;
  // A CPU fallback kernel must run on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("sinh_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Owned by unique_ptr so the profiler event is closed even if infer-meta
  // throws; reset() below ends it at the same point as before.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "sinh_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void (*)(const platform::DeviceContext&,
                                    const phi::DenseTensor&,
                                    const phi::DenseTensor&,
                                    phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "sinh_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran as a CPU fallback; hand the output to TransDataBackend
    // with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

/// @brief Dispatches the "slice_grad" kernel for the given inputs.
///
/// The kernel key (backend, layout, dtype) is the highest-priority key parsed
/// from `input` and `out_grad`. Output meta is inferred with
/// phi::UnchangedInferMeta from the prepared `input` tensor.
///
/// @param input          Tensor passed to the kernel as input 0.
/// @param out_grad       Tensor passed to the kernel as input 1.
/// @param axes           Forwarded unchanged to the kernel.
/// @param starts         Wrapped in a phi::IntArray and passed to the kernel.
/// @param ends           Wrapped in a phi::IntArray and passed to the kernel.
/// @param infer_flags    Forwarded unchanged to the kernel.
/// @param decrease_axis  Forwarded unchanged to the kernel.
/// @param input_grad     Output tensor; its kernel output handle comes from
///                       SetKernelOutput and is passed to the kernel.
PADDLE_API void slice_grad(const Tensor& input, const Tensor& out_grad, const std::vector<int64_t>& axes, const IntArray& starts, const IntArray& ends, const std::vector<int64_t>& infer_flags, const std::vector<int64_t>& decrease_axis, Tensor* input_grad) {
  // Nothing is pinned for this API, so backend, layout and dtype all come
  // from the highest-priority kernel key derived from the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(input, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "slice_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "
          << kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "slice_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "slice_grad kernel: " << kernel;
  // A CPU fallback kernel must run on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_input = PrepareData(input, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"input", {input_input->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("slice_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(input_grad);

  // Owned by unique_ptr so the profiler event is closed even if infer-meta
  // throws; reset() below ends it at the same point as before.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "slice_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_input), &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void (*)(const platform::DeviceContext&,
                                    const phi::DenseTensor&,
                                    const phi::DenseTensor&,
                                    const std::vector<int64_t>&,
                                    const phi::IntArray&,
                                    const phi::IntArray&,
                                    const std::vector<int64_t>&,
                                    const std::vector<int64_t>&,
                                    phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "slice_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_input, *input_out_grad, axes, phi::IntArray(starts), phi::IntArray(ends), infer_flags, decrease_axis, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran as a CPU fallback; hand the output to TransDataBackend
    // with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

/// @brief Dispatches the "slogdeterminant_grad" kernel for the given inputs.
///
/// Note that the kernel is looked up under the name "slogdeterminant_grad",
/// while the profiler records use the API name "slogdet_grad".
/// The kernel key (backend, layout, dtype) is the highest-priority key parsed
/// from `x`, `out` and `out_grad`. Output meta is inferred with
/// phi::UnchangedInferMeta from the prepared `x` tensor.
///
/// @param x         Tensor passed to the kernel as input 0.
/// @param out       Tensor passed to the kernel as input 1.
/// @param out_grad  Tensor passed to the kernel as input 2.
/// @param x_grad    Output tensor; its kernel output handle comes from
///                  SetKernelOutput and is passed to the kernel.
PADDLE_API void slogdet_grad(const Tensor& x, const Tensor& out, const Tensor& out_grad, Tensor* x_grad) {
  // Nothing is pinned for this API, so backend, layout and dtype all come
  // from the highest-priority kernel key derived from the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "slogdet_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "
          << kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "slogdeterminant_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "slogdeterminant_grad kernel: " << kernel;
  // A CPU fallback kernel must run on the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out = PrepareData(out, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"out", {input_out->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("slogdet_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Owned by unique_ptr so the profiler event is closed even if infer-meta
  // throws; reset() below ends it at the same point as before.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "slogdet_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void (*)(const platform::DeviceContext&,
                                    const phi::DenseTensor&,
                                    const phi::DenseTensor&,
                                    const phi::DenseTensor&,
                                    phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Same ownership scheme for the compute event: no leak if the kernel throws.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>(
        "slogdet_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out, *input_out_grad, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran as a CPU fallback; hand the output to TransDataBackend
    // with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API entry point `soft_shrink_grad`.
//
// Selects the phi kernel registered as "soft_shrink_grad" for the
// backend/layout/dtype derived from the inputs, runs shape inference for the
// output, and invokes the kernel so that its result is written to `x_grad`.
//
// Params:
//   x, out_grad - input tensors, forwarded to the kernel in this order.
//   lambda      - scalar attribute forwarded unchanged to the kernel.
//   x_grad      - output tensor; its meta is inferred from `x` through
//                 phi::UnchangedInferMeta before the kernel runs.
//
// Profiler events are held in std::unique_ptr so they are released even when
// shape inference or the kernel throws.
PADDLE_API void soft_shrink_grad(const Tensor& x, const Tensor& out_grad, float lambda, Tensor* x_grad) {

  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  // Nothing was preset above, so every undefined component of the kernel key
  // is taken from the highest-priority key parsed from the input tensors.
  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "soft_shrink_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "soft_shrink_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "soft_shrink_grad kernel: " << kernel;
  // When kernel selection reports a CPU fallback, run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  // Prepare each input against the kernel's input definition at the same index.
  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("soft_shrink_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  // Event object exists only while profiling is enabled.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("soft_shrink_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the event here so its lifetime covers only shape inference.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, float, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("soft_shrink_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, lambda, kernel_out);
  // Destroy the event here so its lifetime covers only the kernel call.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU fallback; hand the output to TransDataBackend
    // with the originally selected backend (presumably moving it back there).
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }

}

// Backward API entry point `softmax_grad`.
//
// Selects the phi kernel registered as "softmax_grad" for the
// backend/layout/dtype derived from the inputs, runs shape inference for the
// output, and invokes the kernel so that its result is written to `x_grad`.
//
// Params:
//   out, out_grad - input tensors, forwarded to the kernel in this order.
//   axis          - integer attribute forwarded unchanged to the kernel.
//   x_grad        - output tensor; its meta is inferred from `out` through
//                   phi::UnchangedInferMeta before the kernel runs.
//
// Profiler events are held in std::unique_ptr so they are released even when
// shape inference or the kernel throws.
PADDLE_API void softmax_grad(const Tensor& out, const Tensor& out_grad, int axis, Tensor* x_grad) {

  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  // Nothing was preset above, so every undefined component of the kernel key
  // is taken from the highest-priority key parsed from the input tensors.
  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(out, out_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "softmax_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  // NOTE(review): this op passes an extra `true` to kernel selection; it is
  // presumably the "allow GPU-DNN kernel" switch — confirm against
  // KernelFactory::SelectKernelOrThrowError.
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "softmax_grad", {kernel_backend, kernel_layout, kernel_data_type}, true);
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "softmax_grad kernel: " << kernel;
  // When kernel selection reports a CPU fallback, run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  // Prepare each input against the kernel's input definition at the same index.
  auto input_out = PrepareData(out, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out", {(*input_out).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("softmax_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  // Event object exists only while profiling is enabled.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("softmax_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_out), &meta_out);

  // Destroy the event here so its lifetime covers only shape inference.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, int, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("softmax_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_out, *input_out_grad, axis, kernel_out);
  // Destroy the event here so its lifetime covers only the kernel call.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU fallback; hand the output to TransDataBackend
    // with the originally selected backend (presumably moving it back there).
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }

}

// Backward API entry point `softplus_grad`.
//
// Selects the phi kernel registered as "softplus_grad" for the
// backend/layout/dtype derived from the inputs, runs shape inference for the
// output, and invokes the kernel so that its result is written to `x_grad`.
//
// Params:
//   x, out_grad     - input tensors, forwarded to the kernel in this order.
//   beta, threshold - scalar attributes forwarded unchanged to the kernel.
//   x_grad          - output tensor; its meta is inferred from `x` through
//                     phi::UnchangedInferMeta before the kernel runs.
//
// Profiler events are held in std::unique_ptr so they are released even when
// shape inference or the kernel throws.
PADDLE_API void softplus_grad(const Tensor& x, const Tensor& out_grad, float beta, float threshold, Tensor* x_grad) {

  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  // Nothing was preset above, so every undefined component of the kernel key
  // is taken from the highest-priority key parsed from the input tensors.
  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "softplus_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "softplus_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "softplus_grad kernel: " << kernel;
  // When kernel selection reports a CPU fallback, run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  // Prepare each input against the kernel's input definition at the same index.
  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("softplus_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  // Event object exists only while profiling is enabled.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("softplus_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the event here so its lifetime covers only shape inference.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, float, float, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("softplus_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, beta, threshold, kernel_out);
  // Destroy the event here so its lifetime covers only the kernel call.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU fallback; hand the output to TransDataBackend
    // with the originally selected backend (presumably moving it back there).
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }

}

// Backward API entry point `softsign_grad`.
//
// Selects the phi kernel registered as "softsign_grad" for the
// backend/layout/dtype derived from the inputs, runs shape inference for the
// output, and invokes the kernel so that its result is written to `x_grad`.
//
// Params:
//   x, out_grad - input tensors, forwarded to the kernel in this order.
//   x_grad      - output tensor; its meta is inferred from `x` through
//                 phi::UnchangedInferMeta before the kernel runs.
//
// Profiler events are held in std::unique_ptr so they are released even when
// shape inference or the kernel throws.
PADDLE_API void softsign_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {

  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  // Nothing was preset above, so every undefined component of the kernel key
  // is taken from the highest-priority key parsed from the input tensors.
  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "softsign_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "softsign_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "softsign_grad kernel: " << kernel;
  // When kernel selection reports a CPU fallback, run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  // Prepare each input against the kernel's input definition at the same index.
  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("softsign_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  // Event object exists only while profiling is enabled.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("softsign_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the event here so its lifetime covers only shape inference.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("softsign_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, kernel_out);
  // Destroy the event here so its lifetime covers only the kernel call.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU fallback; hand the output to TransDataBackend
    // with the originally selected backend (presumably moving it back there).
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }

}

// Backward API entry point `spectral_norm_grad`.
//
// Selects the phi kernel registered as "spectral_norm_grad", runs
// phi::SpectralNormGradInferMeta for the output, and invokes the kernel so
// that its result is written to `weight_grad`.
//
// Params:
//   weight, u, v, out_grad - input tensors, forwarded to the kernel in this
//                            order.
//   dim, power_iters, eps  - attributes forwarded unchanged to both the
//                            infer-meta function and the kernel.
//   weight_grad            - output tensor.
//
// Unlike the backend and layout, the kernel dtype is taken from `out_grad`
// up front; the key parsed from all inputs only supplies it if that yields
// DataType::UNDEFINED.
//
// Profiler events are held in std::unique_ptr so they are released even when
// shape inference or the kernel throws.
PADDLE_API void spectral_norm_grad(const Tensor& weight, const Tensor& u, const Tensor& v, const Tensor& out_grad, int dim, int power_iters, float eps, Tensor* weight_grad) {

  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  kernel_data_type = ParseDataType(out_grad);

  // Backend and layout are still undefined here, so they (and the dtype, if
  // it is still undefined) come from the highest-priority key of the inputs.
  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(weight, u, v, out_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "spectral_norm_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "spectral_norm_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "spectral_norm_grad kernel: " << kernel;
  // When kernel selection reports a CPU fallback, run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  // Prepare each input against the kernel's input definition at the same index.
  auto input_weight = PrepareData(weight, kernel.InputAt(0), {});
  auto input_u = PrepareData(u, kernel.InputAt(1), {});
  auto input_v = PrepareData(v, kernel.InputAt(2), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(3), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"weight", {(*input_weight).dims()}},
        {"u", {(*input_u).dims()}},
        {"v", {(*input_v).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("spectral_norm_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(weight_grad);
  // Event object exists only while profiling is enabled.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("spectral_norm_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::SpectralNormGradInferMeta(MakeMetaTensor(*input_weight), MakeMetaTensor(*input_u), MakeMetaTensor(*input_v), MakeMetaTensor(*input_out_grad), dim, power_iters, eps, &meta_out);

  // Destroy the event here so its lifetime covers only shape inference.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, int, int, float, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("spectral_norm_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_weight, *input_u, *input_v, *input_out_grad, dim, power_iters, eps, kernel_out);
  // Destroy the event here so its lifetime covers only the kernel call.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU fallback; hand the output to TransDataBackend
    // with the originally selected backend (presumably moving it back there).
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }

}

// Backward API entry point `sqrt_double_grad`.
//
// Selects the phi kernel registered as "sqrt_double_grad" for the
// backend/layout/dtype derived from the inputs, runs shape inference for the
// two outputs, and invokes the kernel so that its results are written to
// `out_grad` and `grad_out_grad`.
//
// Params:
//   out, grad_x, grad_x_grad - input tensors, forwarded to the kernel in
//                              this order.
//   out_grad, grad_out_grad  - output tensors. Shape inference receives the
//                              meta of `out` for both of its inputs, and a
//                              null meta for any output whose kernel output
//                              pointer is null.
//
// Profiler events are held in std::unique_ptr so they are released even when
// shape inference or the kernel throws.
PADDLE_API void sqrt_double_grad(const Tensor& out, const Tensor& grad_x, const Tensor& grad_x_grad, Tensor* out_grad, Tensor* grad_out_grad) {

  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  // Nothing was preset above, so every undefined component of the kernel key
  // is taken from the highest-priority key parsed from the input tensors.
  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(out, grad_x, grad_x_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "sqrt_double_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "sqrt_double_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "sqrt_double_grad kernel: " << kernel;
  // When kernel selection reports a CPU fallback, run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  // Prepare each input against the kernel's input definition at the same index.
  auto input_out = PrepareData(out, kernel.InputAt(0), {});
  auto input_grad_x = PrepareData(grad_x, kernel.InputAt(1), {});
  auto input_grad_x_grad = PrepareData(grad_x_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out", {(*input_out).dims()}},
        {"grad_x", {(*input_grad_x).dims()}},
        {"grad_x_grad", {(*input_grad_x_grad).dims()}}};
    platform::RecordOpInfoSupplement("sqrt_double_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(out_grad);
  auto kernel_out_1 = SetKernelOutput(grad_out_grad);
  // Event object exists only while profiling is enabled.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("sqrt_double_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  // Outputs without a kernel output pointer are passed to inference as nullptr.
  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_out), MakeMetaTensor(*input_out), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);

  // Destroy the event here so its lifetime covers only shape inference.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("sqrt_double_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_out, *input_grad_x, *input_grad_x_grad, kernel_out_0, kernel_out_1);
  // Destroy the event here so its lifetime covers only the kernel call.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU fallback; hand each output to TransDataBackend
    // with the originally selected backend (presumably moving it back there).
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }

}

// Backward API entry point `sqrt_grad`.
//
// Selects the phi kernel registered as "sqrt_grad" for the
// backend/layout/dtype derived from the inputs, runs shape inference for the
// output, and invokes the kernel so that its result is written to `x_grad`.
//
// Params:
//   out, out_grad - input tensors, forwarded to the kernel in this order.
//   x_grad        - output tensor; its meta is inferred from `out` through
//                   phi::UnchangedInferMeta before the kernel runs.
//
// Profiler events are held in std::unique_ptr so they are released even when
// shape inference or the kernel throws.
PADDLE_API void sqrt_grad(const Tensor& out, const Tensor& out_grad, Tensor* x_grad) {

  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  // Nothing was preset above, so every undefined component of the kernel key
  // is taken from the highest-priority key parsed from the input tensors.
  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(out, out_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "sqrt_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "sqrt_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "sqrt_grad kernel: " << kernel;
  // When kernel selection reports a CPU fallback, run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  // Prepare each input against the kernel's input definition at the same index.
  auto input_out = PrepareData(out, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out", {(*input_out).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("sqrt_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  // Event object exists only while profiling is enabled.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("sqrt_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_out), &meta_out);

  // Destroy the event here so its lifetime covers only shape inference.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("sqrt_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_out, *input_out_grad, kernel_out);
  // Destroy the event here so its lifetime covers only the kernel call.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU fallback; hand the output to TransDataBackend
    // with the originally selected backend (presumably moving it back there).
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }

}

// Backward API entry point `square_double_grad`.
//
// Selects the phi kernel registered as "square_double_grad" for the
// backend/layout/dtype derived from the inputs, runs shape inference for the
// two outputs, and invokes the kernel so that its results are written to
// `x_grad` and `grad_out_grad`.
//
// Params:
//   x, grad_out, grad_x_grad - input tensors, forwarded to the kernel in
//                              this order.
//   x_grad, grad_out_grad    - output tensors. Shape inference receives the
//                              meta of `x` for both of its inputs, and a
//                              null meta for any output whose kernel output
//                              pointer is null.
//
// Profiler events are held in std::unique_ptr so they are released even when
// shape inference or the kernel throws.
PADDLE_API void square_double_grad(const Tensor& x, const Tensor& grad_out, const Tensor& grad_x_grad, Tensor* x_grad, Tensor* grad_out_grad) {

  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  // Nothing was preset above, so every undefined component of the kernel key
  // is taken from the highest-priority key parsed from the input tensors.
  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(x, grad_out, grad_x_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "square_double_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "square_double_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "square_double_grad kernel: " << kernel;
  // When kernel selection reports a CPU fallback, run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  // Prepare each input against the kernel's input definition at the same index.
  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_grad_out = PrepareData(grad_out, kernel.InputAt(1), {});
  auto input_grad_x_grad = PrepareData(grad_x_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"grad_out", {(*input_grad_out).dims()}},
        {"grad_x_grad", {(*input_grad_x_grad).dims()}}};
    platform::RecordOpInfoSupplement("square_double_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(grad_out_grad);
  // Event object exists only while profiling is enabled.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("square_double_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  // Outputs without a kernel output pointer are passed to inference as nullptr.
  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_x), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);

  // Destroy the event here so its lifetime covers only shape inference.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("square_double_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_grad_out, *input_grad_x_grad, kernel_out_0, kernel_out_1);
  // Destroy the event here so its lifetime covers only the kernel call.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU fallback; hand each output to TransDataBackend
    // with the originally selected backend (presumably moving it back there).
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }

}

// Backward API entry point `square_grad`.
//
// Selects the phi kernel registered as "square_grad" for the
// backend/layout/dtype derived from the inputs, runs shape inference for the
// output, and invokes the kernel so that its result is written to `x_grad`.
//
// Params:
//   x, out_grad - input tensors, forwarded to the kernel in this order.
//   x_grad      - output tensor; its meta is inferred from `x` through
//                 phi::UnchangedInferMeta before the kernel runs.
//
// Profiler events are held in std::unique_ptr so they are released even when
// shape inference or the kernel throws.
PADDLE_API void square_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {

  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  // Nothing was preset above, so every undefined component of the kernel key
  // is taken from the highest-priority key parsed from the input tensors.
  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "square_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "square_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "square_grad kernel: " << kernel;
  // When kernel selection reports a CPU fallback, run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  // Prepare each input against the kernel's input definition at the same index.
  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("square_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  // Event object exists only while profiling is enabled.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("square_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the event here so its lifetime covers only shape inference.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("square_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, kernel_out);
  // Destroy the event here so its lifetime covers only the kernel call.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU fallback; hand the output to TransDataBackend
    // with the originally selected backend (presumably moving it back there).
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }

}

// Backward API entry point `squared_l2_norm_grad`.
//
// Selects the phi kernel registered as "squared_l2_norm_grad" for the
// backend/layout/dtype derived from the inputs, runs shape inference for the
// output, and invokes the kernel so that its result is written to `x_grad`.
//
// Params:
//   x, out_grad - input tensors, forwarded to the kernel in this order.
//   x_grad      - output tensor; its meta is inferred from `x` through
//                 phi::UnchangedInferMeta before the kernel runs.
//
// Profiler events are held in std::unique_ptr so they are released even when
// shape inference or the kernel throws.
PADDLE_API void squared_l2_norm_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {

  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;

  // Nothing was preset above, so every undefined component of the kernel key
  // is taken from the highest-priority key parsed from the input tensors.
  if (kernel_backend == Backend::UNDEFINED
        || kernel_layout == DataLayout::UNDEFINED
        || kernel_data_type == DataType::UNDEFINED ) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }

  VLOG(6) << "squared_l2_norm_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "squared_l2_norm_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "squared_l2_norm_grad kernel: " << kernel;
  // When kernel selection reports a CPU fallback, run on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  // Prepare each input against the kernel's input definition at the same index.
  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("squared_l2_norm_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);
  // Event object exists only while profiling is enabled.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("squared_l2_norm_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Destroy the event here so its lifetime covers only shape inference.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("squared_l2_norm_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, kernel_out);
  // Destroy the event here so its lifetime covers only the kernel call.
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU fallback; hand the output to TransDataBackend
    // with the originally selected backend (presumably moving it back there).
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }

}

// Dispatches the phi kernel registered under the name "squeeze_grad".
//
// In order: builds the kernel key from `xshape` and `out_grad`, selects the
// kernel, prepares both inputs, sets the output meta from `xshape` through
// phi::KernelWithXShapeInferMeta, runs the kernel and, when
// `kernel_result.has_fallback_cpu` is set, hands the output to
// TransDataBackend together with the backend derived from the inputs.
//
// xshape   : kernel input 0.
// out_grad : kernel input 1.
// axes     : forwarded to the kernel as a phi::IntArray.
// x_grad   : output tensor; wrapped by SetKernelOutput and passed to the kernel.
PADDLE_API void squeeze_grad(const Tensor& xshape, const Tensor& out_grad, const IntArray& axes, Tensor* x_grad) {
  // Nothing pins backend/layout/dtype for this API, so the whole key is taken
  // from the highest-priority key parsed out of the input tensors.
  auto kernel_key_set = ParseKernelKeyByInputArgs(xshape, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "squeeze_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "squeeze_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "squeeze_grad kernel: " << kernel;
  // A fallback kernel runs on the CPU context, not on the requested backend.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_xshape = PrepareData(xshape, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"xshape", {(*input_xshape).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("squeeze_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Profiler events are owned by unique_ptr so they are destroyed even when
  // infer-meta or the kernel throws; reset() destroys each event right after
  // the stage it brackets.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("squeeze_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);
  phi::KernelWithXShapeInferMeta(MakeMetaTensor(*input_xshape), &meta_out);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::IntArray&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("squeeze_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_xshape, *input_out_grad, phi::IntArray(axes), kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the phi kernel registered under the name "stack_grad".
//
// In order: builds the kernel key from `x` and `out_grad`, selects the kernel,
// prepares `out_grad`, runs phi::StackGradInferMeta over the output metas,
// runs the kernel and, when `kernel_result.has_fallback_cpu` is set, hands the
// outputs to TransDataBackend together with the backend derived from the
// inputs.
//
// x        : only consulted for kernel-key selection; not passed to the kernel.
// out_grad : kernel input 0.
// axis     : forwarded to infer-meta and to the kernel.
// x_grad   : output tensors; wrapped by SetKernelOutput. Entries whose kernel
//            output pointer is null get a null MetaTensor pointer.
PADDLE_API void stack_grad(const std::vector<Tensor>& x, const Tensor& out_grad, int axis, std::vector<Tensor*> x_grad) {
  // Nothing pins backend/layout/dtype for this API, so the whole key is taken
  // from the highest-priority key parsed out of the input tensors.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "stack_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "stack_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "stack_grad kernel: " << kernel;
  // A fallback kernel runs on the CPU context, not on the requested backend.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(0), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("stack_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(&x_grad);

  // Profiler events are owned by unique_ptr so they are destroyed even when
  // infer-meta or the kernel throws; reset() destroys each event right after
  // the stage it brackets.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("stack_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }

  // Build one MetaTensor per output, but expose a null pointer for every
  // output slot that has no kernel output tensor.
  auto kernel_out_meta_vec = MakeMetaTensor(kernel_out);
  std::vector<phi::MetaTensor*> kernel_out_metas(kernel_out_meta_vec.size());
  for (size_t i = 0; i < kernel_out_meta_vec.size(); ++i) {
    kernel_out_metas[i] = kernel_out[i] ? &kernel_out_meta_vec[i] : nullptr;
  }
  phi::StackGradInferMeta(MakeMetaTensor(*input_out_grad), axis, kernel_out_metas);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, int, std::vector<phi::DenseTensor*>&);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("stack_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_out_grad, axis, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the phi kernel registered under the name "strided_slice_grad".
//
// In order: builds the kernel key from `x` and `out_grad`, selects the kernel,
// prepares both inputs, sets the output meta from `x` through
// phi::GeneralUnaryGradInferMeta, runs the kernel and, when
// `kernel_result.has_fallback_cpu` is set, hands the output to
// TransDataBackend together with the backend derived from the inputs.
//
// x        : kernel input 0.
// out_grad : kernel input 1.
// axes     : forwarded to the kernel unchanged.
// starts, ends, strides : each forwarded to the kernel as a phi::IntArray.
// x_grad   : output tensor; wrapped by SetKernelOutput and passed to the kernel.
PADDLE_API void strided_slice_grad(const Tensor& x, const Tensor& out_grad, const std::vector<int>& axes, const IntArray& starts, const IntArray& ends, const IntArray& strides, Tensor* x_grad) {
  // Nothing pins backend/layout/dtype for this API, so the whole key is taken
  // from the highest-priority key parsed out of the input tensors.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "strided_slice_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "strided_slice_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "strided_slice_grad kernel: " << kernel;
  // A fallback kernel runs on the CPU context, not on the requested backend.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("strided_slice_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Profiler events are owned by unique_ptr so they are destroyed even when
  // infer-meta or the kernel throws; reset() destroys each event right after
  // the stage it brackets.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("strided_slice_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);
  phi::GeneralUnaryGradInferMeta(MakeMetaTensor(*input_x), &meta_out);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const std::vector<int>&, const phi::IntArray&, const phi::IntArray&, const phi::IntArray&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("strided_slice_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, axes, phi::IntArray(starts), phi::IntArray(ends), phi::IntArray(strides), kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the phi kernel registered under the name "subtract_double_grad".
//
// In order: builds the kernel key from all four tensor arguments, selects the
// kernel, prepares the inputs, sets the output meta from `grad_out` through
// phi::UnchangedInferMeta, runs the kernel and, when
// `kernel_result.has_fallback_cpu` is set, hands the output to
// TransDataBackend together with the backend derived from the inputs.
//
// y             : kernel input 0.
// grad_out      : kernel input 1.
// grad_x_grad   : optional kernel input 2.
// grad_y_grad   : optional kernel input 3.
// axis          : forwarded to the kernel unchanged.
// grad_out_grad : output tensor; wrapped by SetKernelOutput.
PADDLE_API void subtract_double_grad(const Tensor& y, const Tensor& grad_out, const paddle::optional<Tensor>& grad_x_grad, const paddle::optional<Tensor>& grad_y_grad, int axis, Tensor* grad_out_grad) {
  // Nothing pins backend/layout/dtype for this API, so the whole key is taken
  // from the highest-priority key parsed out of the input tensors.
  auto kernel_key_set = ParseKernelKeyByInputArgs(y, grad_out, grad_x_grad, grad_y_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "subtract_double_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "subtract_double_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "subtract_double_grad kernel: " << kernel;
  // A fallback kernel runs on the CPU context, not on the requested backend.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_y = PrepareData(y, kernel.InputAt(0), {});
  auto input_grad_out = PrepareData(grad_out, kernel.InputAt(1), {});
  auto input_grad_x_grad = PrepareData(grad_x_grad, kernel.InputAt(2), {});
  auto input_grad_y_grad = PrepareData(grad_y_grad, kernel.InputAt(3), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Optional inputs contribute a shape only when they are present.
    std::vector<phi::DDim> grad_x_grad_record_shapes;
    if (input_grad_x_grad) {
      grad_x_grad_record_shapes.push_back((*input_grad_x_grad).dims());
    }
    std::vector<phi::DDim> grad_y_grad_record_shapes;
    if (input_grad_y_grad) {
      grad_y_grad_record_shapes.push_back((*input_grad_y_grad).dims());
    }
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"y", {(*input_y).dims()}},
        {"grad_out", {(*input_grad_out).dims()}},
        {"grad_x_grad", grad_x_grad_record_shapes},
        {"grad_y_grad", grad_y_grad_record_shapes}};
    platform::RecordOpInfoSupplement("subtract_double_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(grad_out_grad);

  // Profiler events are owned by unique_ptr so they are destroyed even when
  // infer-meta or the kernel throws; reset() destroys each event right after
  // the stage it brackets.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("subtract_double_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);
  phi::UnchangedInferMeta(MakeMetaTensor(*input_grad_out), &meta_out);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const paddle::optional<phi::DenseTensor>&, const paddle::optional<phi::DenseTensor>&, int, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("subtract_double_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_y, *input_grad_out, input_grad_x_grad, input_grad_y_grad, axis, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the phi kernel registered under the name "subtract_grad".
//
// In order: builds the kernel key from `x`, `y` and `out_grad`, selects the
// kernel, prepares the three inputs, sets the output metas from `x` and `y`
// through phi::GeneralBinaryGradInferMeta, runs the kernel and, when
// `kernel_result.has_fallback_cpu` is set, hands both outputs to
// TransDataBackend together with the backend derived from the inputs.
//
// x        : kernel input 0.
// y        : kernel input 1.
// out_grad : kernel input 2.
// axis     : forwarded to the kernel unchanged.
// x_grad, y_grad : output tensors; each wrapped by SetKernelOutput. A null
//                  kernel output is passed to infer-meta as a null MetaTensor.
PADDLE_API void subtract_grad(const Tensor& x, const Tensor& y, const Tensor& out_grad, int axis, Tensor* x_grad, Tensor* y_grad) {
  // Nothing pins backend/layout/dtype for this API, so the whole key is taken
  // from the highest-priority key parsed out of the input tensors.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, y, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "subtract_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "subtract_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "subtract_grad kernel: " << kernel;
  // A fallback kernel runs on the CPU context, not on the requested backend.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_y = PrepareData(y, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"y", {(*input_y).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("subtract_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(y_grad);

  // Profiler events are owned by unique_ptr so they are destroyed even when
  // infer-meta or the kernel throws; reset() destroys each event right after
  // the stage it brackets.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("subtract_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);
  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_y), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, int, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("subtract_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_y, *input_out_grad, axis, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// Dispatches the phi kernel registered under the name "sum_grad".
//
// In order: builds the kernel key from `x` and `out_grad`, selects the kernel,
// prepares both inputs, sets the output meta from `x` through
// phi::UnchangedInferMeta, runs the kernel and, when
// `kernel_result.has_fallback_cpu` is set, hands the output to
// TransDataBackend together with the backend derived from the inputs.
//
// x          : kernel input 0.
// out_grad   : kernel input 1.
// dims       : forwarded to the kernel as a phi::IntArray.
// keep_dim, reduce_all : forwarded to the kernel unchanged.
// x_grad     : output tensor; wrapped by SetKernelOutput.
PADDLE_API void sum_grad(const Tensor& x, const Tensor& out_grad, const IntArray& dims, bool keep_dim, bool reduce_all, Tensor* x_grad) {
  // Nothing pins backend/layout/dtype for this API, so the whole key is taken
  // from the highest-priority key parsed out of the input tensors.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "sum_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "sum_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "sum_grad kernel: " << kernel;
  // A fallback kernel runs on the CPU context, not on the requested backend.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("sum_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Profiler events are owned by unique_ptr so they are destroyed even when
  // infer-meta or the kernel throws; reset() destroys each event right after
  // the stage it brackets.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("sum_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);
  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::IntArray&, bool, bool, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("sum_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, phi::IntArray(dims), keep_dim, reduce_all, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the phi kernel registered under the name "svd_grad".
//
// In order: builds the kernel key from all seven tensor arguments, selects the
// kernel, prepares the inputs, sets the output meta from `x` through
// phi::UnchangedInferMeta, runs the kernel and, when
// `kernel_result.has_fallback_cpu` is set, hands the output to
// TransDataBackend together with the backend derived from the inputs.
//
// x, u, vh, s              : kernel inputs 0-3.
// u_grad, vh_grad, s_grad  : optional kernel inputs 4-6.
// full                     : forwarded to the kernel unchanged.
// x_grad                   : output tensor; wrapped by SetKernelOutput.
PADDLE_API void svd_grad(const Tensor& x, const Tensor& u, const Tensor& vh, const Tensor& s, const paddle::optional<Tensor>& u_grad, const paddle::optional<Tensor>& vh_grad, const paddle::optional<Tensor>& s_grad, bool full, Tensor* x_grad) {
  // Nothing pins backend/layout/dtype for this API, so the whole key is taken
  // from the highest-priority key parsed out of the input tensors.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, u, vh, s, u_grad, vh_grad, s_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "svd_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "svd_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "svd_grad kernel: " << kernel;
  // A fallback kernel runs on the CPU context, not on the requested backend.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_u = PrepareData(u, kernel.InputAt(1), {});
  auto input_vh = PrepareData(vh, kernel.InputAt(2), {});
  auto input_s = PrepareData(s, kernel.InputAt(3), {});
  auto input_u_grad = PrepareData(u_grad, kernel.InputAt(4), {});
  auto input_vh_grad = PrepareData(vh_grad, kernel.InputAt(5), {});
  auto input_s_grad = PrepareData(s_grad, kernel.InputAt(6), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Optional inputs contribute a shape only when they are present.
    std::vector<phi::DDim> u_grad_record_shapes;
    if (input_u_grad) {
      u_grad_record_shapes.push_back((*input_u_grad).dims());
    }
    std::vector<phi::DDim> vh_grad_record_shapes;
    if (input_vh_grad) {
      vh_grad_record_shapes.push_back((*input_vh_grad).dims());
    }
    std::vector<phi::DDim> s_grad_record_shapes;
    if (input_s_grad) {
      s_grad_record_shapes.push_back((*input_s_grad).dims());
    }
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"u", {(*input_u).dims()}},
        {"vh", {(*input_vh).dims()}},
        {"s", {(*input_s).dims()}},
        {"u_grad", u_grad_record_shapes},
        {"vh_grad", vh_grad_record_shapes},
        {"s_grad", s_grad_record_shapes}};
    platform::RecordOpInfoSupplement("svd_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Profiler events are owned by unique_ptr so they are destroyed even when
  // infer-meta or the kernel throws; reset() destroys each event right after
  // the stage it brackets.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("svd_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);
  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const paddle::optional<phi::DenseTensor>&, const paddle::optional<phi::DenseTensor>&, const paddle::optional<phi::DenseTensor>&, bool, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("svd_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_u, *input_vh, *input_s, input_u_grad, input_vh_grad, input_s_grad, full, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the phi kernel registered under the name "swish_grad".
//
// In order: builds the kernel key from `x` and `out_grad`, selects the kernel,
// prepares both inputs, sets the output meta from `x` through
// phi::GeneralUnaryGradInferMeta, runs the kernel and, when
// `kernel_result.has_fallback_cpu` is set, hands the output to
// TransDataBackend together with the backend derived from the inputs.
//
// x        : kernel input 0.
// out_grad : kernel input 1.
// bete     : float attribute forwarded to the kernel unchanged.
//            NOTE(review): the name looks like a misspelling of "beta"; it is
//            kept because it is part of the declared signature.
// x_grad   : output tensor; wrapped by SetKernelOutput.
PADDLE_API void swish_grad(const Tensor& x, const Tensor& out_grad, float bete, Tensor* x_grad) {
  // Nothing pins backend/layout/dtype for this API, so the whole key is taken
  // from the highest-priority key parsed out of the input tensors.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "swish_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "swish_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "swish_grad kernel: " << kernel;
  // A fallback kernel runs on the CPU context, not on the requested backend.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("swish_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Profiler events are owned by unique_ptr so they are destroyed even when
  // infer-meta or the kernel throws; reset() destroys each event right after
  // the stage it brackets.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("swish_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);
  phi::GeneralUnaryGradInferMeta(MakeMetaTensor(*input_x), &meta_out);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, float, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("swish_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, bete, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the phi kernel registered under the name "sync_batch_norm_grad".
//
// In order: takes the kernel dtype from `out_grad` (falling back to the
// parsed kernel key only if that yields UNDEFINED) and the backend/layout from
// the highest-priority key of all tensor arguments, selects the kernel,
// prepares the inputs, sets the three output metas from `x`, `scale` and
// `bias` through phi::GeneralTernaryGradInferMeta, runs the kernel and, when
// `kernel_result.has_fallback_cpu` is set, hands each output to
// TransDataBackend together with the backend derived from the inputs.
//
// x, scale, bias, saved_mean, saved_variance : kernel inputs 0-4.
// reserve_space : optional kernel input 5.
// out_grad      : kernel input 6; also the source of the kernel dtype.
// momentum, epsilon, data_layout, is_test, use_global_stats,
// trainable_statistics, fuse_with_relu : forwarded to the kernel unchanged.
// x_grad, scale_grad, bias_grad : output tensors; each wrapped by
//     SetKernelOutput. A null kernel output is passed to infer-meta as a null
//     MetaTensor.
PADDLE_API void sync_batch_norm_grad(const Tensor& x, const Tensor& scale, const Tensor& bias, const Tensor& saved_mean, const Tensor& saved_variance, const paddle::optional<Tensor>& reserve_space, const Tensor& out_grad, float momentum, float epsilon, const std::string& data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu, Tensor* x_grad, Tensor* scale_grad, Tensor* bias_grad) {
  // The dtype is pinned to out_grad's dtype; backend and layout are never
  // pinned, so they always come from the parsed kernel key.
  DataType kernel_data_type = ParseDataType(out_grad);

  auto kernel_key_set = ParseKernelKeyByInputArgs(x, scale, bias, saved_mean, saved_variance, reserve_space, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "sync_batch_norm_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "sync_batch_norm_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "sync_batch_norm_grad kernel: " << kernel;
  // A fallback kernel runs on the CPU context, not on the requested backend.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_scale = PrepareData(scale, kernel.InputAt(1), {});
  auto input_bias = PrepareData(bias, kernel.InputAt(2), {});
  auto input_saved_mean = PrepareData(saved_mean, kernel.InputAt(3), {});
  auto input_saved_variance = PrepareData(saved_variance, kernel.InputAt(4), {});
  auto input_reserve_space = PrepareData(reserve_space, kernel.InputAt(5), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(6), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // The optional input contributes a shape only when it is present.
    std::vector<phi::DDim> reserve_space_record_shapes;
    if (input_reserve_space) {
      reserve_space_record_shapes.push_back((*input_reserve_space).dims());
    }
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"scale", {(*input_scale).dims()}},
        {"bias", {(*input_bias).dims()}},
        {"saved_mean", {(*input_saved_mean).dims()}},
        {"saved_variance", {(*input_saved_variance).dims()}},
        {"reserve_space", reserve_space_record_shapes},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("sync_batch_norm_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(scale_grad);
  auto kernel_out_2 = SetKernelOutput(bias_grad);

  // Profiler events are owned by unique_ptr so they are destroyed even when
  // infer-meta or the kernel throws; reset() destroys each event right after
  // the stage it brackets.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("sync_batch_norm_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);
  phi::MetaTensor meta_out_2(kernel_out_2);
  phi::GeneralTernaryGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_scale), MakeMetaTensor(*input_bias), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr, kernel_out_2 ? &meta_out_2 : nullptr);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const paddle::optional<phi::DenseTensor>&, const phi::DenseTensor&, float, float, const std::string&, bool, bool, bool, bool, phi::DenseTensor*, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("sync_batch_norm_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_scale, *input_bias, *input_saved_mean, *input_saved_variance, input_reserve_space, *input_out_grad, momentum, epsilon, data_layout, is_test, use_global_stats, trainable_statistics, fuse_with_relu, kernel_out_0, kernel_out_1, kernel_out_2);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
    TransDataBackend(kernel_out_2, kernel_backend, kernel_out_2);
  }
}

// Backward API for take_along_axis: runs the registered "take_along_axis_grad"
// phi kernel and writes its result into x_grad.
//
// Steps, in order:
//   1. Derive the kernel key (backend, layout, dtype) from x, index and
//      out_grad and select the kernel via SelectKernelOrThrowError.
//   2. Prepare the inputs for the selected kernel and, when op-info tracing is
//      enabled, record their shapes.
//   3. Run UnchangedInferMeta with x as the source and x_grad's MetaTensor as
//      the destination.
//   4. Invoke the kernel. If the selection fell back to CPU, the kernel runs
//      with the CPU device context and the output is afterwards handed to
//      TransDataBackend together with the backend derived from the inputs.
//
// Parameters:
//   x, index, out_grad - tensor inputs forwarded to the kernel.
//   axis               - attribute forwarded to the kernel unchanged.
//   x_grad             - output tensor.
PADDLE_API void take_along_axis_grad(const Tensor& x, const Tensor& index, const Tensor& out_grad, int axis, Tensor* x_grad) {
  // All three kernel-key components come from the inputs; this API takes no
  // explicit backend/layout/dtype override.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, index, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "take_along_axis_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "take_along_axis_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "take_along_axis_grad kernel: " << kernel;
  // A CPU fallback kernel is given the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_index = PrepareData(index, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"index", {input_index->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("take_along_axis_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Profiler event spanning infer-meta. Owned by unique_ptr so it is destroyed
  // even when infer-meta throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("take_along_axis_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Close the infer-meta event before the compute event is opened.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, int, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Profiler event spanning the kernel call, with the same ownership rule.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("take_along_axis_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_index, *input_out_grad, axis, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran as a CPU fallback; pass the output through
    // TransDataBackend with the backend derived from the inputs.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API for tan: runs the registered "tan_grad" phi kernel and writes
// its result into x_grad.
//
// Steps, in order:
//   1. Derive the kernel key (backend, layout, dtype) from x and out_grad and
//      select the kernel via SelectKernelOrThrowError.
//   2. Prepare the inputs for the selected kernel and, when op-info tracing is
//      enabled, record their shapes.
//   3. Run UnchangedInferMeta with x as the source and x_grad's MetaTensor as
//      the destination.
//   4. Invoke the kernel. If the selection fell back to CPU, the kernel runs
//      with the CPU device context and the output is afterwards handed to
//      TransDataBackend together with the backend derived from the inputs.
//
// Parameters:
//   x, out_grad - tensor inputs forwarded to the kernel.
//   x_grad      - output tensor.
PADDLE_API void tan_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {
  // All three kernel-key components come from the inputs; this API takes no
  // explicit backend/layout/dtype override.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "tan_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "tan_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "tan_grad kernel: " << kernel;
  // A CPU fallback kernel is given the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("tan_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Profiler event spanning infer-meta. Owned by unique_ptr so it is destroyed
  // even when infer-meta throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("tan_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Close the infer-meta event before the compute event is opened.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Profiler event spanning the kernel call, with the same ownership rule.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("tan_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran as a CPU fallback; pass the output through
    // TransDataBackend with the backend derived from the inputs.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Second-order backward API for tanh: runs the registered "tanh_double_grad"
// phi kernel and writes its results into out_grad and grad_out_grad.
//
// Steps, in order:
//   1. Derive the kernel key (backend, layout, dtype) from out, grad_out and
//      grad_x_grad and select the kernel via SelectKernelOrThrowError.
//   2. Prepare the inputs for the selected kernel and, when op-info tracing is
//      enabled, record their shapes.
//   3. Run GeneralBinaryGradInferMeta. `out` is passed for both source
//      arguments, and an output whose kernel tensor is null gets a null
//      MetaTensor pointer.
//   4. Invoke the kernel. If the selection fell back to CPU, the kernel runs
//      with the CPU device context and both outputs are afterwards handed to
//      TransDataBackend together with the backend derived from the inputs.
//
// Parameters:
//   out, grad_out, grad_x_grad - tensor inputs forwarded to the kernel.
//   out_grad, grad_out_grad    - output tensors.
PADDLE_API void tanh_double_grad(const Tensor& out, const Tensor& grad_out, const Tensor& grad_x_grad, Tensor* out_grad, Tensor* grad_out_grad) {
  // All three kernel-key components come from the inputs; this API takes no
  // explicit backend/layout/dtype override.
  auto kernel_key_set = ParseKernelKeyByInputArgs(out, grad_out, grad_x_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "tanh_double_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "tanh_double_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "tanh_double_grad kernel: " << kernel;
  // A CPU fallback kernel is given the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out = PrepareData(out, kernel.InputAt(0), {});
  auto input_grad_out = PrepareData(grad_out, kernel.InputAt(1), {});
  auto input_grad_x_grad = PrepareData(grad_x_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out", {input_out->dims()}},
        {"grad_out", {input_grad_out->dims()}},
        {"grad_x_grad", {input_grad_x_grad->dims()}}};
    platform::RecordOpInfoSupplement("tanh_double_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(out_grad);
  auto kernel_out_1 = SetKernelOutput(grad_out_grad);

  // Profiler event spanning infer-meta. Owned by unique_ptr so it is destroyed
  // even when infer-meta throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("tanh_double_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  // `out` is deliberately supplied for both source arguments.
  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_out), MakeMetaTensor(*input_out), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);

  // Close the infer-meta event before the compute event is opened.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Profiler event spanning the kernel call, with the same ownership rule.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("tanh_double_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_out, *input_grad_out, *input_grad_x_grad, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran as a CPU fallback; pass each output through
    // TransDataBackend with the backend derived from the inputs.
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// Backward API for tanh: runs the registered "tanh_grad" phi kernel and
// writes its result into x_grad.
//
// Steps, in order:
//   1. Derive the kernel key (backend, layout, dtype) from out and out_grad
//      and select the kernel via SelectKernelOrThrowError.
//   2. Prepare the inputs for the selected kernel and, when op-info tracing is
//      enabled, record their shapes.
//   3. Run UnchangedInferMeta with out as the source and x_grad's MetaTensor
//      as the destination.
//   4. Invoke the kernel. If the selection fell back to CPU, the kernel runs
//      with the CPU device context and the output is afterwards handed to
//      TransDataBackend together with the backend derived from the inputs.
//
// Parameters:
//   out, out_grad - tensor inputs forwarded to the kernel.
//   x_grad        - output tensor.
PADDLE_API void tanh_grad(const Tensor& out, const Tensor& out_grad, Tensor* x_grad) {
  // All three kernel-key components come from the inputs; this API takes no
  // explicit backend/layout/dtype override.
  auto kernel_key_set = ParseKernelKeyByInputArgs(out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "tanh_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "tanh_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "tanh_grad kernel: " << kernel;
  // A CPU fallback kernel is given the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out = PrepareData(out, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out", {input_out->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("tanh_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Profiler event spanning infer-meta. Owned by unique_ptr so it is destroyed
  // even when infer-meta throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("tanh_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_out), &meta_out);

  // Close the infer-meta event before the compute event is opened.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Profiler event spanning the kernel call, with the same ownership rule.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("tanh_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_out, *input_out_grad, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran as a CPU fallback; pass the output through
    // TransDataBackend with the backend derived from the inputs.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API for tanh_shrink: runs the registered "tanh_shrink_grad" phi
// kernel and writes its result into x_grad.
//
// Steps, in order:
//   1. Derive the kernel key (backend, layout, dtype) from x and out_grad and
//      select the kernel via SelectKernelOrThrowError.
//   2. Prepare the inputs for the selected kernel and, when op-info tracing is
//      enabled, record their shapes.
//   3. Run UnchangedInferMeta with x as the source and x_grad's MetaTensor as
//      the destination.
//   4. Invoke the kernel. If the selection fell back to CPU, the kernel runs
//      with the CPU device context and the output is afterwards handed to
//      TransDataBackend together with the backend derived from the inputs.
//
// Parameters:
//   x, out_grad - tensor inputs forwarded to the kernel.
//   x_grad      - output tensor.
PADDLE_API void tanh_shrink_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {
  // All three kernel-key components come from the inputs; this API takes no
  // explicit backend/layout/dtype override.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "tanh_shrink_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "tanh_shrink_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "tanh_shrink_grad kernel: " << kernel;
  // A CPU fallback kernel is given the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("tanh_shrink_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Profiler event spanning infer-meta. Owned by unique_ptr so it is destroyed
  // even when infer-meta throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("tanh_shrink_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Close the infer-meta event before the compute event is opened.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Profiler event spanning the kernel call, with the same ownership rule.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("tanh_shrink_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran as a CPU fallback; pass the output through
    // TransDataBackend with the backend derived from the inputs.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Third-order backward API for tanh: runs the registered "tanh_triple_grad"
// phi kernel and writes its results into out_grad, grad_out_forward_grad and
// grad_x_grad_forward_grad.
//
// Steps, in order:
//   1. Derive the kernel key (backend, layout, dtype) from the five tensor
//      inputs and select the kernel via SelectKernelOrThrowError.
//   2. Prepare the inputs for the selected kernel and, when op-info tracing is
//      enabled, record their shapes.
//   3. Run GeneralTernaryGradInferMeta. The source arguments are
//      (out, out, grad_x_grad_forward), and an output whose kernel tensor is
//      null gets a null MetaTensor pointer.
//   4. Invoke the kernel. If the selection fell back to CPU, the kernel runs
//      with the CPU device context and all outputs are afterwards handed to
//      TransDataBackend together with the backend derived from the inputs.
//
// Parameters:
//   out, grad_out_forward, grad_x_grad_forward, grad_out_new_grad,
//   grad_out_grad_grad - tensor inputs forwarded to the kernel.
//   out_grad, grad_out_forward_grad, grad_x_grad_forward_grad - output tensors.
PADDLE_API void tanh_triple_grad(const Tensor& out, const Tensor& grad_out_forward, const Tensor& grad_x_grad_forward, const Tensor& grad_out_new_grad, const Tensor& grad_out_grad_grad, Tensor* out_grad, Tensor* grad_out_forward_grad, Tensor* grad_x_grad_forward_grad) {
  // All three kernel-key components come from the inputs; this API takes no
  // explicit backend/layout/dtype override.
  auto kernel_key_set = ParseKernelKeyByInputArgs(out, grad_out_forward, grad_x_grad_forward, grad_out_new_grad, grad_out_grad_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "tanh_triple_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "tanh_triple_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "tanh_triple_grad kernel: " << kernel;
  // A CPU fallback kernel is given the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out = PrepareData(out, kernel.InputAt(0), {});
  auto input_grad_out_forward = PrepareData(grad_out_forward, kernel.InputAt(1), {});
  auto input_grad_x_grad_forward = PrepareData(grad_x_grad_forward, kernel.InputAt(2), {});
  auto input_grad_out_new_grad = PrepareData(grad_out_new_grad, kernel.InputAt(3), {});
  auto input_grad_out_grad_grad = PrepareData(grad_out_grad_grad, kernel.InputAt(4), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out", {input_out->dims()}},
        {"grad_out_forward", {input_grad_out_forward->dims()}},
        {"grad_x_grad_forward", {input_grad_x_grad_forward->dims()}},
        {"grad_out_new_grad", {input_grad_out_new_grad->dims()}},
        {"grad_out_grad_grad", {input_grad_out_grad_grad->dims()}}};
    platform::RecordOpInfoSupplement("tanh_triple_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(out_grad);
  auto kernel_out_1 = SetKernelOutput(grad_out_forward_grad);
  auto kernel_out_2 = SetKernelOutput(grad_x_grad_forward_grad);

  // Profiler event spanning infer-meta. Owned by unique_ptr so it is destroyed
  // even when infer-meta throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("tanh_triple_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);
  phi::MetaTensor meta_out_2(kernel_out_2);

  // `out` is deliberately supplied for the first two source arguments.
  phi::GeneralTernaryGradInferMeta(MakeMetaTensor(*input_out), MakeMetaTensor(*input_out), MakeMetaTensor(*input_grad_x_grad_forward), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr, kernel_out_2 ? &meta_out_2 : nullptr);

  // Close the infer-meta event before the compute event is opened.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Profiler event spanning the kernel call, with the same ownership rule.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("tanh_triple_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_out, *input_grad_out_forward, *input_grad_x_grad_forward, *input_grad_out_new_grad, *input_grad_out_grad_grad, kernel_out_0, kernel_out_1, kernel_out_2);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran as a CPU fallback; pass each output through
    // TransDataBackend with the backend derived from the inputs.
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
    TransDataBackend(kernel_out_2, kernel_backend, kernel_out_2);
  }
}

// Backward API for temporal_shift: runs the registered "temporal_shift_grad"
// phi kernel and writes its result into x_grad.
//
// Steps, in order:
//   1. Derive the kernel key (backend, layout, dtype) from out_grad and select
//      the kernel via SelectKernelOrThrowError.
//   2. Prepare the input for the selected kernel and, when op-info tracing is
//      enabled, record its shape.
//   3. Run UnchangedInferMeta with out_grad as the source and x_grad's
//      MetaTensor as the destination.
//   4. Invoke the kernel. If the selection fell back to CPU, the kernel runs
//      with the CPU device context and the output is afterwards handed to
//      TransDataBackend together with the backend derived from the input.
//
// Parameters:
//   out_grad                               - tensor input forwarded to the kernel.
//   seg_num, shift_ratio, data_format_str  - attributes forwarded unchanged.
//   x_grad                                 - output tensor.
PADDLE_API void temporal_shift_grad(const Tensor& out_grad, int seg_num, float shift_ratio, const std::string& data_format_str, Tensor* x_grad) {
  // All three kernel-key components come from the input; this API takes no
  // explicit backend/layout/dtype override.
  auto kernel_key_set = ParseKernelKeyByInputArgs(out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "temporal_shift_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "temporal_shift_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "temporal_shift_grad kernel: " << kernel;
  // A CPU fallback kernel is given the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(0), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("temporal_shift_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Profiler event spanning infer-meta. Owned by unique_ptr so it is destroyed
  // even when infer-meta throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("temporal_shift_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_out_grad), &meta_out);

  // Close the infer-meta event before the compute event is opened.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, int, float, const std::string&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Profiler event spanning the kernel call, with the same ownership rule.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("temporal_shift_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_out_grad, seg_num, shift_ratio, data_format_str, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran as a CPU fallback; pass the output through
    // TransDataBackend with the backend derived from the input.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API for thresholded_relu: runs the registered
// "thresholded_relu_grad" phi kernel and writes its result into x_grad.
//
// Steps, in order:
//   1. Derive the kernel key (backend, layout, dtype) from x and out_grad and
//      select the kernel via SelectKernelOrThrowError.
//   2. Prepare the inputs for the selected kernel and, when op-info tracing is
//      enabled, record their shapes.
//   3. Run UnchangedInferMeta with x as the source and x_grad's MetaTensor as
//      the destination.
//   4. Invoke the kernel. If the selection fell back to CPU, the kernel runs
//      with the CPU device context and the output is afterwards handed to
//      TransDataBackend together with the backend derived from the inputs.
//
// Parameters:
//   x, out_grad - tensor inputs forwarded to the kernel.
//   threshold   - attribute forwarded to the kernel unchanged.
//   x_grad      - output tensor.
PADDLE_API void thresholded_relu_grad(const Tensor& x, const Tensor& out_grad, float threshold, Tensor* x_grad) {
  // All three kernel-key components come from the inputs; this API takes no
  // explicit backend/layout/dtype override.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "thresholded_relu_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "thresholded_relu_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "thresholded_relu_grad kernel: " << kernel;
  // A CPU fallback kernel is given the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("thresholded_relu_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Profiler event spanning infer-meta. Owned by unique_ptr so it is destroyed
  // even when infer-meta throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("thresholded_relu_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Close the infer-meta event before the compute event is opened.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, float, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Profiler event spanning the kernel call, with the same ownership rule.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("thresholded_relu_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, threshold, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran as a CPU fallback; pass the output through
    // TransDataBackend with the backend derived from the inputs.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API for tile: runs the registered "tile_grad" phi kernel and
// writes its result into x_grad.
//
// Steps, in order:
//   1. Derive the kernel key (backend, layout, dtype) from x and out_grad and
//      select the kernel via SelectKernelOrThrowError.
//   2. Prepare the inputs for the selected kernel and, when op-info tracing is
//      enabled, record their shapes.
//   3. Run UnchangedInferMeta with x as the source and x_grad's MetaTensor as
//      the destination.
//   4. Invoke the kernel. If the selection fell back to CPU, the kernel runs
//      with the CPU device context and the output is afterwards handed to
//      TransDataBackend together with the backend derived from the inputs.
//
// Parameters:
//   x, out_grad  - tensor inputs forwarded to the kernel.
//   repeat_times - attribute; converted to phi::IntArray for the kernel call.
//   x_grad       - output tensor.
PADDLE_API void tile_grad(const Tensor& x, const Tensor& out_grad, const IntArray& repeat_times, Tensor* x_grad) {
  // All three kernel-key components come from the inputs; this API takes no
  // explicit backend/layout/dtype override.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "tile_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "tile_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "tile_grad kernel: " << kernel;
  // A CPU fallback kernel is given the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("tile_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Profiler event spanning infer-meta. Owned by unique_ptr so it is destroyed
  // even when infer-meta throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("tile_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Close the infer-meta event before the compute event is opened.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::IntArray&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Profiler event spanning the kernel call, with the same ownership rule.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("tile_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, phi::IntArray(repeat_times), kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran as a CPU fallback; pass the output through
    // TransDataBackend with the backend derived from the inputs.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Backward API for top_k: runs the registered "top_k_grad" phi kernel and
// writes its result into x_grad.
//
// Steps, in order:
//   1. Derive the kernel key (backend, layout, dtype) from x, indices and
//      out_grad and select the kernel via SelectKernelOrThrowError.
//   2. Prepare the inputs for the selected kernel and, when op-info tracing is
//      enabled, record their shapes.
//   3. Run UnchangedInferMeta with x as the source and x_grad's MetaTensor as
//      the destination.
//   4. Invoke the kernel. If the selection fell back to CPU, the kernel runs
//      with the CPU device context and the output is afterwards handed to
//      TransDataBackend together with the backend derived from the inputs.
//
// Parameters:
//   x, indices, out_grad  - tensor inputs forwarded to the kernel.
//   k                     - attribute; converted to phi::Scalar for the call.
//   axis, largest, sorted - attributes forwarded to the kernel unchanged.
//   x_grad                - output tensor.
PADDLE_API void top_k_grad(const Tensor& x, const Tensor& indices, const Tensor& out_grad, const Scalar& k, int axis, bool largest, bool sorted, Tensor* x_grad) {
  // All three kernel-key components come from the inputs; this API takes no
  // explicit backend/layout/dtype override.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, indices, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "top_k_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "top_k_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "top_k_grad kernel: " << kernel;
  // A CPU fallback kernel is given the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_indices = PrepareData(indices, kernel.InputAt(1), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {input_x->dims()}},
        {"indices", {input_indices->dims()}},
        {"out_grad", {input_out_grad->dims()}}};
    platform::RecordOpInfoSupplement("top_k_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Profiler event spanning infer-meta. Owned by unique_ptr so it is destroyed
  // even when infer-meta throws.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("top_k_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  // Close the infer-meta event before the compute event is opened.
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::Scalar&, int, bool, bool, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Profiler event spanning the kernel call, with the same ownership rule.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("top_k_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_indices, *input_out_grad, phi::Scalar(k), axis, largest, sorted, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran as a CPU fallback; pass the output through
    // TransDataBackend with the backend derived from the inputs.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "transpose_grad" phi kernel and stores its result in
// `x_grad`.
//
// The kernel key (backend, layout, dtype) is derived from `out_grad`, and the
// kernel is looked up with SelectKernelOrThrowError. Output meta is inferred
// by phi::TransposeGradInferMeta from `out_grad` and `axis`; `axis` is also
// forwarded to the kernel unchanged.
//
// Profiler events are owned by std::unique_ptr so that they are destroyed
// even when InferMeta or the kernel call throws.
PADDLE_API void transpose_grad(const Tensor& out_grad, const std::vector<int>& axis, Tensor* x_grad) {
  // Derive the kernel key from the tensor input.
  auto kernel_key_set = ParseKernelKeyByInputArgs(out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "transpose_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "transpose_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "transpose_grad kernel: " << kernel;
  // When the selected kernel is a CPU fallback, run it on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(0), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the prepared input shape to the profiler.
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("transpose_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Profiled region: infer_meta.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("transpose_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);
  phi::TransposeGradInferMeta(MakeMetaTensor(*input_out_grad), axis, &meta_out);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const std::vector<int>&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Profiled region: compute.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("transpose_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_out_grad, axis, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU fallback context; hand the output to
    // TransDataBackend together with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "triangular_solve_grad" phi kernel and stores its results in
// `x_grad` and `y_grad`.
//
// The kernel key (backend, layout, dtype) is derived from `x`, `y`, `out` and
// `out_grad`, and the kernel is looked up with SelectKernelOrThrowError.
// Output metas are inferred by phi::GeneralBinaryGradInferMeta from `x` and
// `y`; an output whose kernel tensor is null is passed to InferMeta as
// nullptr. `upper`, `tranpose` and `unitriangular` are forwarded to the
// kernel unchanged.
//
// Profiler events are owned by std::unique_ptr so that they are destroyed
// even when InferMeta or the kernel call throws.
PADDLE_API void triangular_solve_grad(const Tensor& x, const Tensor& y, const Tensor& out, const Tensor& out_grad, bool upper, bool tranpose, bool unitriangular, Tensor* x_grad, Tensor* y_grad) {
  // Derive the kernel key from the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, y, out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "triangular_solve_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "triangular_solve_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "triangular_solve_grad kernel: " << kernel;
  // When the selected kernel is a CPU fallback, run it on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_y = PrepareData(y, kernel.InputAt(1), {});
  auto input_out = PrepareData(out, kernel.InputAt(2), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(3), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the prepared input shapes to the profiler.
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"y", {(*input_y).dims()}},
        {"out", {(*input_out).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("triangular_solve_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(y_grad);

  // Profiled region: infer_meta.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("triangular_solve_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);
  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_x),
                                  MakeMetaTensor(*input_y),
                                  kernel_out_0 ? &meta_out_0 : nullptr,
                                  kernel_out_1 ? &meta_out_1 : nullptr);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, bool, bool, bool, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Profiled region: compute.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("triangular_solve_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_y, *input_out, *input_out_grad, upper, tranpose, unitriangular, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU fallback context; hand both outputs to
    // TransDataBackend together with the originally selected backend.
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// Dispatches the "tril_triu_grad" phi kernel and stores its result in
// `x_grad`.
//
// The kernel key (backend, layout, dtype) is derived from `out_grad`, and the
// kernel is looked up with SelectKernelOrThrowError. Output meta is inferred
// by phi::UnchangedInferMeta from `out_grad`. `diagonal` and `lower` are
// forwarded to the kernel unchanged.
//
// Profiler events are owned by std::unique_ptr so that they are destroyed
// even when InferMeta or the kernel call throws.
PADDLE_API void tril_triu_grad(const Tensor& out_grad, int diagonal, bool lower, Tensor* x_grad) {
  // Derive the kernel key from the tensor input.
  auto kernel_key_set = ParseKernelKeyByInputArgs(out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "tril_triu_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "tril_triu_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "tril_triu_grad kernel: " << kernel;
  // When the selected kernel is a CPU fallback, run it on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(0), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the prepared input shape to the profiler.
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("tril_triu_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Profiled region: infer_meta.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("tril_triu_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);
  phi::UnchangedInferMeta(MakeMetaTensor(*input_out_grad), &meta_out);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, int, bool, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Profiled region: compute.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("tril_triu_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_out_grad, diagonal, lower, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU fallback context; hand the output to
    // TransDataBackend together with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "trilinear_interp_grad" phi kernel and stores its result in
// `x_grad`.
//
// The kernel dtype is taken from `output_grad` via ParseDataType and only
// falls back to the parsed kernel key when that yields DataType::UNDEFINED;
// backend and layout always come from the kernel key parsed from all tensor
// inputs. The kernel is looked up with SelectKernelOrThrowError. Output meta
// is inferred by phi::UnchangedInferMeta from `x`.
//
// `out_size`, `size_tensor` and `scale_tensor` are optional; `size_tensor` is
// converted to an optional vector of DenseTensor pointers before being passed
// to the kernel. All remaining attributes are forwarded unchanged.
//
// Profiler events are owned by std::unique_ptr so that they are destroyed
// even when InferMeta or the kernel call throws.
PADDLE_API void trilinear_interp_grad(const Tensor& x, const paddle::optional<Tensor>& out_size, const paddle::optional<std::vector<Tensor>>& size_tensor, const paddle::optional<Tensor>& scale_tensor, const Tensor& output_grad, const std::string& data_layout, int out_d, int out_h, int out_w, const std::vector<float>& scale, const std::string& interp_method, bool align_corners, int align_mode, Tensor* x_grad) {
  // The dtype is preferentially dictated by output_grad.
  DataType kernel_data_type = ParseDataType(output_grad);

  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_size, size_tensor, scale_tensor, output_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "trilinear_interp_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "trilinear_interp_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "trilinear_interp_grad kernel: " << kernel;
  // When the selected kernel is a CPU fallback, run it on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_size = PrepareData(out_size, kernel.InputAt(1), {});
  auto input_size_tensor_vec = PrepareData(size_tensor, kernel.InputAt(2), {});
  // Build the pointer view over the prepared size tensors; the pointed-to
  // elements are owned by input_size_tensor_vec, which lives until return.
  paddle::optional<std::vector<const phi::DenseTensor*>> input_size_tensor;
  if (input_size_tensor_vec) {
    input_size_tensor = paddle::optional<std::vector<const phi::DenseTensor*>>(input_size_tensor_vec->size());
    for (size_t i = 0; i < input_size_tensor_vec->size(); ++i) {
      input_size_tensor->at(i) = &input_size_tensor_vec->at(i);
    }
  }
  auto input_scale_tensor = PrepareData(scale_tensor, kernel.InputAt(3), {});
  auto input_output_grad = PrepareData(output_grad, kernel.InputAt(4), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the prepared input shapes to the profiler. Absent optional
    // inputs are recorded with an empty shape list.
    std::vector<phi::DDim> out_size_record_shapes;
    if (input_out_size) {
      out_size_record_shapes.push_back((*input_out_size).dims());
    }
    std::vector<phi::DDim> scale_tensor_record_shapes;
    if (input_scale_tensor) {
      scale_tensor_record_shapes.push_back((*input_scale_tensor).dims());
    }
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_size", out_size_record_shapes},
        {"scale_tensor", scale_tensor_record_shapes},
        {"output_grad", {(*input_output_grad).dims()}}};
    std::vector<phi::DDim> ddims_vec;
    if (input_size_tensor) {
      ddims_vec.reserve(input_size_tensor->size());
      for (size_t i = 0; i < input_size_tensor->size(); ++i) {
        ddims_vec.emplace_back((*input_size_tensor->at(i)).dims());
      }
    }
    input_shapes.emplace_back("size_tensor", ddims_vec);
    platform::RecordOpInfoSupplement("trilinear_interp_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Profiled region: infer_meta.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("trilinear_interp_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);
  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const paddle::optional<phi::DenseTensor>&, const paddle::optional<std::vector<const phi::DenseTensor*>>&, const paddle::optional<phi::DenseTensor>&, const phi::DenseTensor&, const std::string&, int, int, int, const std::vector<float>&, const std::string&, bool, int, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Profiled region: compute.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("trilinear_interp_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, input_out_size, input_size_tensor, input_scale_tensor, *input_output_grad, data_layout, out_d, out_h, out_w, scale, interp_method, align_corners, align_mode, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU fallback context; hand the output to
    // TransDataBackend together with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "unfold_grad" phi kernel and stores its result in `x_grad`.
//
// The kernel key (backend, layout, dtype) is derived from `x` and `out_grad`,
// and the kernel is looked up with SelectKernelOrThrowError. Output meta is
// inferred by phi::UnchangedInferMeta from `x`. `kernel_sizes`, `strides`,
// `paddings` and `dilations` are forwarded to the kernel unchanged.
//
// Profiler events are owned by std::unique_ptr so that they are destroyed
// even when InferMeta or the kernel call throws.
PADDLE_API void unfold_grad(const Tensor& x, const Tensor& out_grad, const std::vector<int>& kernel_sizes, const std::vector<int>& strides, const std::vector<int>& paddings, const std::vector<int>& dilations, Tensor* x_grad) {
  // Derive the kernel key from the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "unfold_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "unfold_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "unfold_grad kernel: " << kernel;
  // When the selected kernel is a CPU fallback, run it on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the prepared input shapes to the profiler.
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("unfold_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Profiled region: infer_meta.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("unfold_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);
  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const std::vector<int>&, const std::vector<int>&, const std::vector<int>&, const std::vector<int>&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Profiled region: compute.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("unfold_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, kernel_sizes, strides, paddings, dilations, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU fallback context; hand the output to
    // TransDataBackend together with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "uniform_random_inplace_grad" phi kernel and stores its
// result in `x_grad`.
//
// The kernel key (backend, layout, dtype) is derived from `out_grad`, and the
// kernel is looked up with SelectKernelOrThrowError. Output meta is inferred
// by phi::UniformRandomInplaceGradInferMeta, which receives `out_grad` and
// all scalar attributes. The same attributes (`min`, `max`, `seed`,
// `diag_num`, `diag_step`, `diag_val`) are forwarded to the kernel unchanged.
//
// Profiler events are owned by std::unique_ptr so that they are destroyed
// even when InferMeta or the kernel call throws.
PADDLE_API void uniform_random_inplace_grad(const Tensor& out_grad, float min, float max, int seed, int diag_num, int diag_step, float diag_val, Tensor* x_grad) {
  // Derive the kernel key from the tensor input.
  auto kernel_key_set = ParseKernelKeyByInputArgs(out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "uniform_random_inplace_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "uniform_random_inplace_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "uniform_random_inplace_grad kernel: " << kernel;
  // When the selected kernel is a CPU fallback, run it on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(0), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the prepared input shape to the profiler.
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("uniform_random_inplace_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Profiled region: infer_meta.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("uniform_random_inplace_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);
  phi::UniformRandomInplaceGradInferMeta(MakeMetaTensor(*input_out_grad), min, max, seed, diag_num, diag_step, diag_val, &meta_out);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, float, float, int, int, int, float, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Profiled region: compute.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("uniform_random_inplace_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_out_grad, min, max, seed, diag_num, diag_step, diag_val, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU fallback context; hand the output to
    // TransDataBackend together with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "unsqueeze_grad" phi kernel and stores its result in
// `x_grad`.
//
// The kernel key (backend, layout, dtype) is derived from `xshape` and
// `out_grad`, and the kernel is looked up with SelectKernelOrThrowError.
// Output meta is inferred by phi::KernelWithXShapeInferMeta from `xshape`.
//
// Note: `axes` is part of the API signature but is not read here; neither
// InferMeta nor the kernel receives it.
//
// Profiler events are owned by std::unique_ptr so that they are destroyed
// even when InferMeta or the kernel call throws.
PADDLE_API void unsqueeze_grad(const Tensor& xshape, const Tensor& out_grad, const IntArray& axes, Tensor* x_grad) {
  // Derive the kernel key from the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(xshape, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "unsqueeze_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "unsqueeze_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "unsqueeze_grad kernel: " << kernel;
  // When the selected kernel is a CPU fallback, run it on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_xshape = PrepareData(xshape, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the prepared input shapes to the profiler.
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"xshape", {(*input_xshape).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("unsqueeze_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Profiled region: infer_meta.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("unsqueeze_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);
  phi::KernelWithXShapeInferMeta(MakeMetaTensor(*input_xshape), &meta_out);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Profiled region: compute.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("unsqueeze_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_xshape, *input_out_grad, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU fallback context; hand the output to
    // TransDataBackend together with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "unstack_grad" phi kernel and stores its result in `x_grad`.
//
// The kernel key (backend, layout, dtype) is derived from the `out_grad`
// tensor list, and the kernel is looked up with SelectKernelOrThrowError.
// The prepared tensors are exposed to InferMeta and to the kernel as vectors
// of pointers. Output meta is inferred by phi::UnStackGradInferMeta from the
// input metas and `axis`; `axis` is also forwarded to the kernel.
//
// Profiler events are owned by std::unique_ptr so that they are destroyed
// even when InferMeta or the kernel call throws.
PADDLE_API void unstack_grad(const std::vector<Tensor>& out_grad, int axis, Tensor* x_grad) {
  // Derive the kernel key from the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "unstack_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "unstack_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "unstack_grad kernel: " << kernel;
  // When the selected kernel is a CPU fallback, run it on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  // Pointer view over the prepared tensors; the elements are owned by
  // input_out_grad_vec, which lives until return.
  auto input_out_grad_vec = PrepareData(out_grad, kernel.InputAt(0), {});
  std::vector<const phi::DenseTensor*> input_out_grad(input_out_grad_vec->size());
  for (size_t i = 0; i < input_out_grad.size(); ++i) {
    input_out_grad[i] = &input_out_grad_vec->at(i);
  }
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the prepared input shapes to the profiler.
    std::vector<phi::DDim> ddims_vec;
    ddims_vec.reserve(input_out_grad.size());
    for (size_t i = 0; i < input_out_grad.size(); ++i) {
      ddims_vec.emplace_back((*input_out_grad[i]).dims());
    }
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes;
    input_shapes.emplace_back("out_grad", ddims_vec);
    platform::RecordOpInfoSupplement("unstack_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // Profiled region: infer_meta.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("unstack_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  // InferMeta takes pointers to MetaTensors, so keep the MetaTensor objects
  // alive in out_grad_meta_vec while out_grad_metas points into it.
  auto out_grad_meta_vec = MakeMetaTensor(input_out_grad);
  std::vector<const phi::MetaTensor*> out_grad_metas(out_grad_meta_vec.size());
  for (size_t i = 0; i < out_grad_meta_vec.size(); ++i) {
    out_grad_metas[i] = &out_grad_meta_vec[i];
  }
  phi::MetaTensor meta_out(kernel_out);
  phi::UnStackGradInferMeta(out_grad_metas, axis, &meta_out);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const std::vector<const phi::DenseTensor*>&, int, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Profiled region: compute.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("unstack_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, input_out_grad, axis, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU fallback context; hand the output to
    // TransDataBackend together with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Dispatches the "warpctc_grad" phi kernel and stores its result in
// `logits_grad`.
//
// The kernel key (backend, layout, dtype) is derived from `logits`,
// `logits_length`, `warpctcgrad` and `loss_grad`, and the kernel is looked up
// with SelectKernelOrThrowError. Output meta is inferred by
// phi::UnchangedInferMeta from `logits`. `logits_length` is optional and is
// passed to the kernel as an optional; `blank` and `norm_by_times` are
// forwarded unchanged.
//
// Profiler events are owned by std::unique_ptr so that they are destroyed
// even when InferMeta or the kernel call throws.
PADDLE_API void warpctc_grad(const Tensor& logits, const paddle::optional<Tensor>& logits_length, const Tensor& warpctcgrad, const Tensor& loss_grad, int blank, bool norm_by_times, Tensor* logits_grad) {
  // Derive the kernel key from the tensor inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(logits, logits_length, warpctcgrad, loss_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  Backend kernel_backend = kernel_key.backend();
  DataLayout kernel_layout = kernel_key.layout();
  DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "warpctc_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "warpctc_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "warpctc_grad kernel: " << kernel;
  // When the selected kernel is a CPU fallback, run it on the CPU context.
  auto* dev_ctx = GetDeviceContextByBackend(
      kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_logits = PrepareData(logits, kernel.InputAt(0), {});
  auto input_logits_length = PrepareData(logits_length, kernel.InputAt(1), {});
  auto input_warpctcgrad = PrepareData(warpctcgrad, kernel.InputAt(2), {});
  auto input_loss_grad = PrepareData(loss_grad, kernel.InputAt(3), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the prepared input shapes to the profiler. An absent
    // logits_length is recorded with an empty shape list.
    std::vector<phi::DDim> logits_length_record_shapes;
    if (input_logits_length) {
      logits_length_record_shapes.push_back((*input_logits_length).dims());
    }
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"logits", {(*input_logits).dims()}},
        {"logits_length", logits_length_record_shapes},
        {"warpctcgrad", {(*input_warpctcgrad).dims()}},
        {"loss_grad", {(*input_loss_grad).dims()}}};
    platform::RecordOpInfoSupplement("warpctc_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(logits_grad);

  // Profiled region: infer_meta.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event.reset(new paddle::platform::RecordEvent("warpctc_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  phi::MetaTensor meta_out(kernel_out);
  phi::UnchangedInferMeta(MakeMetaTensor(*input_logits), &meta_out);
  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const paddle::optional<phi::DenseTensor>&, const phi::DenseTensor&, const phi::DenseTensor&, int, bool, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();

  // Profiled region: compute.
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event.reset(new paddle::platform::RecordEvent("warpctc_grad compute", paddle::platform::TracerEventType::OperatorInner, 1));
  }
  (*kernel_fn)(*dev_ctx, *input_logits, input_logits_length, *input_warpctcgrad, *input_loss_grad, blank, norm_by_times, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    // The kernel ran on the CPU fallback context; hand the output to
    // TransDataBackend together with the originally selected backend.
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Runs the "where_grad" phi kernel on the given inputs.
//
// Flow: derive the kernel key (backend, layout, dtype) from the inputs, pick
// the kernel via SelectKernelOrThrowError, prepare the inputs, infer output
// meta with GeneralBinaryGradInferMeta (from x and y), invoke the kernel, and
// when the selected kernel is a CPU fallback hand the outputs to
// TransDataBackend together with the originally requested backend.
//
// Params:
//   condition, x, y, out_grad - kernel inputs, forwarded in this order.
//   x_grad, y_grad            - destinations for the two kernel outputs. A
//                               null kernel output is forwarded to InferMeta
//                               as a null MetaTensor pointer.
PADDLE_API void where_grad(const Tensor& condition, const Tensor& x, const Tensor& y, const Tensor& out_grad, Tensor* x_grad, Tensor* y_grad) {
  // Nothing pins backend, layout or dtype for this op, so all three are taken
  // from the highest-priority kernel key parsed from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(condition, x, y, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "where_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "where_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "where_grad kernel: " << kernel;
  // On CPU fallback the kernel runs with the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_condition = PrepareData(condition, kernel.InputAt(0), {});
  auto input_x = PrepareData(x, kernel.InputAt(1), {});
  auto input_y = PrepareData(y, kernel.InputAt(2), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(3), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the input shapes to the profiler supplement.
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"condition", {(*input_condition).dims()}},
        {"x", {(*input_x).dims()}},
        {"y", {(*input_y).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("where_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(y_grad);

  // The profiler events are owned by unique_ptr so they are destroyed even if
  // InferMeta or the kernel throws; reset() ends each event at the same point
  // where the event was previously deleted by hand.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("where_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);

  phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_y), kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("where_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_condition, *input_x, *input_y, *input_out_grad, kernel_out_0, kernel_out_1);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
  }
}

// Runs the "yolov3_loss_grad" phi kernel on the given inputs.
//
// Flow: derive the kernel key (backend, layout, dtype) from the tensor
// inputs, pick the kernel via SelectKernelOrThrowError, prepare the inputs,
// infer output meta with Yolov3LossGradInferMeta, invoke the kernel, and when
// the selected kernel is a CPU fallback hand the outputs to TransDataBackend
// together with the originally requested backend.
//
// Params:
//   x, gt_box, gt_label, gt_score, objectness_mask, gt_match_mask, loss_grad
//       - tensor inputs, forwarded in this order; gt_score is optional and is
//         passed through as an optional.
//   anchors, anchor_mask, class_num, ignore_thresh, downsample_ratio,
//   use_label_smooth, scale_x_y
//       - attributes forwarded unchanged to InferMeta and to the kernel.
//   x_grad, gt_box_grad, gt_label_grad, gt_score_grad
//       - destinations for the four kernel outputs. A null kernel output is
//         forwarded to InferMeta as a null MetaTensor pointer.
PADDLE_API void yolov3_loss_grad(const Tensor& x, const Tensor& gt_box, const Tensor& gt_label, const paddle::optional<Tensor>& gt_score, const Tensor& objectness_mask, const Tensor& gt_match_mask, const Tensor& loss_grad, const std::vector<int>& anchors, const std::vector<int>& anchor_mask, int class_num, float ignore_thresh, int downsample_ratio, bool use_label_smooth, float scale_x_y, Tensor* x_grad, Tensor* gt_box_grad, Tensor* gt_label_grad, Tensor* gt_score_grad) {
  // Nothing pins backend, layout or dtype for this op, so all three are taken
  // from the highest-priority kernel key parsed from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, gt_box, gt_label, gt_score, objectness_mask, gt_match_mask, loss_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "yolov3_loss_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "yolov3_loss_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "yolov3_loss_grad kernel: " << kernel;
  // On CPU fallback the kernel runs with the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_gt_box = PrepareData(gt_box, kernel.InputAt(1), {});
  auto input_gt_label = PrepareData(gt_label, kernel.InputAt(2), {});
  auto input_gt_score = PrepareData(gt_score, kernel.InputAt(3), {});
  auto input_objectness_mask = PrepareData(objectness_mask, kernel.InputAt(4), {});
  auto input_gt_match_mask = PrepareData(gt_match_mask, kernel.InputAt(5), {});
  auto input_loss_grad = PrepareData(loss_grad, kernel.InputAt(6), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // gt_score is optional: its shape list stays empty when it is absent.
    std::vector<phi::DDim> gt_score_record_shapes;
    if (input_gt_score) {
      gt_score_record_shapes.push_back((*input_gt_score).dims());
    }
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"gt_box", {(*input_gt_box).dims()}},
        {"gt_label", {(*input_gt_label).dims()}},
        {"gt_score", gt_score_record_shapes},
        {"objectness_mask", {(*input_objectness_mask).dims()}},
        {"gt_match_mask", {(*input_gt_match_mask).dims()}},
        {"loss_grad", {(*input_loss_grad).dims()}}};
    platform::RecordOpInfoSupplement("yolov3_loss_grad", input_shapes);
  }

  auto kernel_out_0 = SetKernelOutput(x_grad);
  auto kernel_out_1 = SetKernelOutput(gt_box_grad);
  auto kernel_out_2 = SetKernelOutput(gt_label_grad);
  auto kernel_out_3 = SetKernelOutput(gt_score_grad);

  // The profiler events are owned by unique_ptr so they are destroyed even if
  // InferMeta or the kernel throws; reset() ends each event at the same point
  // where the event was previously deleted by hand.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("yolov3_loss_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);
  phi::MetaTensor meta_out_2(kernel_out_2);
  phi::MetaTensor meta_out_3(kernel_out_3);

  phi::Yolov3LossGradInferMeta(MakeMetaTensor(*input_x), MakeMetaTensor(*input_gt_box), MakeMetaTensor(*input_gt_label), MakeMetaTensor(input_gt_score), MakeMetaTensor(*input_objectness_mask), MakeMetaTensor(*input_gt_match_mask), MakeMetaTensor(*input_loss_grad), anchors, anchor_mask, class_num, ignore_thresh, downsample_ratio, use_label_smooth, scale_x_y, kernel_out_0 ? &meta_out_0 : nullptr, kernel_out_1 ? &meta_out_1 : nullptr, kernel_out_2 ? &meta_out_2 : nullptr, kernel_out_3 ? &meta_out_3 : nullptr);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const paddle::optional<phi::DenseTensor>&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const std::vector<int>&, const std::vector<int>&, int, float, int, bool, float, phi::DenseTensor*, phi::DenseTensor*, phi::DenseTensor*, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("yolov3_loss_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_gt_box, *input_gt_label, input_gt_score, *input_objectness_mask, *input_gt_match_mask, *input_loss_grad, anchors, anchor_mask, class_num, ignore_thresh, downsample_ratio, use_label_smooth, scale_x_y, kernel_out_0, kernel_out_1, kernel_out_2, kernel_out_3);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out_0, kernel_backend, kernel_out_0);
    TransDataBackend(kernel_out_1, kernel_backend, kernel_out_1);
    TransDataBackend(kernel_out_2, kernel_backend, kernel_out_2);
    TransDataBackend(kernel_out_3, kernel_backend, kernel_out_3);
  }
}

// Runs the "fold_grad" phi kernel on the given inputs.
//
// Flow: derive the kernel key (backend, layout, dtype) from the inputs, pick
// the kernel via SelectKernelOrThrowError, prepare the inputs, infer output
// meta with UnchangedInferMeta (from x), invoke the kernel, and when the
// selected kernel is a CPU fallback hand the output to TransDataBackend
// together with the originally requested backend.
//
// Params:
//   x, out_grad - tensor inputs, forwarded in this order.
//   output_sizes, kernel_sizes, strides, paddings, dilations
//               - attributes forwarded unchanged to the kernel.
//   x_grad      - destination for the kernel output.
PADDLE_API void fold_grad(const Tensor& x, const Tensor& out_grad, const std::vector<int>& output_sizes, const std::vector<int>& kernel_sizes, const std::vector<int>& strides, const std::vector<int>& paddings, const std::vector<int>& dilations, Tensor* x_grad) {
  // Nothing pins backend, layout or dtype for this op, so all three are taken
  // from the highest-priority kernel key parsed from the inputs.
  auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  const DataType kernel_data_type = kernel_key.dtype();

  VLOG(6) << "fold_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "fold_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "fold_grad kernel: " << kernel;
  // On CPU fallback the kernel runs with the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the input shapes to the profiler supplement.
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("fold_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler events are owned by unique_ptr so they are destroyed even if
  // InferMeta or the kernel throws; reset() ends each event at the same point
  // where the event was previously deleted by hand.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("fold_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const std::vector<int>&, const std::vector<int>&, const std::vector<int>&, const std::vector<int>&, const std::vector<int>&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("fold_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_out_grad, output_sizes, kernel_sizes, strides, paddings, dilations, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Runs the "unpool3d_grad" phi kernel on the given inputs.
//
// Flow: take the kernel dtype from x (via ParseDataType) and the backend and
// layout from the inputs' kernel key, pick the kernel via
// SelectKernelOrThrowError, prepare the inputs, infer output meta with
// UnchangedInferMeta (from x), invoke the kernel, and when the selected
// kernel is a CPU fallback hand the output to TransDataBackend together with
// the originally requested backend.
//
// Params:
//   x, indices, out, out_grad - tensor inputs, forwarded in this order.
//   ksize, strides, padding, output_size, data_format
//                             - attributes forwarded unchanged to the kernel.
//   x_grad                    - destination for the kernel output.
PADDLE_API void unpool3d_grad(const Tensor& x, const Tensor& indices, const Tensor& out, const Tensor& out_grad, const std::vector<int>& ksize, const std::vector<int>& strides, const std::vector<int>& padding, const std::vector<int>& output_size, const std::string& data_format, Tensor* x_grad) {
  // The dtype is driven by x; backend and layout are never pinned, so they
  // always come from the highest-priority kernel key of the inputs.
  DataType kernel_data_type = ParseDataType(x);

  auto kernel_key_set = ParseKernelKeyByInputArgs(x, indices, out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    // Fall back to the kernel key's dtype when x did not yield one.
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "unpool3d_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "unpool3d_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "unpool3d_grad kernel: " << kernel;
  // On CPU fallback the kernel runs with the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_indices = PrepareData(indices, kernel.InputAt(1), {});
  auto input_out = PrepareData(out, kernel.InputAt(2), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(3), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the input shapes to the profiler supplement.
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"indices", {(*input_indices).dims()}},
        {"out", {(*input_out).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("unpool3d_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler events are owned by unique_ptr so they are destroyed even if
  // InferMeta or the kernel throws; reset() ends each event at the same point
  // where the event was previously deleted by hand.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("unpool3d_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const std::vector<int>&, const std::vector<int>&, const std::vector<int>&, const std::vector<int>&, const std::string&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("unpool3d_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_indices, *input_out, *input_out_grad, ksize, strides, padding, output_size, data_format, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}

// Runs the "unpool_grad" phi kernel on the given inputs.
//
// Flow: take the kernel dtype from x (via ParseDataType) and the backend and
// layout from the inputs' kernel key, pick the kernel via
// SelectKernelOrThrowError, prepare the inputs, infer output meta with
// UnchangedInferMeta (from x), invoke the kernel, and when the selected
// kernel is a CPU fallback hand the output to TransDataBackend together with
// the originally requested backend.
//
// Params:
//   x, indices, out, out_grad - tensor inputs, forwarded in this order.
//   ksize, strides, padding, data_format
//                             - attributes forwarded unchanged to the kernel.
//   output_size               - passed to the kernel as a phi::IntArray
//                               constructed from this argument.
//   x_grad                    - destination for the kernel output.
PADDLE_API void unpool_grad(const Tensor& x, const Tensor& indices, const Tensor& out, const Tensor& out_grad, const std::vector<int>& ksize, const std::vector<int>& strides, const std::vector<int>& padding, const IntArray& output_size, const std::string& data_format, Tensor* x_grad) {
  // The dtype is driven by x; backend and layout are never pinned, so they
  // always come from the highest-priority kernel key of the inputs.
  DataType kernel_data_type = ParseDataType(x);

  auto kernel_key_set = ParseKernelKeyByInputArgs(x, indices, out, out_grad);
  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
  const Backend kernel_backend = kernel_key.backend();
  const DataLayout kernel_layout = kernel_key.layout();
  if (kernel_data_type == DataType::UNDEFINED) {
    // Fall back to the kernel key's dtype when x did not yield one.
    kernel_data_type = kernel_key.dtype();
  }

  VLOG(6) << "unpool_grad API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      "unpool_grad", {kernel_backend, kernel_layout, kernel_data_type});
  const auto& kernel = kernel_result.kernel;
  VLOG(6) << "unpool_grad kernel: " << kernel;
  // On CPU fallback the kernel runs with the CPU device context.
  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);

  auto input_x = PrepareData(x, kernel.InputAt(0), {});
  auto input_indices = PrepareData(indices, kernel.InputAt(1), {});
  auto input_out = PrepareData(out, kernel.InputAt(2), {});
  auto input_out_grad = PrepareData(out_grad, kernel.InputAt(3), {});
  if (platform::RecordOpInfoSupplement::IsEnabled()) {
    // Report the input shapes to the profiler supplement.
    std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
        {"x", {(*input_x).dims()}},
        {"indices", {(*input_indices).dims()}},
        {"out", {(*input_out).dims()}},
        {"out_grad", {(*input_out_grad).dims()}}};
    platform::RecordOpInfoSupplement("unpool_grad", input_shapes);
  }

  auto kernel_out = SetKernelOutput(x_grad);

  // The profiler events are owned by unique_ptr so they are destroyed even if
  // InferMeta or the kernel throws; reset() ends each event at the same point
  // where the event was previously deleted by hand.
  std::unique_ptr<paddle::platform::RecordEvent> infer_shape_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    infer_shape_record_event = std::make_unique<paddle::platform::RecordEvent>("unpool_grad infer_meta", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  phi::MetaTensor meta_out(kernel_out);

  phi::UnchangedInferMeta(MakeMetaTensor(*input_x), &meta_out);

  infer_shape_record_event.reset();

  using kernel_signature = void(*)(const platform::DeviceContext&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, const std::vector<int>&, const std::vector<int>&, const std::vector<int>&, const phi::IntArray&, const std::string&, phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  std::unique_ptr<paddle::platform::RecordEvent> kernel_record_event;
  if (paddle::platform::RecordEvent::IsEnabled()) {
    kernel_record_event = std::make_unique<paddle::platform::RecordEvent>("unpool_grad compute", paddle::platform::TracerEventType::OperatorInner, 1);
  }
  (*kernel_fn)(*dev_ctx, *input_x, *input_indices, *input_out, *input_out_grad, ksize, strides, padding, phi::IntArray(output_size), data_format, kernel_out);
  kernel_record_event.reset();

  if (kernel_result.has_fallback_cpu) {
    TransDataBackend(kernel_out, kernel_backend, kernel_out);
  }
}


}  // namespace experimental
}  // namespace paddle
