Commit dbe08e9b authored by yuguo960516yuguo

2.4.2

parent b5499578
@@ -107,15 +107,21 @@ class FusedGemmEpilogueKernel : public framework::OpKernel<T> {
             sizeof(bias_data)));
 
     if (enable_auxiliary && activation != "none") {
-      size_t reserve_space_size = 0;
+      // Note (Ming Huang): The initialization of ReserveSpace happens in
+      // dev_ctx.Alloc. Therefore, we set the real data type up here.
       if (activation == "relu") {
-        // Count in bits.
-        reserve_space_size = phi::product(out->dims()) / 8;
+        paddle::experimental::DataType rs_type =
+            paddle::experimental::DataType::BOOL;
+        size_t reserve_space_size =
+            phi::product(reserve_space->dims()) * SizeOf(rs_type);
+        dev_ctx.Alloc(reserve_space, rs_type, reserve_space_size);
       } else {
-        reserve_space_size = phi::product(out->dims()) * sizeof(T);
+        size_t reserve_space_size =
+            phi::product(reserve_space->dims()) * sizeof(T);
+        dev_ctx.Alloc<T>(reserve_space, reserve_space_size);
       }
-      dev_ctx.Alloc(reserve_space, out->type(), reserve_space_size);
-      void* aux_data = reinterpret_cast<void*>(reserve_space->data<T>());
+
+      void* aux_data = reserve_space->data();
 
       PADDLE_ENFORCE_GPU_SUCCESS(
           platform::dynload::cublasLtMatmulDescSetAttribute(
@@ -185,7 +191,6 @@ class FusedGemmEpilogueKernel : public framework::OpKernel<T> {
               stream,
               workspace->ptr(),
               workspace_size);
-
     PADDLE_ENFORCE_GPU_SUCCESS(
         platform::dynload::cublasLtMatmul(lt_handle,
                                           operation_desc,
@@ -478,7 +483,7 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel<T> {
             sizeof(epiloque_func_for_dx)));
 
     if (activation_grad != "none") {
-      auto* aux_data = reserve_space->data<T>();
+      auto* aux_data = reserve_space->data();
       PADDLE_ENFORCE_GPU_SUCCESS(
           platform::dynload::cublasLtMatmulDescSetAttribute(
               dx_operation_desc,
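For context on the first hunk above: the ReLU auxiliary buffer switches from a bit-packed mask (one bit per output element, hence the divide by eight) to a one-byte-per-element BOOL allocation sized from reserve_space->dims(). A minimal standalone sketch of the two size computations, with numel standing in for phi::product of the relevant dims (hypothetical helpers, not part of the patch):

#include <cstddef>

// Old sizing: the ReLU mask was counted in bits, so bytes = numel / 8.
size_t relu_aux_bytes_bitmask(size_t numel) { return numel / 8; }

// New sizing: one BOOL (one byte) per element of the ReserveSpace tensor.
size_t relu_aux_bytes_bool(size_t numel) { return numel * sizeof(bool); }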
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
using Tensor = phi::DenseTensor;
template <typename T>
class HuberLossMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = GetDevCtxFromCTX(ctx);
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* residual = ctx.Output<Tensor>("Residual");
auto* out = ctx.Output<Tensor>("Out");
auto delta = ctx.Attr<float>("delta");
auto place = ctx.GetPlace();
// compute y-x
cnnlDataType_t data_type = ToCnnlDataType<T>();
residual->mutable_data<T>(x->dims(), place);
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlOpTensorDesc sub_op_desc(
CNNL_OP_TENSOR_SUB, data_type, CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
sub_op_desc.get(),
x_desc.get(),
GetBasePtr(y),
x_desc.get(),
GetBasePtr(x),
x_desc.get(),
GetBasePtr(residual),
data_type);
// compute smoothl1loss
out->mutable_data<T>(x->dims(), place);
cnnlSmoothL1LossAlgorithm_t smoothl1_algo =
CNNL_SMOOTHL1LOSS_REDUCTION_NONE; // defines whether to do reduction
// here
MLUCnnl::SmoothL1LossForward(ctx,
x_desc.get(),
GetBasePtr(x),
x_desc.get(), /* target has same shape as x */
GetBasePtr(y),
static_cast<float>(delta),
smoothl1_algo,
x_desc.get(), /* out has same shape as x */
GetBasePtr(out));
    // scale by delta to turn the SmoothL1 values into Huber loss values
Tensor scale_tensor, bias_tensor;
scale_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
bias_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
FillMLUTensorWithHostValue(ctx, static_cast<T>(delta), &scale_tensor);
FillMLUTensorWithHostValue(ctx, static_cast<T>(0.f), &bias_tensor);
const int axis = std::max(out->dims().size() - 1, 0);
MLUCnnlTensorDesc scale_desc(scale_tensor);
MLUCnnlTensorDesc bias_desc(bias_tensor);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::Scale(ctx,
axis,
out_desc.get(),
GetBasePtr(out),
scale_desc.get(),
GetBasePtr(&scale_tensor),
bias_desc.get(),
GetBasePtr(&bias_tensor),
out_desc.get(),
GetBasePtr(out));
}
};
template <typename T>
class HuberLossGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = GetDevCtxFromCTX(ctx);
auto* residual = ctx.Input<Tensor>("Residual");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
auto delta = ctx.Attr<float>("delta");
auto place = ctx.GetPlace();
Tensor t_grad_rd;
t_grad_rd =
ctx.AllocateTmpTensor<T, MLUDeviceContext>(residual->dims(), dev_ctx);
MLUCnnlTensorDesc t_grad_rd_desc(t_grad_rd);
if (dx || dy) {
Tensor t_zero;
t_zero =
ctx.AllocateTmpTensor<T, MLUDeviceContext>(residual->dims(), dev_ctx);
FillMLUTensorWithHostValue(ctx, static_cast<T>(0.f), &t_zero);
MLUCnnlTensorDesc residual_desc(*residual);
MLUCnnlTensorDesc dout_desc(*dout);
cnnlSmoothL1LossAlgorithm_t smoothl1_algo =
CNNL_SMOOTHL1LOSS_REDUCTION_NONE; // defines whether to do reduction
// here
MLUCnnl::SmoothL1LossBackward(ctx,
residual_desc.get(),
GetBasePtr(residual),
residual_desc.get(),
GetBasePtr(&t_zero),
dout_desc.get(),
GetBasePtr(dout),
static_cast<float>(delta),
smoothl1_algo,
t_grad_rd_desc.get(),
GetBasePtr(&t_grad_rd));
}
    // scale the SmoothL1 gradient by +/-delta below
Tensor scale_tensor, bias_tensor;
scale_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
bias_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
FillMLUTensorWithHostValue(ctx, static_cast<T>(0.f), &bias_tensor);
const int axis = std::max(t_grad_rd.dims().size() - 1, 0);
MLUCnnlTensorDesc scale_desc(scale_tensor);
MLUCnnlTensorDesc bias_desc(bias_tensor);
if (dx) {
dx->mutable_data<T>(place);
FillMLUTensorWithHostValue(ctx, static_cast<T>(-delta), &scale_tensor);
MLUCnnlTensorDesc out_desc(*dx);
MLUCnnl::Scale(ctx,
axis,
t_grad_rd_desc.get(),
GetBasePtr(&t_grad_rd),
scale_desc.get(),
GetBasePtr(&scale_tensor),
bias_desc.get(),
GetBasePtr(&bias_tensor),
out_desc.get(),
GetBasePtr(dx));
}
if (dy) {
dy->mutable_data<T>(place);
FillMLUTensorWithHostValue(ctx, static_cast<T>(delta), &scale_tensor);
MLUCnnlTensorDesc out_desc(*dy);
MLUCnnl::Scale(ctx,
axis,
t_grad_rd_desc.get(),
GetBasePtr(&t_grad_rd),
scale_desc.get(),
GetBasePtr(&scale_tensor),
bias_desc.get(),
GetBasePtr(&bias_tensor),
out_desc.get(),
GetBasePtr(dy));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(huber_loss,
ops::HuberLossMLUKernel<float>,
ops::HuberLossMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(huber_loss_grad,
ops::HuberLossGradMLUKernel<float>,
ops::HuberLossGradMLUKernel<plat::float16>);
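For reference, the kernels above compose Huber loss from CNNL's SmoothL1 primitives plus a final Scale by delta: with residual r = y - x and beta = delta, SmoothL1 gives 0.5*r^2/delta for |r| < delta and |r| - 0.5*delta otherwise, so multiplying by delta recovers the usual Huber values. A minimal scalar sketch of the per-element math, under the assumption that cnnlSmoothL1Loss* treats the delta argument as that transition point (hypothetical helpers, not part of the file):

#include <algorithm>
#include <cmath>

// Forward path: residual = y - x, then delta * smooth_l1(residual, beta = delta),
// i.e. 0.5*r^2 inside |r| < delta and delta*|r| - 0.5*delta^2 outside.
float huber_forward_ref(float x, float y, float delta) {
  float r = y - x;
  float a = std::fabs(r);
  float smooth_l1 = (a < delta) ? 0.5f * r * r / delta : a - 0.5f * delta;
  return delta * smooth_l1;
}

// Backward path: the SmoothL1 gradient w.r.t. the residual is clipped to
// [-1, 1]; scaling by -delta / +delta matches the dX / dY branches above,
// since residual = y - x.
void huber_backward_ref(float residual, float dout, float delta,
                        float* dx, float* dy) {
  float g = std::max(-1.0f, std::min(1.0f, residual / delta)) * dout;
  if (dx) *dx = -delta * g;
  if (dy) *dy = delta * g;
}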
@@ -39,6 +39,14 @@ cc_test(
   SRCS test.cc
   DEPS jit_kernel_helper)
 if(NOT WIN32)
+  set(cuda_less12_and_gcc_greater12 false)
+  if(DEFINED CMAKE_CUDA_COMPILER_VERSION)
+    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 12.0
+       AND ${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 12.0)
+      set(cuda_less12_and_gcc_greater12 true)
+    endif()
+  endif()
+  if(NOT cuda_less12_and_gcc_greater12)
   cc_binary(
     jit_kernel_benchmark
     SRCS
@@ -47,6 +55,7 @@ if(NOT WIN32)
     jit_kernel_helper
     device_tracer
     tensor)
+  endif()
 endif()
 if(WITH_TESTING AND TEST jit_kernel_test)
   set_tests_properties(jit_kernel_test PROPERTIES TIMEOUT 120)
@@ -24,7 +24,8 @@
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/sum_op.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/mkldnn_reuse.h"
 
 namespace phi {
@@ -37,6 +38,9 @@ namespace operators {
 using paddle::platform::MKLDNNDeviceContext;
 using phi::CPUContext;
 using platform::to_void_cast;
+using Tensor = framework::Tensor;
+using SelectedRows = phi::SelectedRows;
+using LoDTensor = framework::LoDTensor;
 
 template <typename T>
 class SumMKLDNNHandler
@@ -97,4 +97,6 @@ class OneHotV2MLUKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
-REGISTER_OP_MLU_KERNEL(one_hot_v2, ops::OneHotV2MLUKernel<int32_t>);
+REGISTER_OP_MLU_KERNEL(one_hot_v2,
+                       ops::OneHotV2MLUKernel<int32_t>,
+                       ops::OneHotV2MLUKernel<int64_t>);