Commit dbe08e9b authored by yuguo960516yuguo

2.4.2

parent b5499578
@@ -107,15 +107,21 @@ class FusedGemmEpilogueKernel : public framework::OpKernel<T> {
             sizeof(bias_data)));
 
     if (enable_auxiliary && activation != "none") {
-      size_t reserve_space_size = 0;
+      // Note (Ming Huang): The initialization of ReserveSpace happens in
+      // dev_ctx.Alloc. Therefore, we set the real data type up here.
       if (activation == "relu") {
-        // Count in bits.
-        reserve_space_size = phi::product(out->dims()) / 8;
+        paddle::experimental::DataType rs_type =
+            paddle::experimental::DataType::BOOL;
+        size_t reserve_space_size =
+            phi::product(reserve_space->dims()) * SizeOf(rs_type);
+        dev_ctx.Alloc(reserve_space, rs_type, reserve_space_size);
       } else {
-        reserve_space_size = phi::product(out->dims()) * sizeof(T);
+        size_t reserve_space_size =
+            phi::product(reserve_space->dims()) * sizeof(T);
+        dev_ctx.Alloc<T>(reserve_space, reserve_space_size);
       }
-      dev_ctx.Alloc(reserve_space, out->type(), reserve_space_size);
-      void* aux_data = reinterpret_cast<void*>(reserve_space->data<T>());
+
+      void* aux_data = reserve_space->data();
 
       PADDLE_ENFORCE_GPU_SUCCESS(
           platform::dynload::cublasLtMatmulDescSetAttribute(
@@ -185,7 +191,6 @@ class FusedGemmEpilogueKernel : public framework::OpKernel<T> {
               stream,
               workspace->ptr(),
               workspace_size);
-
     PADDLE_ENFORCE_GPU_SUCCESS(
         platform::dynload::cublasLtMatmul(lt_handle,
                                           operation_desc,
@@ -478,7 +483,7 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel<T> {
             sizeof(epiloque_func_for_dx)));
 
     if (activation_grad != "none") {
-      auto* aux_data = reserve_space->data<T>();
+      auto* aux_data = reserve_space->data();
       PADDLE_ENFORCE_GPU_SUCCESS(
           platform::dynload::cublasLtMatmulDescSetAttribute(
               dx_operation_desc,
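For context on the first hunk above: the ReLU auxiliary buffer switches from a bit-packed mask (one bit per output element, hence the divide by eight) to a one-byte-per-element BOOL allocation sized from reserve_space->dims(). A minimal standalone sketch of the two size computations, with numel standing in for phi::product of the relevant dims (hypothetical helpers, not part of the patch):

#include <cstddef>

// Old sizing: the ReLU mask was counted in bits, so bytes = numel / 8.
size_t relu_aux_bytes_bitmask(size_t numel) { return numel / 8; }

// New sizing: one BOOL (one byte) per element of the ReserveSpace tensor.
size_t relu_aux_bytes_bool(size_t numel) { return numel * sizeof(bool); }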
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
using Tensor = phi::DenseTensor;
template <typename T>
class HuberLossMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = GetDevCtxFromCTX(ctx);
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* residual = ctx.Output<Tensor>("Residual");
auto* out = ctx.Output<Tensor>("Out");
auto delta = ctx.Attr<float>("delta");
auto place = ctx.GetPlace();
// compute y-x
cnnlDataType_t data_type = ToCnnlDataType<T>();
residual->mutable_data<T>(x->dims(), place);
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlOpTensorDesc sub_op_desc(
CNNL_OP_TENSOR_SUB, data_type, CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
sub_op_desc.get(),
x_desc.get(),
GetBasePtr(y),
x_desc.get(),
GetBasePtr(x),
x_desc.get(),
GetBasePtr(residual),
data_type);
// compute smoothl1loss
out->mutable_data<T>(x->dims(), place);
cnnlSmoothL1LossAlgorithm_t smoothl1_algo =
CNNL_SMOOTHL1LOSS_REDUCTION_NONE; // defines whether to do reduction
// here
MLUCnnl::SmoothL1LossForward(ctx,
x_desc.get(),
GetBasePtr(x),
x_desc.get(), /* target has same shape as x */
GetBasePtr(y),
static_cast<float>(delta),
smoothl1_algo,
x_desc.get(), /* out has same shape as x */
GetBasePtr(out));
    // scale by delta to turn the SmoothL1 values into Huber loss values
Tensor scale_tensor, bias_tensor;
scale_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
bias_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
FillMLUTensorWithHostValue(ctx, static_cast<T>(delta), &scale_tensor);
FillMLUTensorWithHostValue(ctx, static_cast<T>(0.f), &bias_tensor);
const int axis = std::max(out->dims().size() - 1, 0);
MLUCnnlTensorDesc scale_desc(scale_tensor);
MLUCnnlTensorDesc bias_desc(bias_tensor);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::Scale(ctx,
axis,
out_desc.get(),
GetBasePtr(out),
scale_desc.get(),
GetBasePtr(&scale_tensor),
bias_desc.get(),
GetBasePtr(&bias_tensor),
out_desc.get(),
GetBasePtr(out));
}
};
template <typename T>
class HuberLossGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = GetDevCtxFromCTX(ctx);
auto* residual = ctx.Input<Tensor>("Residual");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
auto delta = ctx.Attr<float>("delta");
auto place = ctx.GetPlace();
Tensor t_grad_rd;
t_grad_rd =
ctx.AllocateTmpTensor<T, MLUDeviceContext>(residual->dims(), dev_ctx);
MLUCnnlTensorDesc t_grad_rd_desc(t_grad_rd);
if (dx || dy) {
Tensor t_zero;
t_zero =
ctx.AllocateTmpTensor<T, MLUDeviceContext>(residual->dims(), dev_ctx);
FillMLUTensorWithHostValue(ctx, static_cast<T>(0.f), &t_zero);
MLUCnnlTensorDesc residual_desc(*residual);
MLUCnnlTensorDesc dout_desc(*dout);
cnnlSmoothL1LossAlgorithm_t smoothl1_algo =
CNNL_SMOOTHL1LOSS_REDUCTION_NONE; // defines whether to do reduction
// here
MLUCnnl::SmoothL1LossBackward(ctx,
residual_desc.get(),
GetBasePtr(residual),
residual_desc.get(),
GetBasePtr(&t_zero),
dout_desc.get(),
GetBasePtr(dout),
static_cast<float>(delta),
smoothl1_algo,
t_grad_rd_desc.get(),
GetBasePtr(&t_grad_rd));
}
    // scale the SmoothL1 gradient by +/-delta below
Tensor scale_tensor, bias_tensor;
scale_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
bias_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
FillMLUTensorWithHostValue(ctx, static_cast<T>(0.f), &bias_tensor);
const int axis = std::max(t_grad_rd.dims().size() - 1, 0);
MLUCnnlTensorDesc scale_desc(scale_tensor);
MLUCnnlTensorDesc bias_desc(bias_tensor);
if (dx) {
dx->mutable_data<T>(place);
FillMLUTensorWithHostValue(ctx, static_cast<T>(-delta), &scale_tensor);
MLUCnnlTensorDesc out_desc(*dx);
MLUCnnl::Scale(ctx,
axis,
t_grad_rd_desc.get(),
GetBasePtr(&t_grad_rd),
scale_desc.get(),
GetBasePtr(&scale_tensor),
bias_desc.get(),
GetBasePtr(&bias_tensor),
out_desc.get(),
GetBasePtr(dx));
}
if (dy) {
dy->mutable_data<T>(place);
FillMLUTensorWithHostValue(ctx, static_cast<T>(delta), &scale_tensor);
MLUCnnlTensorDesc out_desc(*dy);
MLUCnnl::Scale(ctx,
axis,
t_grad_rd_desc.get(),
GetBasePtr(&t_grad_rd),
scale_desc.get(),
GetBasePtr(&scale_tensor),
bias_desc.get(),
GetBasePtr(&bias_tensor),
out_desc.get(),
GetBasePtr(dy));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(huber_loss,
ops::HuberLossMLUKernel<float>,
ops::HuberLossMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(huber_loss_grad,
ops::HuberLossGradMLUKernel<float>,
ops::HuberLossGradMLUKernel<plat::float16>);
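For reference, the kernels above compose Huber loss from CNNL's SmoothL1 primitives plus a final Scale by delta: with residual r = y - x and beta = delta, SmoothL1 gives 0.5*r^2/delta for |r| < delta and |r| - 0.5*delta otherwise, so multiplying by delta recovers the usual Huber values. A minimal scalar sketch of the per-element math, under the assumption that cnnlSmoothL1Loss* treats the delta argument as that transition point (hypothetical helpers, not part of the file):

#include <algorithm>
#include <cmath>

// Forward path: residual = y - x, then delta * smooth_l1(residual, beta = delta),
// i.e. 0.5*r^2 inside |r| < delta and delta*|r| - 0.5*delta^2 outside.
float huber_forward_ref(float x, float y, float delta) {
  float r = y - x;
  float a = std::fabs(r);
  float smooth_l1 = (a < delta) ? 0.5f * r * r / delta : a - 0.5f * delta;
  return delta * smooth_l1;
}

// Backward path: the SmoothL1 gradient w.r.t. the residual is clipped to
// [-1, 1]; scaling by -delta / +delta matches the dX / dY branches above,
// since residual = y - x.
void huber_backward_ref(float residual, float dout, float delta,
                        float* dx, float* dy) {
  float g = std::max(-1.0f, std::min(1.0f, residual / delta)) * dout;
  if (dx) *dx = -delta * g;
  if (dy) *dy = delta * g;
}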
@@ -39,6 +39,14 @@ cc_test(
   SRCS test.cc
   DEPS jit_kernel_helper)
 if(NOT WIN32)
+  set(cuda_less12_and_gcc_greater12 false)
+  if(DEFINED CMAKE_CUDA_COMPILER_VERSION)
+    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 12.0
+       AND ${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 12.0)
+      set(cuda_less12_and_gcc_greater12 true)
+    endif()
+  endif()
+  if(NOT cuda_less12_and_gcc_greater12)
   cc_binary(
     jit_kernel_benchmark
     SRCS
@@ -47,6 +55,7 @@ if(NOT WIN32)
     jit_kernel_helper
     device_tracer
     tensor)
+  endif()
 endif()
 if(WITH_TESTING AND TEST jit_kernel_test)
   set_tests_properties(jit_kernel_test PROPERTIES TIMEOUT 120)
@@ -24,7 +24,8 @@
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/sum_op.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/mkldnn_reuse.h"
 
 namespace phi {
@@ -37,6 +38,9 @@ namespace operators {
 using paddle::platform::MKLDNNDeviceContext;
 using phi::CPUContext;
 using platform::to_void_cast;
+using Tensor = framework::Tensor;
+using SelectedRows = phi::SelectedRows;
+using LoDTensor = framework::LoDTensor;
 
 template <typename T>
 class SumMKLDNNHandler
@@ -97,4 +97,6 @@ class OneHotV2MLUKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
-REGISTER_OP_MLU_KERNEL(one_hot_v2, ops::OneHotV2MLUKernel<int32_t>);
+REGISTER_OP_MLU_KERNEL(one_hot_v2,
+                       ops::OneHotV2MLUKernel<int32_t>,
+                       ops::OneHotV2MLUKernel<int64_t>);