OpenDAS / Paddle / Commits

Commit dbe08e9b ("2.4.2"), authored Jun 12, 2023 by yuguo960516yuguo
Parent: b5499578
Changes: 302

Showing 20 changed files with 1225 additions and 673 deletions (+1225 −673).
  paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu   +13  −8
  paddle/fluid/operators/huber_loss_op_mlu.cc              +187 −0
  paddle/fluid/operators/jit/CMakeLists.txt                +17  −8
  paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc     +18  −15
  paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc           +12  −14
  paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc           +5   −1
  paddle/fluid/operators/mlu/mlu_baseop.cc                 +370 −11
  paddle/fluid/operators/mlu/mlu_baseop.h                  +187 −0
  paddle/fluid/operators/one_hot_v2_op_mlu.cc              +3   −1
  paddle/fluid/operators/optimizers/adam_op_mlu.cc         +270 −5
  paddle/fluid/operators/pool_op_mlu.cc                    +3   −4
  paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc   +110 −0
  paddle/fluid/operators/select_output_op.cc               +2   −1
  paddle/fluid/operators/strided_slice_op_mlu.cc           +5   −0
  paddle/fluid/operators/sum_op.cc                         +11  −99
  paddle/fluid/operators/sum_op.cu                         +0   −280
  paddle/fluid/operators/sum_op.h                          +0   −222
  paddle/fluid/operators/sum_op_mlu.cc                     +4   −1
  paddle/fluid/operators/sum_op_npu.cc                     +4   −1
  paddle/fluid/operators/sum_op_xpu.cc                     +4   −2
paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu

@@ -107,15 +107,21 @@ class FusedGemmEpilogueKernel : public framework::OpKernel<T> {
             sizeof(bias_data)));

     if (enable_auxiliary && activation != "none") {
-      size_t reserve_space_size = 0;
+      // Note (Ming Huang): The initialization of ReserveSpace happens in
+      // dev_ctx.Alloc. Therefore, we set the real data type up here.
       if (activation == "relu") {
-        // Count in bits.
-        reserve_space_size = phi::product(out->dims()) / 8;
+        paddle::experimental::DataType rs_type =
+            paddle::experimental::DataType::BOOL;
+        size_t reserve_space_size =
+            phi::product(reserve_space->dims()) * SizeOf(rs_type);
+        dev_ctx.Alloc(reserve_space, rs_type, reserve_space_size);
       } else {
-        reserve_space_size = phi::product(out->dims()) * sizeof(T);
+        size_t reserve_space_size =
+            phi::product(reserve_space->dims()) * sizeof(T);
+        dev_ctx.Alloc<T>(reserve_space, reserve_space_size);
       }
-      dev_ctx.Alloc(reserve_space, out->type(), reserve_space_size);
-      void* aux_data = reinterpret_cast<void*>(reserve_space->data<T>());
+      void* aux_data = reserve_space->data();

       PADDLE_ENFORCE_GPU_SUCCESS(
           platform::dynload::cublasLtMatmulDescSetAttribute(

@@ -185,7 +191,6 @@ class FusedGemmEpilogueKernel : public framework::OpKernel<T> {
                                        stream,
                                        workspace->ptr(),
                                        workspace_size);
-
     PADDLE_ENFORCE_GPU_SUCCESS(
         platform::dynload::cublasLtMatmul(lt_handle,
                                           operation_desc,

@@ -478,7 +483,7 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel<T> {
             sizeof(epiloque_func_for_dx)));

     if (activation_grad != "none") {
-      auto* aux_data = reserve_space->data<T>();
+      auto* aux_data = reserve_space->data();

       PADDLE_ENFORCE_GPU_SUCCESS(
           platform::dynload::cublasLtMatmulDescSetAttribute(
               dx_operation_desc,
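A note on the sizing above: with a "relu" epilogue, cuBLASLt records the activation mask in the auxiliary buffer at (what the old code assumed to be) one bit per output element, which is why the removed line divided phi::product(out->dims()) by 8; the new code instead lets dev_ctx.Alloc size the buffer from a BOOL element type. A minimal standalone sketch of the bit-counting arithmetic, assuming the one-bit-per-element layout:

    #include <cstddef>
    #include <iostream>

    // Bytes needed for a 1-bit-per-element mask, rounded up to whole bytes.
    size_t MaskBytes(size_t num_elements) { return (num_elements + 7) / 8; }

    int main() {
      // A [64 x 128] output: 8192 elements -> 8192 bits -> 1024 bytes.
      std::cout << MaskBytes(64 * 128) << "\n";  // prints 1024
      return 0;
    }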
paddle/fluid/operators/huber_loss_op_mlu.cc (new file, mode 100644)

/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"

namespace paddle {
namespace operators {

using Tensor = phi::DenseTensor;

template <typename T>
class HuberLossMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx = GetDevCtxFromCTX(ctx);
    auto* x = ctx.Input<Tensor>("X");
    auto* y = ctx.Input<Tensor>("Y");
    auto* residual = ctx.Output<Tensor>("Residual");
    auto* out = ctx.Output<Tensor>("Out");
    auto delta = ctx.Attr<float>("delta");
    auto place = ctx.GetPlace();

    // compute y - x
    cnnlDataType_t data_type = ToCnnlDataType<T>();
    residual->mutable_data<T>(x->dims(), place);
    MLUCnnlTensorDesc x_desc(*x);
    MLUCnnlOpTensorDesc sub_op_desc(
        CNNL_OP_TENSOR_SUB, data_type, CNNL_NOT_PROPAGATE_NAN);
    MLUCnnl::OpTensor(ctx, sub_op_desc.get(),
                      x_desc.get(), GetBasePtr(y),
                      x_desc.get(), GetBasePtr(x),
                      x_desc.get(), GetBasePtr(residual), data_type);

    // compute smoothl1loss
    out->mutable_data<T>(x->dims(), place);
    cnnlSmoothL1LossAlgorithm_t smoothl1_algo =
        CNNL_SMOOTHL1LOSS_REDUCTION_NONE;  // defines whether to do reduction
                                           // here
    MLUCnnl::SmoothL1LossForward(ctx,
                                 x_desc.get(), GetBasePtr(x),
                                 x_desc.get(), /* target has same shape as x */
                                 GetBasePtr(y),
                                 static_cast<float>(delta), smoothl1_algo,
                                 x_desc.get(), /* out has same shape as x */
                                 GetBasePtr(out));

    // compute multiply by delta
    Tensor scale_tensor, bias_tensor;
    scale_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
    bias_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
    FillMLUTensorWithHostValue(ctx, static_cast<T>(delta), &scale_tensor);
    FillMLUTensorWithHostValue(ctx, static_cast<T>(0.f), &bias_tensor);
    const int axis = std::max(out->dims().size() - 1, 0);

    MLUCnnlTensorDesc scale_desc(scale_tensor);
    MLUCnnlTensorDesc bias_desc(bias_tensor);
    MLUCnnlTensorDesc out_desc(*out);
    MLUCnnl::Scale(ctx, axis,
                   out_desc.get(), GetBasePtr(out),
                   scale_desc.get(), GetBasePtr(&scale_tensor),
                   bias_desc.get(), GetBasePtr(&bias_tensor),
                   out_desc.get(), GetBasePtr(out));
  }
};

template <typename T>
class HuberLossGradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx = GetDevCtxFromCTX(ctx);
    auto* residual = ctx.Input<Tensor>("Residual");
    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
    auto delta = ctx.Attr<float>("delta");
    auto place = ctx.GetPlace();

    Tensor t_grad_rd;
    t_grad_rd =
        ctx.AllocateTmpTensor<T, MLUDeviceContext>(residual->dims(), dev_ctx);
    MLUCnnlTensorDesc t_grad_rd_desc(t_grad_rd);
    if (dx || dy) {
      Tensor t_zero;
      t_zero =
          ctx.AllocateTmpTensor<T, MLUDeviceContext>(residual->dims(), dev_ctx);
      FillMLUTensorWithHostValue(ctx, static_cast<T>(0.f), &t_zero);

      MLUCnnlTensorDesc residual_desc(*residual);
      MLUCnnlTensorDesc dout_desc(*dout);

      cnnlSmoothL1LossAlgorithm_t smoothl1_algo =
          CNNL_SMOOTHL1LOSS_REDUCTION_NONE;  // defines whether to do reduction
                                             // here
      MLUCnnl::SmoothL1LossBackward(ctx,
                                    residual_desc.get(), GetBasePtr(residual),
                                    residual_desc.get(), GetBasePtr(&t_zero),
                                    dout_desc.get(), GetBasePtr(dout),
                                    static_cast<float>(delta), smoothl1_algo,
                                    t_grad_rd_desc.get(),
                                    GetBasePtr(&t_grad_rd));
    }

    // compute multiply by delta
    Tensor scale_tensor, bias_tensor;
    scale_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
    bias_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
    FillMLUTensorWithHostValue(ctx, static_cast<T>(0.f), &bias_tensor);
    const int axis = std::max(t_grad_rd.dims().size() - 1, 0);

    MLUCnnlTensorDesc scale_desc(scale_tensor);
    MLUCnnlTensorDesc bias_desc(bias_tensor);

    if (dx) {
      dx->mutable_data<T>(place);
      FillMLUTensorWithHostValue(ctx, static_cast<T>(-delta), &scale_tensor);
      MLUCnnlTensorDesc out_desc(*dx);
      MLUCnnl::Scale(ctx, axis,
                     t_grad_rd_desc.get(), GetBasePtr(&t_grad_rd),
                     scale_desc.get(), GetBasePtr(&scale_tensor),
                     bias_desc.get(), GetBasePtr(&bias_tensor),
                     out_desc.get(), GetBasePtr(dx));
    }
    if (dy) {
      dy->mutable_data<T>(place);
      FillMLUTensorWithHostValue(ctx, static_cast<T>(delta), &scale_tensor);
      MLUCnnlTensorDesc out_desc(*dy);
      MLUCnnl::Scale(ctx, axis,
                     t_grad_rd_desc.get(), GetBasePtr(&t_grad_rd),
                     scale_desc.get(), GetBasePtr(&scale_tensor),
                     bias_desc.get(), GetBasePtr(&bias_tensor),
                     out_desc.get(), GetBasePtr(dy));
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(huber_loss,
                       ops::HuberLossMLUKernel<float>,
                       ops::HuberLossMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(huber_loss_grad,
                       ops::HuberLossGradMLUKernel<float>,
                       ops::HuberLossGradMLUKernel<plat::float16>);
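Why scaling SmoothL1 by delta yields Huber: with residual r = y - x, a SmoothL1 loss with threshold beta computes r^2 / (2 beta) for |r| < beta and |r| - beta/2 otherwise, so taking beta = delta and multiplying by delta (the Scale call above) recovers the Huber loss:

    \delta \cdot \mathrm{smoothl1}_{\beta=\delta}(r)
      = \begin{cases} \tfrac{1}{2} r^2, & |r| \le \delta \\[2pt]
                      \delta\bigl(|r| - \tfrac{\delta}{2}\bigr), & |r| > \delta \end{cases}
      = \mathrm{huber}_\delta(r).

The same scaling explains the gradient kernel: SmoothL1LossBackward produces d(smoothl1)/dr, and since r = y - x the chain rule gives the -delta factor applied for dx and the +delta factor for dy.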
paddle/fluid/operators/jit/CMakeLists.txt

@@ -39,14 +39,23 @@ cc_test(
   SRCS test.cc
   DEPS jit_kernel_helper)
 if(NOT WIN32)
-  cc_binary(
-    jit_kernel_benchmark
-    SRCS
-    benchmark.cc
-    DEPS
-    jit_kernel_helper
-    device_tracer
-    tensor)
+  set(cuda_less12_and_gcc_greater12 false)
+  if(DEFINED CMAKE_CUDA_COMPILER_VERSION)
+    if(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0
+       AND ${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 12.0)
+      set(cuda_less12_and_gcc_greater12 true)
+    endif()
+  endif()
+  if(NOT cuda_less12_and_gcc_greater12)
+    cc_binary(
+      jit_kernel_benchmark
+      SRCS
+      benchmark.cc
+      DEPS
+      jit_kernel_helper
+      device_tracer
+      tensor)
+  endif()
 endif()
 if(WITH_TESTING AND TEST jit_kernel_test)
   set_tests_properties(jit_kernel_test PROPERTIES TIMEOUT 120)
paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc

@@ -214,10 +214,7 @@ class MatMulMKLDNNHandler
     }
     astream.wait();

-    auto format =
-        MKLDNNFormatForSize(out->dims().size(), dnnl::memory::format_tag::nchw);
-    out->set_format(format);
-    out->set_layout(DataLayout::kMKLDNN);
+    out->set_mem_desc(dst_memory_p->get_desc().reshape(out->dims()));
   }

   std::shared_ptr<dnnl::memory> AcquireDstMemory(

@@ -651,10 +648,18 @@ void ExecuteMatMulV2(const ExecutionContext &ctx,
   auto &astream = MKLDNNDeviceContext::tls().get_stream();
   matmul_p->execute(astream, matmul_args);
   astream.wait();

-  auto format =
-      MKLDNNFormatForSize(out->dims().size(), dnnl::memory::format_tag::nchw);
-  out->set_format(format);
-  out->set_layout(DataLayout::kMKLDNN);
+  // TODO(jczaja): Explain why int8 format of dst is ABCD and do not need
+  // permute
+  if (IsOutputFused(ctx) && !IsInt8<T_out>()) {
+    auto axis = ctx.Attr<std::vector<int>>("fused_transpose_Out");
+    auto permuted_md = dst_memory_p->get_desc().permute_axes(axis);
+    out->set_mem_desc(
+        permuted_md.reshape(phi::vectorize<int64_t>(out->dims())));
+  } else {
+    out->set_mem_desc(
+        dst_memory_p->get_desc().reshape(phi::vectorize<int64_t>(out->dims())));
+  }
 }

 template <typename T>

@@ -836,8 +841,7 @@ class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel<T> {
     reduction_p->execute(astream, reduction_args);
     astream.wait();

-    dx->set_format(paddle::platform::GetMKLDNNFormat(
-        dst_memory_p->get_desc().reshape(squeezed_dims)));
+    dx->set_mem_desc(dst_memory_p->get_desc().reshape(squeezed_dims));
   }

   std::vector<int64_t> ExtendDimsWithOnes(const std::vector<int64_t> &dims,

@@ -1119,9 +1123,8 @@ void MatMulGradMKLDNNKernel<T>::ExecuteMatMulGrad(
   matmul_p->execute(astream, matmul_args);
   astream.wait();

-  out->set_layout(framework::DataLayout::kMKLDNN);
-  out->set_format(platform::GetMKLDNNFormat(
-      dst_memory_p->get_desc().reshape(vectorize<int64_t>(out->dims()))));
+  out->set_mem_desc(
+      dst_memory_p->get_desc().reshape(vectorize<int64_t>(out->dims())));
 }

 template <typename T>

@@ -1184,13 +1187,13 @@ void MatMulGradMKLDNNKernel<T>::RunKernel(const ExecutionContext &ctx) const {
   if (dx) {
     if (dx_dims != x.dims()) {
       dx->Resize(dx_dims);
-      dx->set_format(x.format());
+      dx->set_mem_desc(x.mem_desc());
     }
   }
   if (dy) {
     if (dy_dims != y.dims()) {
       dy->Resize(dy_dims);
-      dy->set_format(y.format());
+      dy->set_mem_desc(y.mem_desc());
     }
   }
 }
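The recurring change in this file (and in mul_mkldnn_op.cc below) replaces the set_layout()/set_format() pair with a single set_mem_desc() call that stores the full oneDNN memory descriptor on the output tensor. A standalone sketch of what such a descriptor carries and what reshape() does, assuming oneDNN v2.x headers are available:

    #include <dnnl.hpp>
    #include <iostream>

    int main() {
      using dims_t = dnnl::memory::dims;
      // A descriptor bundles dims, data type, and physical layout in one object.
      dnnl::memory::desc md(dims_t{2, 3, 4},
                            dnnl::memory::data_type::f32,
                            dnnl::memory::format_tag::abc);
      // reshape() reinterprets the same dense buffer with new dims, which is
      // what the kernels above attach to the (flattened) output tensor.
      dnnl::memory::desc flat = md.reshape(dims_t{6, 4});
      std::cout << "bytes: " << flat.get_size() << "\n";  // 6 * 4 * 4 = 96
      return 0;
    }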
paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc

@@ -221,7 +221,7 @@ class MulPrimitiveFactory {
           to_void_cast<T>(x_tmp.data<T>()));
       x_tmp.Resize(data->dims());
-      x_tmp.set_format(platform::GetMKLDNNFormat(dst_mdesc));
+      x_tmp.set_mem_desc(dst_mdesc);
       data_matrix = framework::ReshapeToMatrix(x_tmp, num_col_dims);
     } else {
       data_matrix = framework::ReshapeToMatrix(*data, num_col_dims);

@@ -235,11 +235,7 @@ class MulPrimitiveFactory {
                      const Tensor *in) {
     x_input_->set_data_handle(to_void_cast<XT>(in->data<XT>()));
     output_->set_data_handle(out->mutable_data<OT>(ctx.GetPlace()));
-
-    if (out->format() == MKLDNNMemoryFormat::undef) {
-      auto output_format = platform::GetMKLDNNFormat(*output_);
-      out->set_format((MKLDNNMemoryFormat)output_format);
-    }
+    out->set_mem_desc(output_->get_desc());
   }

   template <typename T>

@@ -272,7 +268,7 @@ class MulPrimitiveFactory {
     auto buffer_size = dst_desc.get_size();

     OT *output_data = output->mutable_data<OT>(ctx.GetPlace(), buffer_size);
-    output->set_format(paddle::platform::GetMKLDNNFormat(dst_desc));
+    output->set_mem_desc(dst_desc);
     return memory(dst_desc, engine_, to_void_cast<OT>(output_data));
   }

@@ -392,9 +388,10 @@ class MulMKLDNNINT8Kernel : public framework::OpKernel<XT> {
     if (out_dims.size() != 2) {
       out->Resize(out_dims);
     }
-    out->set_layout(DataLayout::kMKLDNN);
-    out->set_format(platform::MKLDNNFormatForSize(out_dims.size(),
-                                                  MKLDNNMemoryFormat::nchw));
+    auto in_md = dnnl::memory::desc(*dnnl_primitive_desc_query_md(
+        mul.get_primitive_desc(), dnnl_query_dst_md, 0));
+    out->set_mem_desc(in_md.reshape(phi::vectorize<int64_t>(out->dims())));
   }
 };

@@ -442,10 +439,11 @@ class MulMKLDNNKernel : public framework::OpKernel<XT> {
     matmul_p->execute(astream, matmul_args);
     astream.wait();

-    out->set_layout(framework::DataLayout::kMKLDNN);
-    out->set_format(platform::MKLDNNFormatForSize(
-        out->dims().size(), dnnl::memory::format_tag::nchw));
+    // This kernel flattens dims, so we need to set the unflattened version
+    // on out; the reshape requires a plain layout, but plain output formats
+    // are enforced inside the handler (MatmulV2MKLDNNHandler enforces one),
+    // so it should work.
+    out->set_mem_desc(
+        dst_memory_p->get_desc().reshape(phi::vectorize<int64_t>(out->dims())));
   }

  private:
paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc

@@ -24,7 +24,8 @@
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/fluid/operators/sum_op.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/mkldnn_reuse.h"

 namespace phi {

@@ -37,6 +38,9 @@ namespace operators {
 using paddle::platform::MKLDNNDeviceContext;
 using phi::CPUContext;
 using platform::to_void_cast;
+using Tensor = framework::Tensor;
+using SelectedRows = phi::SelectedRows;
+using LoDTensor = framework::LoDTensor;

 template <typename T>
 class SumMKLDNNHandler
paddle/fluid/operators/mlu/mlu_baseop.cc

@@ -256,6 +256,186 @@ MLUCnnlTensorDesc::~MLUCnnlTensorDesc() {
   }
 }

+class MLUOpTensorDescPool {
+ public:
+  mluOpTensorDescriptor_t Pop() {
+    mluOpTensorDescriptor_t raw_desc;
+    if (q_.try_dequeue(raw_desc)) {
+      return raw_desc;
+    } else {
+      mluOpCreateTensorDescriptor(&raw_desc);
+      return raw_desc;
+    }
+  }
+
+  void Recycle(mluOpTensorDescriptor_t desc) {
+    mluOpResetTensorDescriptor(desc);
+    q_.enqueue(desc);
+  }
+
+  ~MLUOpTensorDescPool() {
+    auto size = q_.size_approx();
+    if (size > 0) {
+      std::vector<mluOpTensorDescriptor_t> vec(size);
+      q_.try_dequeue_bulk(vec.data(), size);
+      for (auto desc : vec) {
+        mluOpDestroyTensorDescriptor(desc);
+      }
+    }
+  }
+
+ private:
+  moodycamel::ConcurrentQueue<mluOpTensorDescriptor_t> q_;
+};
+
+static MLUOpTensorDescPool g_mluop_tensor_desc_pool;
+
+MLUOpTensorDesc& MLUOpTensorDesc::operator=(MLUOpTensorDesc&& rhs) {
+  if (raw_tensor_desc) {
+    g_mluop_tensor_desc_pool.Recycle(raw_tensor_desc);
+  }
+  raw_tensor_desc = rhs.raw_tensor_desc;
+  rhs.raw_tensor_desc = nullptr;
+  return *this;
+}
+
+MLUOpTensorDesc::MLUOpTensorDesc(const int tensor_dim,
+                                 const int dim_sizes[],
+                                 const mluOpDataType_t tensor_dtype) {
+  raw_tensor_desc = g_mluop_tensor_desc_pool.Pop();
+  PADDLE_ENFORCE_MLU_SUCCESS(mluOpSetTensorDescriptor(
+      raw_tensor_desc, MLUOP_LAYOUT_ARRAY, tensor_dtype, tensor_dim,
+      dim_sizes));
+}
+
+MLUOpTensorDesc::MLUOpTensorDesc(const int tensor_dim,
+                                 const int dim_sizes[],
+                                 const mluOpDataType_t tensor_dtype,
+                                 const mluOpTensorLayout_t layout) {
+  raw_tensor_desc = g_mluop_tensor_desc_pool.Pop();
+  PADDLE_ENFORCE_MLU_SUCCESS(mluOpSetTensorDescriptor(
+      raw_tensor_desc, layout, tensor_dtype, tensor_dim, dim_sizes));
+}
+
+MLUOpTensorDesc::MLUOpTensorDesc(const int tensor_dim,
+                                 const int dim_sizes[],
+                                 const mluOpDataType_t tensor_dtype,
+                                 int position)
+    : MLUOpTensorDesc(tensor_dim, dim_sizes, tensor_dtype) {
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      mluOpSetTensorDescriptorPosition(raw_tensor_desc, position));
+}
+
+MLUOpTensorDesc::MLUOpTensorDesc(const int tensor_dim,
+                                 const int64_t dim_sizes[],
+                                 const mluOpDataType_t tensor_dtype) {
+  std::vector<int> dim_sizes_int32(tensor_dim);
+  std::vector<int64_t>::const_iterator int64_cbegin(dim_sizes);
+  std::vector<int64_t>::const_iterator int64_cend(dim_sizes + tensor_dim);
+  std::transform(int64_cbegin, int64_cend, dim_sizes_int32.begin(),
+                 &CheckedNarrowing<int64_t, int>);
+  raw_tensor_desc = g_mluop_tensor_desc_pool.Pop();
+  PADDLE_ENFORCE_MLU_SUCCESS(mluOpSetTensorDescriptor(
+      raw_tensor_desc, MLUOP_LAYOUT_ARRAY, tensor_dtype, tensor_dim,
+      dim_sizes_int32.data()));
+}
+
+MLUOpTensorDesc::MLUOpTensorDesc(const int tensor_dim,
+                                 const int64_t dim_sizes[],
+                                 const mluOpDataType_t tensor_dtype,
+                                 const mluOpTensorLayout_t layout) {
+  std::vector<int> dim_sizes_int32(tensor_dim);
+  std::vector<int64_t>::const_iterator int64_cbegin(dim_sizes);
+  std::vector<int64_t>::const_iterator int64_cend(dim_sizes + tensor_dim);
+  std::transform(int64_cbegin, int64_cend, dim_sizes_int32.begin(),
+                 &CheckedNarrowing<int64_t, int>);
+  raw_tensor_desc = g_mluop_tensor_desc_pool.Pop();
+  PADDLE_ENFORCE_MLU_SUCCESS(mluOpSetTensorDescriptor(
+      raw_tensor_desc, layout, tensor_dtype, tensor_dim,
+      dim_sizes_int32.data()));
+}
+
+MLUOpTensorDesc::MLUOpTensorDesc(const int tensor_dim,
+                                 const int64_t dim_sizes[],
+                                 const mluOpDataType_t tensor_dtype,
+                                 int position) {
+  std::vector<int> dim_sizes_int32(tensor_dim);
+  std::vector<int64_t>::const_iterator int64_cbegin(dim_sizes);
+  std::vector<int64_t>::const_iterator int64_cend(dim_sizes + tensor_dim);
+  std::transform(int64_cbegin, int64_cend, dim_sizes_int32.begin(),
+                 &CheckedNarrowing<int64_t, int>);
+  raw_tensor_desc = g_mluop_tensor_desc_pool.Pop();
+  PADDLE_ENFORCE_MLU_SUCCESS(mluOpSetTensorDescriptor(
+      raw_tensor_desc, MLUOP_LAYOUT_ARRAY, tensor_dtype, tensor_dim,
+      dim_sizes_int32.data()));
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      mluOpSetTensorDescriptorPosition(raw_tensor_desc, position));
+}
+
+MLUOpTensorDesc::MLUOpTensorDesc(const Tensor& tensor,
+                                 const mluOpTensorLayout_t layout,
+                                 const mluOpDataType_t tensor_dtype) {
+  auto dims = phi::vectorize<int>(tensor.dims());
+  int tensor_dim = dims.size();
+  raw_tensor_desc = g_mluop_tensor_desc_pool.Pop();
+  if (tensor_dim == 0) {
+    int scalar_dims[1] = {1};
+    PADDLE_ENFORCE_MLU_SUCCESS(mluOpSetTensorDescriptor(
+        raw_tensor_desc, layout, tensor_dtype, 1, scalar_dims));
+  } else {
+    std::vector<int> tensor_dim_sizes_int(dims.begin(), dims.end());
+    PADDLE_ENFORCE_MLU_SUCCESS(mluOpSetTensorDescriptor(
+        raw_tensor_desc, layout, tensor_dtype, tensor_dim,
+        tensor_dim_sizes_int.data()));
+  }
+}
+
+MLUOpTensorDesc::MLUOpTensorDesc(const Tensor& tensor)
+    : MLUOpTensorDesc(
+          tensor, MLUOP_LAYOUT_ARRAY, ToMluOpDataType(tensor.dtype())) {}
+
+MLUOpTensorDesc::MLUOpTensorDesc(const Tensor& tensor,
+                                 mluOpTensorLayout_t layout,
+                                 const mluOpDataType_t tensor_dtype,
+                                 int position)
+    : MLUOpTensorDesc(tensor, layout, tensor_dtype) {
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      mluOpSetTensorDescriptorPosition(raw_tensor_desc, position));
+}
+
+MLUOpTensorDesc::MLUOpTensorDesc(const Tensor& tensor,
+                                 mluOpTensorLayout_t layout,
+                                 const mluOpDataType_t tensor_dtype,
+                                 int position,
+                                 float scale)
+    : MLUOpTensorDesc(tensor, layout, tensor_dtype) {
+  PADDLE_ENFORCE_MLU_SUCCESS(mluOpSetTensorDescriptorPositionAndScale(
+      raw_tensor_desc, position, scale));
+}
+
+MLUOpTensorDesc::~MLUOpTensorDesc() {
+  if (raw_tensor_desc) {
+    g_mluop_tensor_desc_pool.Recycle(raw_tensor_desc);
+  }
+}
+
 MLUCnnlActivationDesc::MLUCnnlActivationDesc(
     const cnnlActivationMode_t act_mode, const float ceof) {
   PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateActivationDescriptor(&active_desc_));
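The MLUOpTensorDescPool above avoids repeated create/destroy calls by recycling descriptors through a lock-free queue, and MLUOpTensorDesc is a move-only RAII wrapper whose constructors Pop() a descriptor and whose destructor Recycle()s it. A simplified standalone sketch of the same pattern, with a mutex-guarded deque standing in for moodycamel::ConcurrentQueue and a caller-supplied factory standing in for mluOpCreateTensorDescriptor:

    #include <deque>
    #include <functional>
    #include <mutex>

    // Generic recycle pool: Pop() reuses an idle handle when one is available,
    // otherwise creates a fresh one; Recycle() returns a handle for reuse.
    template <typename Handle>
    class RecyclePool {
     public:
      explicit RecyclePool(std::function<Handle()> create)
          : create_(std::move(create)) {}

      Handle Pop() {
        std::lock_guard<std::mutex> lock(mu_);
        if (!idle_.empty()) {
          Handle h = idle_.front();
          idle_.pop_front();
          return h;
        }
        return create_();
      }

      void Recycle(Handle h) {
        std::lock_guard<std::mutex> lock(mu_);
        idle_.push_back(h);  // the real pool also resets the descriptor here
      }

     private:
      std::function<Handle()> create_;
      std::mutex mu_;
      std::deque<Handle> idle_;
    };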
@@ -1563,17 +1743,35 @@ MLURNNDesc::~MLURNNDesc() {
                            void* indices_out) {
   cnnlHandle_t handle = GetHandleFromCTX(ctx);

-  PADDLE_ENFORCE_MLU_SUCCESS(cnnlTopKTensor(handle,
-                                            input_desc,
-                                            input,
-                                            k,
-                                            dim,
-                                            largest,
-                                            sorted,
-                                            values_output_desc,
-                                            values_out,
-                                            indices_output_desc,
-                                            indices_out));
+  size_t workspace_size;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetTopKTensorWorkspaceSize(handle,
+                                                            input_desc,
+                                                            k,
+                                                            dim,
+                                                            largest,
+                                                            values_output_desc,
+                                                            indices_output_desc,
+                                                            &workspace_size));
+
+  auto& dev_ctx = GetDevCtxFromCTX(ctx);
+  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
+      {static_cast<int64_t>(workspace_size)}, dev_ctx);
+  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlTopKTensor_v3(handle,
+                                               input_desc,
+                                               input,
+                                               k,
+                                               dim,
+                                               largest,
+                                               sorted,
+                                               false /*lower_index_first*/,
+                                               workspace_ptr,
+                                               workspace_size,
+                                               values_output_desc,
+                                               values_out,
+                                               indices_output_desc,
+                                               indices_out));
 }

 /* static */ void MLUCnnl::StridedSlice(
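The TopK change above follows the usual vendor-library workspace protocol: query the required scratch size, allocate a temporary buffer, then call the _v3 entry point that accepts the workspace. A schematic of that call shape (the function names below are placeholders, not real CNNL APIs):

    #include <cstddef>
    #include <vector>

    // Placeholders standing in for cnnlGetXxxWorkspaceSize / cnnlXxx_v3.
    size_t GetOpWorkspaceSize(int problem_size) { return 64u * problem_size; }
    void RunOpWithWorkspace(void* ws, size_t ws_bytes) {
      (void)ws;
      (void)ws_bytes;  // the real call launches the kernel with this scratch
    }

    void RunOp(int problem_size) {
      size_t ws_bytes = GetOpWorkspaceSize(problem_size);  // 1) query size
      std::vector<unsigned char> workspace(ws_bytes);      // 2) allocate
      RunOpWithWorkspace(workspace.data(), ws_bytes);      // 3) execute
    }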
@@ -4527,6 +4725,78 @@ MLURNNDesc::~MLURNNDesc() {
                                  output));
 }

+/* static */ void MLUCnnl::SmoothL1LossForward(
+    const ExecutionContext& ctx,
+    const cnnlTensorDescriptor_t x_desc,
+    const void* x,
+    const cnnlTensorDescriptor_t t_desc,
+    const void* target,
+    const float beta,
+    const cnnlSmoothL1LossAlgorithm_t algorithm,
+    const cnnlTensorDescriptor_t y_desc,
+    void* y) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+  size_t workspace_size;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetSmoothL1LossForwardWorkspaceSize(
+      handle, x_desc, algorithm, &workspace_size));
+
+  auto& dev_ctx = GetDevCtxFromCTX(ctx);
+  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
+      {static_cast<int64_t>(workspace_size)}, dev_ctx);
+  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSmoothL1LossForward_v2(
+      handle, x_desc, x, t_desc, target, beta, algorithm,
+      workspace_ptr, workspace_size, y_desc, y));
+}
+
+/* static */ void MLUCnnl::SmoothL1LossBackward(
+    const ExecutionContext& ctx,
+    const cnnlTensorDescriptor_t x_desc,
+    const void* x,
+    const cnnlTensorDescriptor_t target_desc,
+    const void* target,
+    const cnnlTensorDescriptor_t dy_desc,
+    const void* dy,
+    const float beta,
+    const cnnlSmoothL1LossAlgorithm_t algorithm,
+    const cnnlTensorDescriptor_t dx_desc,
+    void* dx) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+  size_t workspace_size;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetSmoothL1LossBackwardWorkspaceSize(
+      handle, x_desc, algorithm, &workspace_size));
+
+  auto& dev_ctx = GetDevCtxFromCTX(ctx);
+  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
+      {static_cast<int64_t>(workspace_size)}, dev_ctx);
+  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSmoothL1LossBackward_v2(
+      handle, x_desc, x, target_desc, target, dy_desc, dy, beta,
+      algorithm, workspace_ptr, workspace_size, dx_desc, dx));
+}
+
 /* static */ void MLUCnnl::EmbeddingForward(const ExecutionContext& ctx,
                                             const int padding_idx,

@@ -5148,5 +5418,94 @@ MLURNNDesc::~MLURNNDesc() {
                                            diff_x));
 }

+/* static */ void MLUOP::OpYoloBox(const ExecutionContext& ctx,
+                                   const mluOpTensorDescriptor_t x_desc,
+                                   const void* x,
+                                   const mluOpTensorDescriptor_t img_size_desc,
+                                   const void* img_size,
+                                   const mluOpTensorDescriptor_t anchors_desc,
+                                   const void* anchors,
+                                   const int class_num,
+                                   const float conf_thresh,
+                                   const int downsample_ratio,
+                                   const bool clip_bbox,
+                                   const float scale,
+                                   const bool iou_aware,
+                                   const float iou_aware_factor,
+                                   const mluOpTensorDescriptor_t boxes_desc,
+                                   void* boxes,
+                                   const mluOpTensorDescriptor_t scores_desc,
+                                   void* scores) {
+  mluOpHandle_t handle = GetMLUOpHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(mluOpYoloBox(
+      handle, x_desc, x, img_size_desc, img_size, anchors_desc, anchors,
+      class_num, conf_thresh, downsample_ratio, clip_bbox, scale,
+      iou_aware, iou_aware_factor, boxes_desc, boxes, scores_desc, scores));
+}
+
+/* static */ void MLUOP::OpPriorBox(
+    const ExecutionContext& ctx,
+    const mluOpTensorDescriptor_t min_sizes_desc,
+    const void* min_sizes,
+    const mluOpTensorDescriptor_t aspect_ratios_desc,
+    const void* aspect_ratios,
+    const mluOpTensorDescriptor_t variances_desc,
+    const void* variances,
+    const mluOpTensorDescriptor_t max_sizes_desc,
+    const void* max_sizes,
+    const int height,
+    const int width,
+    const int im_height,
+    const int im_width,
+    const float step_h,
+    const float step_w,
+    const float offset,
+    const bool clip,
+    const bool min_max_aspect_ratios_order,
+    const mluOpTensorDescriptor_t output_desc,
+    void* output,
+    const mluOpTensorDescriptor_t var_desc,
+    void* var) {
+  mluOpHandle_t handle = GetMLUOpHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(mluOpPriorBox(
+      handle, min_sizes_desc, min_sizes, aspect_ratios_desc, aspect_ratios,
+      variances_desc, variances, max_sizes_desc, max_sizes, height, width,
+      im_height, im_width, step_h, step_w, offset, clip,
+      min_max_aspect_ratios_order, output_desc, output, var_desc, var));
+}
+
 }  // namespace operators
 }  // namespace paddle
paddle/fluid/operators/mlu/mlu_baseop.h

@@ -16,6 +16,7 @@ limitations under the License. */
 #include <cn_api.h>
 #include <cnnl.h>
 #include <concurrentqueue.h>
+#include <mlu_op.h>

 #include <string>
 #include <vector>

@@ -138,6 +139,54 @@ inline cnnlDataType_t ToCnnlDataType() {
   return ToCnnlDataType(type);
 }

+inline mluOpDataType_t ToMluOpDataType(
+    const paddle::experimental::DataType& dtype) {
+  mluOpDataType_t type = MLUOP_DTYPE_FLOAT;
+  switch (dtype) {
+    case DataType::FLOAT16:
+      type = MLUOP_DTYPE_HALF;
+      break;
+    case DataType::FLOAT32:
+      type = MLUOP_DTYPE_FLOAT;
+      break;
+    case DataType::FLOAT64:
+      type = MLUOP_DTYPE_DOUBLE;
+      break;
+    case DataType::INT8:
+      type = MLUOP_DTYPE_INT8;
+      break;
+    case DataType::INT16:
+      type = MLUOP_DTYPE_INT16;
+      break;
+    case DataType::INT32:
+      type = MLUOP_DTYPE_INT32;
+      break;
+    case DataType::INT64:
+      type = MLUOP_DTYPE_INT64;
+      break;
+    case DataType::BOOL:
+      type = MLUOP_DTYPE_BOOL;
+      break;
+    case DataType::UINT8:
+      type = MLUOP_DTYPE_UINT8;
+      break;
+    default:
+      break;
+  }
+  return type;
+}
+
+inline mluOpDataType_t ToMluOpDataType(
+    const paddle::framework::proto::VarType::Type& type) {
+  return ToMluOpDataType(framework::TransToPhiDataType(type));
+}
+
+template <typename T>
+inline mluOpDataType_t ToMluOpDataType() {
+  auto type = framework::ToDataType(std::type_index(typeid(T)));
+  return ToMluOpDataType(type);
+}
+
 // Converts (via narrowing) a type T value to a type U, and checks that the
 // value has no value change due to the conversion.
 template <typename WideT, typename NarrowT>

@@ -152,6 +201,10 @@ inline static cnnlHandle_t GetHandleFromCTX(const ExecutionContext& ctx) {
   return ctx.template device_context<MLUDeviceContext>().cnnl_handle();
 }

+inline static mluOpHandle_t GetMLUOpHandleFromCTX(const ExecutionContext& ctx) {
+  return ctx.template device_context<MLUDeviceContext>().mluOp_handle();
+}
+
 inline static const MLUDeviceContext& GetDevCtxFromCTX(
     const ExecutionContext& ctx) {
   return ctx.template device_context<MLUDeviceContext>();

@@ -281,6 +334,74 @@ class MLUCnnlTensorDesc {
   cnnlTensorDescriptor_t raw_tensor_desc = nullptr;
 };

+class MLUOpTensorDesc {
+ public:
+  MLUOpTensorDesc() {}
+
+  // SE_DISALLOW_COPY_AND_ASSIGN
+  MLUOpTensorDesc(const MLUOpTensorDesc& desc) = delete;
+  MLUOpTensorDesc& operator=(const MLUOpTensorDesc&) = delete;
+
+  MLUOpTensorDesc(MLUOpTensorDesc&& rhs)
+      : raw_tensor_desc(rhs.raw_tensor_desc) {
+    rhs.raw_tensor_desc = nullptr;
+  }
+
+  MLUOpTensorDesc& operator=(MLUOpTensorDesc&& rhs);
+
+  MLUOpTensorDesc(const int tensor_dim,
+                  const int dim_sizes[],
+                  const mluOpDataType_t tensor_dtype);
+
+  MLUOpTensorDesc(const int tensor_dim,
+                  const int dim_sizes[],
+                  const mluOpDataType_t tensor_dtype,
+                  const mluOpTensorLayout_t layout);
+
+  MLUOpTensorDesc(const int tensor_dim,
+                  const int dim_sizes[],
+                  const mluOpDataType_t tensor_dtype,
+                  int position);
+
+  MLUOpTensorDesc(const int tensor_dim,
+                  const int64_t dim_sizes[],
+                  const mluOpDataType_t tensor_dtype);
+
+  MLUOpTensorDesc(const int tensor_dim,
+                  const int64_t dim_sizes[],
+                  const mluOpDataType_t tensor_dtype,
+                  const mluOpTensorLayout_t layout);
+
+  MLUOpTensorDesc(const int tensor_dim,
+                  const int64_t dim_sizes[],
+                  const mluOpDataType_t tensor_dtype,
+                  int position);
+
+  MLUOpTensorDesc(const Tensor& tensor,
+                  const mluOpTensorLayout_t layout,
+                  const mluOpDataType_t tensor_dtype);
+
+  explicit MLUOpTensorDesc(const Tensor& tensor);
+
+  MLUOpTensorDesc(const Tensor& tensor,
+                  mluOpTensorLayout_t layout,
+                  const mluOpDataType_t tensor_dtype,
+                  int position);
+
+  MLUOpTensorDesc(const Tensor& tensor,
+                  mluOpTensorLayout_t layout,
+                  const mluOpDataType_t tensor_dtype,
+                  int position,
+                  float scale);
+
+  ~MLUOpTensorDesc();
+
+  const mluOpTensorDescriptor_t get() const { return raw_tensor_desc; }
+
+ private:
+  mluOpTensorDescriptor_t raw_tensor_desc = nullptr;
+};
+
 class MLUCnnlActivationDesc {
  public:
   MLUCnnlActivationDesc(const MLUCnnlActivationDesc& desc) = delete;

@@ -1921,6 +2042,28 @@ class MLUCnnl {
                          const cnnlTensorDescriptor_t output_desc,
                          void* output);

+  static void SmoothL1LossForward(const ExecutionContext& ctx,
+                                  const cnnlTensorDescriptor_t x_desc,
+                                  const void* x,
+                                  const cnnlTensorDescriptor_t t_desc,
+                                  const void* target,
+                                  const float beta,
+                                  const cnnlSmoothL1LossAlgorithm_t algorithm,
+                                  const cnnlTensorDescriptor_t y_desc,
+                                  void* y);
+
+  static void SmoothL1LossBackward(const ExecutionContext& ctx,
+                                   const cnnlTensorDescriptor_t x_desc,
+                                   const void* x,
+                                   const cnnlTensorDescriptor_t target_desc,
+                                   const void* target,
+                                   const cnnlTensorDescriptor_t dy_desc,
+                                   const void* dy,
+                                   const float beta,
+                                   const cnnlSmoothL1LossAlgorithm_t algorithm,
+                                   const cnnlTensorDescriptor_t dx_desc,
+                                   void* dx);
+
   static void EmbeddingForward(const ExecutionContext& ctx,
                                const int padding_idx,
                                const cnnlTensorDescriptor_t weight_desc,

@@ -2149,6 +2292,50 @@ class MLUCnnl {
                            void* diff_x);
 };

+class MLUOP {
+ public:
+  static void OpYoloBox(const ExecutionContext& ctx,
+                        const mluOpTensorDescriptor_t x_desc,
+                        const void* x,
+                        const mluOpTensorDescriptor_t img_size_desc,
+                        const void* img_size,
+                        const mluOpTensorDescriptor_t anchors_desc,
+                        const void* anchors,
+                        const int class_num,
+                        const float conf_thresh,
+                        const int downsample_ratio,
+                        const bool clip_bbox,
+                        const float scale,
+                        const bool iou_aware,
+                        const float iou_aware_factor,
+                        const mluOpTensorDescriptor_t boxes_desc,
+                        void* boxes,
+                        const mluOpTensorDescriptor_t scores_desc,
+                        void* scores);
+
+  static void OpPriorBox(const ExecutionContext& ctx,
+                         const mluOpTensorDescriptor_t min_sizes_desc,
+                         const void* min_sizes,
+                         const mluOpTensorDescriptor_t aspect_ratios_desc,
+                         const void* aspect_ratios,
+                         const mluOpTensorDescriptor_t variances_desc,
+                         const void* variances,
+                         const mluOpTensorDescriptor_t max_sizes_desc,
+                         const void* max_sizes,
+                         const int height,
+                         const int width,
+                         const int im_height,
+                         const int im_width,
+                         const float step_h,
+                         const float step_w,
+                         const float offset,
+                         const bool clip,
+                         const bool min_max_aspect_ratios_order,
+                         const mluOpTensorDescriptor_t output_desc,
+                         void* output,
+                         const mluOpTensorDescriptor_t var_desc,
+                         void* var);
+};
+
 const std::map<const std::string, std::pair<std::vector<int>, std::vector<int>>>
     TransPermMap = {
         // trans_mode, (forward_perm, backward_perm)
paddle/fluid/operators/one_hot_v2_op_mlu.cc

@@ -97,4 +97,6 @@ class OneHotV2MLUKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;

-REGISTER_OP_MLU_KERNEL(one_hot_v2, ops::OneHotV2MLUKernel<int32_t>);
+REGISTER_OP_MLU_KERNEL(one_hot_v2,
+                       ops::OneHotV2MLUKernel<int32_t>,
+                       ops::OneHotV2MLUKernel<int64_t>);
paddle/fluid/operators/optimizers/adam_op_mlu.cc

@@ -291,11 +291,38 @@ class AdamWMLUKernel : public AdamMLUKernel<T> {
       skip_update = skip_update_vec[0];
     }
     bool with_decay = ctx.Attr<bool>("with_decay");
+    const bool multi_precision = ctx.Attr<bool>("multi_precision");
+    auto* param_out = ctx.Output<LoDTensor>("ParamOut");
+    auto* master_param_out = ctx.Output<LoDTensor>("MasterParamOut");
+    const auto* master_param = ctx.Input<LoDTensor>("MasterParam");
     VLOG(3) << "Skip update: " << skip_update << ", With decay: " << with_decay;
     if (!skip_update && with_decay) {
-      if (ctx.HasInput("MasterParam")) {
-        PADDLE_THROW(platform::errors::Unimplemented(
-            "Master Param is not supported on MLU"));
+      auto* param = ctx.Input<LoDTensor>("Param");
+      MLUCnnlTensorDesc param_desc(*param);
+      if (multi_precision) {
+        VLOG(3) << "[adamw] multi_precision, cast masterparam to param.";
+        bool has_master =
+            ctx.HasInput("MasterParam") && ctx.HasOutput("MasterParamOut");
+        PADDLE_ENFORCE_EQ(
+            has_master, true,
+            platform::errors::InvalidArgument(
+                "The Input(MasterParam) and Output(MasterParamOut) "
+                "should not be null when "
+                "the attr `multi_precision` is true"));
+        // cast masterparam (fp32) to param (fp16), then paramout (fp16) to
+        // masterparamout (fp32)
+        MLUCnnlTensorDesc master_param_desc(*master_param);
+        cnnlCastDataType_t cast_type = GetCastDataType(
+            framework::TransToProtoVarType(master_param->dtype()),
+            framework::TransToProtoVarType(param->dtype()));
+        MLUCnnl::Cast(ctx, cast_type,
+                      master_param_desc.get(), GetBasePtr(master_param),
+                      param_desc.get(),
+                      const_cast<void*>(GetBasePtr(param)));
       } else {
         const auto* param_var = ctx.InputVar("Param");
         PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(),

@@ -305,13 +332,12 @@ class AdamWMLUKernel : public AdamMLUKernel<T> {
                               "but the received is %s",
                               ctx.InputNames("Param").front(),
                               framework::ToTypeName(param_var->Type())));
-        auto* param = ctx.Input<LoDTensor>("Param");
         auto* lr = ctx.Input<LoDTensor>("LearningRate");
         float coeff = ctx.Attr<float>("coeff");

         // update param with decay coeff: mul(-1 * lr, coeff * param) + param
         MLUCnnlTensorDesc lr_desc(*lr);
-        MLUCnnlTensorDesc param_desc(*param);
         MLUCnnlOpTensorDesc mul_op_desc(
             CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);

@@ -330,9 +356,244 @@ class AdamWMLUKernel : public AdamMLUKernel<T> {
       }
     }
     AdamMLUKernel<T>::Compute(ctx);
+
+    if (multi_precision) {
+      VLOG(3) << "[adamw] multi_precision, cast paramout to masterparamout.";
+      // cast paramout to masterparamout
+      master_param_out->mutable_data<float>(ctx.GetPlace());
+      cnnlCastDataType_t cast_type = GetCastDataType(
+          framework::TransToProtoVarType(param_out->dtype()),
+          framework::TransToProtoVarType(master_param_out->dtype()));
+      MLUCnnlTensorDesc param_out_desc(*param_out);
+      MLUCnnlTensorDesc master_param_out_desc(*master_param_out);
+      MLUCnnl::Cast(ctx, cast_type,
+                    param_out_desc.get(), GetBasePtr(param_out),
+                    master_param_out_desc.get(),
+                    GetBasePtr(master_param_out));
+    }
   }
 };

+template <typename T>
+class MergedAdamMLUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    // Get inputs and outputs
+    auto params = ctx.MultiInput<framework::Tensor>("Param");
+    auto grads = ctx.MultiInput<framework::Tensor>("Grad");
+    auto lrs = ctx.MultiInput<framework::Tensor>("LearningRate");
+    auto mom1s = ctx.MultiInput<framework::Tensor>("Moment1");
+    auto mom2s = ctx.MultiInput<framework::Tensor>("Moment2");
+    auto beta1_pows = ctx.MultiInput<framework::Tensor>("Beta1Pow");
+    auto beta2_pows = ctx.MultiInput<framework::Tensor>("Beta2Pow");
+    auto master_params = ctx.MultiInput<framework::Tensor>("MasterParam");
+    auto param_outs = ctx.MultiOutput<framework::Tensor>("ParamOut");
+    auto mom1_outs = ctx.MultiOutput<framework::Tensor>("Moment1Out");
+    auto mom2_outs = ctx.MultiOutput<framework::Tensor>("Moment2Out");
+    auto beta1_pow_outs = ctx.MultiOutput<framework::Tensor>("Beta1PowOut");
+    auto beta2_pow_outs = ctx.MultiOutput<framework::Tensor>("Beta2PowOut");
+
+    // Check validation of inputs and outputs
+    size_t param_num = params.size();
+    PADDLE_ENFORCE_EQ(
+        param_num, param_outs.size(),
+        platform::errors::InvalidArgument(
+            "The size of Output(ParamOut) must be equal to "
+            "Input(Param), but got the size of Output(ParamOut) "
+            "is %d, the size of Input(Param) is %d.",
+            param_outs.size(), param_num));
+
+    bool skip_update = false;
+    if (ctx.HasInput("SkipUpdate")) {
+      auto* skip_update_tensor = ctx.Input<framework::Tensor>("SkipUpdate");
+      PADDLE_ENFORCE_EQ(
+          skip_update_tensor->numel(), 1,
+          platform::errors::InvalidArgument(
+              "Input(SkipUpdate) size must be 1, but get %d",
+              skip_update_tensor->numel()));
+      std::vector<bool> skip_update_vec;
+      paddle::framework::TensorToVector(
+          *skip_update_tensor, ctx.device_context(), &skip_update_vec);
+      ctx.device_context().Wait();
+      skip_update = skip_update_vec[0];
+    }
+    // skip_update=true, just copy input to output, and TensorCopy will call
+    // mutable_data
+    if (skip_update) {
+      VLOG(4) << "MergedAdam skip update";
+      for (size_t i = 0; i < param_num; ++i) {
+        framework::TensorCopy(
+            *params[i], ctx.GetPlace(),
+            ctx.template device_context<platform::MLUDeviceContext>(),
+            param_outs[i]);
+        framework::TensorCopy(
+            *mom1s[i], ctx.GetPlace(),
+            ctx.template device_context<platform::MLUDeviceContext>(),
+            mom1_outs[i]);
+        framework::TensorCopy(
+            *mom2s[i], ctx.GetPlace(),
+            ctx.template device_context<platform::MLUDeviceContext>(),
+            mom2_outs[i]);
+        framework::TensorCopy(
+            *beta1_pows[i], beta1_pows[i]->place(),
+            ctx.template device_context<platform::MLUDeviceContext>(),
+            beta1_pow_outs[i]);
+        framework::TensorCopy(
+            *beta2_pows[i], beta2_pows[i]->place(),
+            ctx.template device_context<platform::MLUDeviceContext>(),
+            beta2_pow_outs[i]);
+      }
+      return;
+    }
+
+    bool use_global_beta_pow = ctx.Attr<bool>("use_global_beta_pow");
+    VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;
+
+    // Get beta1, beta2 and epsilon from attribute.
+    const Tensor* beta1_tensor = nullptr;
+    const Tensor* beta2_tensor = nullptr;
+    const Tensor* epsilon_tensor = nullptr;
+
+    Tensor beta1_tmp(experimental::DataType::FLOAT32);
+    Tensor beta2_tmp(experimental::DataType::FLOAT32);
+    Tensor epsilon_tmp(experimental::DataType::FLOAT32);
+
+    T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
+    T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
+    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
+    beta1_tmp.mutable_data<T>({1}, ctx.GetPlace());
+    beta2_tmp.mutable_data<T>({1}, ctx.GetPlace());
+    epsilon_tmp.mutable_data<T>({1}, ctx.GetPlace());
+    MLUCnnlTensorDesc beta1_tmp_desc(beta1_tmp);
+    MLUCnnlTensorDesc beta2_tmp_desc(beta2_tmp);
+    MLUCnnlTensorDesc epsilon_tmp_desc(epsilon_tmp);
+    MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &beta1,
+                  beta1_tmp_desc.get(), GetBasePtr(&beta1_tmp));
+    MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &beta2,
+                  beta2_tmp_desc.get(), GetBasePtr(&beta2_tmp));
+    MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &epsilon,
+                  epsilon_tmp_desc.get(), GetBasePtr(&epsilon_tmp));
+    beta1_tensor = &beta1_tmp;
+    beta2_tensor = &beta2_tmp;
+    epsilon_tensor = &epsilon_tmp;
+
+    // Loop to compute
+    for (size_t i = 0; i < param_num; ++i) {
+      VLOG(4) << "[MergedAdam] loop: " << i;
+      param_outs[i]->ShareDataWith(*params[i]);
+      mom1_outs[i]->ShareDataWith(*mom1s[i]);
+      mom2_outs[i]->ShareDataWith(*mom2s[i]);
+
+      LoDTensor beta1_pow_tmp;
+      LoDTensor beta2_pow_tmp;
+      if (beta1_pows[i]->place() == platform::CPUPlace()) {
+        T beta1 = *beta1_pows[i]->data<T>();
+        beta1_pow_tmp.mutable_data<T>({1}, ctx.GetPlace());
+        MLUCnnlTensorDesc beta1_pow_tmp_desc(beta1_pow_tmp);
+        MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &beta1,
+                      beta1_pow_tmp_desc.get(), GetBasePtr(&beta1_pow_tmp));
+        beta1_pows[i] = &beta1_pow_tmp;
+      }
+      if (beta2_pows[i]->place() == platform::CPUPlace()) {
+        T beta2 = *beta2_pows[i]->data<T>();
+        beta2_pow_tmp.mutable_data<T>({1}, ctx.GetPlace());
+        MLUCnnlTensorDesc beta2_pow_tmp_desc(beta2_pow_tmp);
+        MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &beta2,
+                      beta2_pow_tmp_desc.get(), GetBasePtr(&beta2_pow_tmp));
+        beta2_pows[i] = &beta2_pow_tmp;
+      }
+
+      VLOG(3) << "beta1_pow.numel() : " << beta1_pows[i]->numel()
+              << "beta2_pow.numel() : " << beta2_pows[i]->numel();
+      VLOG(3) << "param.numel(): " << params[i]->numel();
+      PADDLE_ENFORCE_EQ(beta1_pow_outs[i]->numel(), 1,
+                        platform::errors::InvalidArgument(
+                            "beta1 pow output size should be 1, but received "
+                            "value is:%d.", beta1_pow_outs[i]->numel()));
+      PADDLE_ENFORCE_EQ(beta2_pow_outs[i]->numel(), 1,
+                        platform::errors::InvalidArgument(
+                            "beta2 pow output size should be 1, but received "
+                            "value is:%d.", beta2_pow_outs[i]->numel()));
+
+      MLUCnnlTensorDesc param_desc(*params[i]);
+      MLUCnnlTensorDesc mom1_desc(*mom1s[i]);
+      MLUCnnlTensorDesc mom2_desc(*mom2s[i]);
+      MLUCnnlTensorDesc grad_desc(*grads[i]);
+      MLUCnnl::ApplyAdam(ctx,
+                         param_desc.get(), GetBasePtr(param_outs[i]),
+                         mom1_desc.get(), GetBasePtr(mom1_outs[i]),
+                         mom2_desc.get(), GetBasePtr(mom2_outs[i]),
+                         grad_desc.get(), GetBasePtr(grads[i]),
+                         GetBasePtr(lrs[i]),
+                         GetBasePtr(beta1_tensor), GetBasePtr(beta2_tensor),
+                         GetBasePtr(beta1_pows[i]), GetBasePtr(beta2_pows[i]),
+                         GetBasePtr(epsilon_tensor),
+                         /*use_nesterov*/ false);
+
+      if (!use_global_beta_pow) {
+        beta1_pow_outs[i]->mutable_data<T>(ctx.GetPlace());
+        beta2_pow_outs[i]->mutable_data<T>(ctx.GetPlace());
+
+        MLUCnnlTensorDesc beta1_desc(*beta1_tensor);
+        MLUCnnlOpTensorDesc mul_op_desc(
+            CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
+
+        MLUCnnl::OpTensor(ctx, mul_op_desc.get(),
+                          beta1_desc.get(), GetBasePtr(beta1_pows[i]),
+                          beta1_desc.get(), GetBasePtr(beta1_tensor),
+                          beta1_desc.get(), GetBasePtr(beta1_pow_outs[i]),
+                          ToCnnlDataType<T>());
+        MLUCnnl::OpTensor(ctx, mul_op_desc.get(),
+                          beta1_desc.get(), GetBasePtr(beta2_pows[i]),
+                          beta1_desc.get(), GetBasePtr(beta2_tensor),
+                          beta1_desc.get(), GetBasePtr(beta2_pow_outs[i]),
+                          ToCnnlDataType<T>());
+      }
+    }
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle

@@ -346,3 +607,7 @@ REGISTER_OP_MLU_KERNEL(adam,
 REGISTER_OP_MLU_KERNEL(adamw,
                        ops::AdamWMLUKernel<float>,
                        ops::AdamWMLUKernel<plat::float16>);
+REGISTER_OP_MLU_KERNEL(merged_adam,
+                       ops::MergedAdamMLUKernel<float>,
+                       ops::MergedAdamMLUKernel<plat::float16>);
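The multi_precision path above keeps an fp32 MasterParam alongside the fp16 Param: before the decayed update it casts the master copy down to the working fp16 tensor, and after AdamMLUKernel runs it casts ParamOut back up to refresh MasterParamOut, so the high-precision copy stays in sync with the low-precision one the kernel actually updates. A standalone sketch of that flow, with double standing in for fp32 and float standing in for fp16:

    #include <cstdio>

    int main() {
      double master_w = 1.0;                   // MasterParam (fp32 stand-in)
      float w = static_cast<float>(master_w);  // Param: cast-down before update
      float lr = 1e-3f, grad = 0.25f;

      w -= lr * grad;                          // the MLU kernel updates this copy
      master_w = static_cast<double>(w);       // ParamOut -> MasterParamOut sync

      std::printf("master_w = %.7f\n", master_w);
      return 0;
    }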
paddle/fluid/operators/pool_op_mlu.cc

@@ -141,10 +141,9 @@ class MLUPoolOpKernel : public framework::OpKernel<T> {
           handle, pool_mode, out_w, out_h, &extra_input_size);

       if (extra_input_size > 0) {
-        phi::CPUContext cpu_ctx;
-        framework::Tensor extra_host_tensor =
-            ctx.AllocateTmpTensor<int8_t, phi::CPUContext>(
-                {static_cast<int64_t>(extra_input_size)}, cpu_ctx);
+        framework::Tensor extra_host_tensor;
+        extra_host_tensor.mutable_data<int8_t>(
+            {static_cast<int64_t>(extra_input_size)}, platform::CPUPlace());
         cnnlInitPoolingExtraInput(handle,
                                   pool_desc.get(),
                                   trans_in_x_desc.get(),
paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc
View file @ dbe08e9b
...
@@ -92,6 +92,112 @@ class ReduceMaxMLUKernel : public framework::OpKernel<T> {
   }
 };
 
+template <typename T>
+class ReduceMaxGradMLUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x = context.Input<Tensor>("X");
+    auto* out = context.Input<Tensor>("Out");
+    auto* out_grad = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto reduce_dims = context.Attr<std::vector<int>>("dim");
+    bool reduce_all = context.Attr<bool>("reduce_all");
+    int in_dtype = context.Attr<int>("in_dtype");
+
+    PADDLE_ENFORCE_EQ(
+        in_dtype == -1,
+        true,
+        platform::errors::InvalidArgument(
+            "MLU only support in_dtype == -1 in reduce_max_grad op."));
+
+    auto* x_grad = context.Output<Tensor>(framework::GradVarName("X"));
+    x_grad->mutable_data<T>(context.GetPlace());
+
+    auto place = context.GetPlace();
+
+    // broadcast
+    auto x_dims_vec = phi::vectorize(x->dims());
+    if (reduce_all) {
+      reduce_dims.clear();
+      for (size_t d = 0; d < x_dims_vec.size(); ++d) {
+        reduce_dims.push_back(static_cast<int>(d));
+      }
+    }
+
+    Tensor tmp_out, tmp_out_grad;
+    auto tmp_out_dims_vec = x_dims_vec;
+    for (auto d : reduce_dims) {
+      if (d < 0) {
+        d += x_dims_vec.size();
+      }
+      tmp_out_dims_vec[d] = 1;
+    }
+
+    tmp_out.ShareDataWith(*out);
+    tmp_out.Resize(phi::make_ddim(tmp_out_dims_vec));
+    tmp_out_grad.ShareDataWith(*out_grad);
+    tmp_out_grad.Resize(phi::make_ddim(tmp_out_dims_vec));
+
+    Tensor transformed_out(x->type());
+    transformed_out.Resize(phi::make_ddim(x_dims_vec));
+    transformed_out.mutable_data<T>(place);
+
+    MLUCnnlTensorDesc tmp_out_desc(tmp_out);
+    MLUCnnlTensorDesc transformed_out_desc(transformed_out);
+
+    MLUCnnl::BroadcastTo(context,
+                         tmp_out_desc.get(),
+                         GetBasePtr(&tmp_out),
+                         transformed_out_desc.get(),
+                         GetBasePtr(&transformed_out));
+
+    Tensor transformed_out_grad(x->type());
+    transformed_out_grad.Resize(phi::make_ddim(x_dims_vec));
+    transformed_out_grad.mutable_data<T>(place);
+    MLUCnnlTensorDesc tmp_out_grad_desc(tmp_out_grad);
+    MLUCnnlTensorDesc transformed_out_grad_desc(transformed_out_grad);
+
+    MLUCnnl::BroadcastTo(context,
+                         tmp_out_grad_desc.get(),
+                         GetBasePtr(&tmp_out_grad),
+                         transformed_out_grad_desc.get(),
+                         GetBasePtr(&transformed_out_grad));
+
+    // compare
+    Tensor equal_cond;
+    equal_cond.mutable_data<bool>(x_grad->dims(), place);
+
+    MLUCnnlTensorDesc x_desc(*x);
+    MLUCnnlTensorDesc equal_cond_desc(equal_cond);
+
+    MLUCnnl::Logic(context,
+                   CNNL_LOGIC_OP_EQ,
+                   x_desc.get(),
+                   GetBasePtr(x),
+                   transformed_out_desc.get(),
+                   GetBasePtr(&transformed_out),
+                   equal_cond_desc.get(),
+                   GetBasePtr(&equal_cond));
+
+    // select
+    Tensor t_zero;
+    t_zero.mutable_data<T>(x_grad->dims(), place);
+    FillMLUTensorWithHostValue<T>(context, static_cast<T>(0), &t_zero);
+    t_zero.Resize(x_grad->dims());
+
+    MLUCnnlTensorDesc t_zero_desc(t_zero);
+    MLUCnnlTensorDesc x_grad_desc(*x_grad);
+
+    MLUCnnl::Select(context,
+                    equal_cond_desc.get(),
+                    GetBasePtr(&equal_cond),
+                    transformed_out_grad_desc.get(),
+                    GetBasePtr(&transformed_out_grad),
+                    t_zero_desc.get(),
+                    GetBasePtr(&t_zero),
+                    x_grad_desc.get(),
+                    GetBasePtr(x_grad));
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
...
@@ -102,3 +208,7 @@ REGISTER_OP_MLU_KERNEL(reduce_max,
                        ops::ReduceMaxMLUKernel<float>,
                        ops::ReduceMaxMLUKernel<plat::float16>,
                        ops::ReduceMaxMLUKernel<int>);
+REGISTER_OP_MLU_KERNEL(reduce_max_grad,
+                       ops::ReduceMaxGradMLUKernel<float>,
+                       ops::ReduceMaxGradMLUKernel<plat::float16>,
+                       ops::ReduceMaxGradMLUKernel<int>);
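Note: the new ReduceMaxGradMLUKernel implements the usual subgradient of a max reduction in three device steps: broadcast the reduced output and its gradient back to the input shape (BroadcastTo), mark where the input equals the broadcast maximum (Logic with CNNL_LOGIC_OP_EQ), and route out_grad to exactly those positions while writing zero elsewhere (Select). A minimal host-side sketch of the same rule for the reduce_all case (hypothetical reference code, not part of the commit):

#include <cstddef>
#include <vector>

// dx[i] = dy if x[i] == max(x), else 0 -- the rule the kernel expresses with
// BroadcastTo, Logic(CNNL_LOGIC_OP_EQ) and Select. As in the kernel, ties all
// receive the full upstream gradient.
std::vector<float> ReduceMaxGradReference(const std::vector<float>& x,
                                          float out, float out_grad) {
  std::vector<float> x_grad(x.size(), 0.0f);
  for (std::size_t i = 0; i < x.size(); ++i) {
    if (x[i] == out) x_grad[i] = out_grad;
  }
  return x_grad;
}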
paddle/fluid/operators/select_output_op.cc
View file @ dbe08e9b
...
@@ -93,7 +93,8 @@ class SelectOutputInferShape : public framework::InferShapeBase {
   void operator()(framework::InferShapeContext *context) const override {
     OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "SelectOutput");
     OP_INOUT_CHECK(context->HasInput("Mask"), "Input", "Mask", "SelectOutput");
-    OP_INOUT_CHECK(context->HasOutputs("Out"), "Output", "Out", "SelectOutput");
+    OP_INOUT_CHECK(
+        context->HasOutputs("Out", true), "Output", "Out", "SelectOutput");
   }
 };
...
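Note: the only change here is the extra true passed to HasOutputs, which appears to be the allow-null flag, so the infershape check now tolerates dispensable Out slots (for instance, a branch of the surrounding control-flow construct that produces no output) instead of failing on them.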
paddle/fluid/operators/strided_slice_op_mlu.cc
View file @ dbe08e9b
...
@@ -19,6 +19,11 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+using Tensor = phi::DenseTensor;
+using Variable = framework::Variable;
+using LoDTensorArray = framework::LoDTensorArray;
+using DDim = framework::DDim;
+
 static void ProcessStridedSliceParams(
     const std::vector<int>& axes,
     const DDim& input_dims,
...
paddle/fluid/operators/sum_op.cc
View file @ dbe08e9b
...
@@ -9,15 +9,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/sum_op.h"
-
 #include <algorithm>
 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <vector>
 
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/var_type_inference.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/multiary.h"
 
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
...
@@ -32,94 +34,6 @@ class SumOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "sum");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "sum");
-
-    if (ctx->IsRuntime() && ctx->GetOutputsVarType("Out")[0] ==
-                                framework::proto::VarType::LOD_TENSOR_ARRAY) {
-      return;  // skip runtime infershape when is tensor array;
-    }
-
-    auto x_var_types = ctx->GetInputsVarType("X");
-    auto x_dims = ctx->GetInputsDim("X");
-
-    auto N = x_dims.size();
-    PADDLE_ENFORCE_GT(
-        N,
-        0,
-        platform::errors::InvalidArgument(
-            "The input tensor X's dimensions of SumOp "
-            "should be larger than 0. But received X's dimensions %d, "
-            "X's shape = [%s].",
-            N,
-            &x_dims));
-    if (N == 1) {
-      VLOG(3) << "Warning: SumOp have only one input, may waste memory";
-    }
-
-    framework::DDim in_dim({0});
-    for (size_t i = 0; i < x_dims.size(); ++i) {
-      auto& x_dim = x_dims[i];
-      // x_dim.size() == 1 means the real dim of selected rows is [0]
-      if (x_var_types[i] == framework::proto::VarType::SELECTED_ROWS &&
-          x_dim.size() == 1) {
-        continue;
-      }
-      if (phi::product(x_dim) == 0) {
-        continue;
-      }
-      if (phi::product(in_dim) == 0) {
-        in_dim = x_dim;
-      } else {
-        if (ctx->IsRuntime()) {
-          PADDLE_ENFORCE_EQ(in_dim,
-                            x_dim,
-                            platform::errors::InvalidArgument(
-                                "The input tensor X of SumOp must"
-                                " have same shape. But received X[0]'s shape = "
-                                "[%s], X[%d]'s shape = [%s].",
-                                in_dim,
-                                i,
-                                x_dim));
-        } else {
-          PADDLE_ENFORCE_EQ(
-              in_dim.size(),
-              x_dim.size(),
-              platform::errors::InvalidArgument(
-                  "The input tensor X of SumOp must have same "
-                  "dimensions. But received X[0]'s dimensions = %d, X[0]'s "
-                  "shape = "
-                  "[%s], X[%d]'s dimensions = %d, X[%d]'s shape = [%s].",
-                  in_dim.size(),
-                  in_dim,
-                  i,
-                  x_dim.size(),
-                  i,
-                  x_dim));
-          // if in_dim or x_dim has -1, not check equal
-          for (int j = 0; j < x_dim.size(); ++j) {
-            if (x_dim[j] == -1 || in_dim[j] == -1) {
-              continue;
-            }
-            PADDLE_ENFORCE_EQ(
-                in_dim[j],
-                x_dim[j],
-                platform::errors::InvalidArgument(
-                    "The input tensor X of SumOp must have same shape "
-                    "if not -1."
-                    "But received X[0]'s shape = [%s], X[%d]'s shape = [%s].",
-                    in_dim,
-                    i,
-                    x_dim));
-          }
-        }
-      }
-    }
-    ctx->SetOutputDim("Out", in_dim);
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
...
@@ -350,18 +264,16 @@ DECLARE_INPLACE_OP_INFERER(SumInplaceInferer, {"X", "Out"});
 
 namespace ops = paddle::operators;
 
+DECLARE_INFER_SHAPE_FUNCTOR(sum,
+                            AddNInferShapeFunctor,
+                            PD_INFER_META(phi::AddNTensorArrayInferMeta));
+
 REGISTER_OPERATOR(sum,
                   ops::SumOp,
                   ops::SumOpMaker,
                   ops::SumGradDescMaker,
                   ops::SumGradOpBaseMaker,
                   ops::SumOpVarTypeInference,
-                  ops::SumInplaceInferer);
-
-REGISTER_OP_CPU_KERNEL(
-    sum,
-    ops::SumKernel<phi::CPUContext, float>,
-    ops::SumKernel<phi::CPUContext, double>,
-    ops::SumKernel<phi::CPUContext, int>,
-    ops::SumKernel<phi::CPUContext, paddle::platform::bfloat16>,
-    ops::SumKernel<phi::CPUContext, int64_t>);
+                  ops::SumInplaceInferer,
+                  AddNInferShapeFunctor);
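Note: the hand-written SumOp::InferShape (input-count check, per-dimension shape agreement with -1 wildcards, LoDTensorArray early-out) is deleted in favor of the functor wired up at the bottom: DECLARE_INFER_SHAPE_FUNCTOR routes shape inference to the shared phi::AddNTensorArrayInferMeta. The REGISTER_OP_CPU_KERNEL block goes away with it; presumably the computation is now served by the phi add_n kernels, which matches the deletion of sum_op.h and sum_op.cu below.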
paddle/fluid/operators/sum_op.cu
deleted 100644 → 0
View file @ b5499578

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <paddle/fluid/platform/device_context.h>

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/operators/sum_op.h"
#include "paddle/fluid/platform/float16.h"

namespace plat = paddle::platform;

namespace paddle {
namespace operators {

#define CEIL_DIV(x, y) (((x) + (y)-1) / (y))

using LoDTensor = framework::LoDTensor;

template <class T>
__global__ void Sum2CUDAKernel(const T *in_0,
                               const T *in_1,
                               T *out,
                               int64_t N) {
  int id = blockIdx.x * blockDim.x + threadIdx.x;
  while (id < N) {
    out[id] = in_0[id] + in_1[id];
    id += blockDim.x * gridDim.x;
  }
}

template <class T>
__global__ void SumArrayCUDAKernel(
    T **in, T *out, int64_t N, size_t in_size, bool read_dst) {
  int id = blockIdx.x * blockDim.x + threadIdx.x;
  while (id < N) {
    T total(read_dst ? out[id] : static_cast<T>(0));
    for (int i = 0; i < in_size; ++i) {
      const T *tmp = in[i];
      if (tmp) {
        total += tmp[id];
      }
    }
    out[id] = total;
    id += blockDim.x * gridDim.x;
  }
}

template <class T>
__global__ void SumSelectedRowsCUDAKernel(T **sr_in_out,
                                          int64_t N,
                                          size_t rows) {
  int id = blockIdx.x * blockDim.x + threadIdx.x;
  while (id < N) {
    for (int i = 0; i < 2 * rows; i += 2) {
      const T *tmp = sr_in_out[i];
      T *tmp_out = sr_in_out[i + 1];
      if (tmp && tmp_out) {
        tmp_out[id] += tmp[id];
      }
    }
    id += blockDim.x * gridDim.x;
  }
}

template <class T>
void SumToLoDTensor(const framework::ExecutionContext &context) {
  auto in_vars = context.MultiInputVar("X");
  const size_t in_num = in_vars.size();

  constexpr size_t theory_sm_threads = 1024;
  auto &dev_ctx = context.template device_context<phi::GPUContext>();
  auto stream = dev_ctx.stream();

  auto max_threads = dev_ctx.GetMaxPhysicalThreadCount();
  auto sm_count = max_threads / theory_sm_threads;
  size_t tile_size = 0;
  dim3 grids;
  dim3 blocks;

  auto ComputeKernelParameter = [&](size_t length) {
    if (length >= max_threads)
      tile_size = 1024;
    else if (length < max_threads && length > sm_count * 128)
      tile_size = 512;
    else if (length <= sm_count * 128)
      tile_size = 256;
    grids = dim3(CEIL_DIV(length, tile_size), 1, 1);
    blocks = dim3(tile_size, 1, 1);
  };

  auto *out = context.Output<LoDTensor>("Out");
  bool in_place = in_vars[0] == context.OutputVar("Out");

  if (!in_place) {
    auto *out_ptr = out->mutable_data<T>(context.GetPlace());
    if (in_num >= 1 && in_vars[0]->IsType<framework::LoDTensor>()) {
      auto &in_0_tensor = in_vars[0]->Get<framework::LoDTensor>();
      if (in_0_tensor.numel() > 0) {
        in_place = (in_0_tensor.data<T>() == out_ptr);
      }
    }
  }

  // Sum of two tensors
  if (in_num == 2 && in_vars[0]->IsType<framework::LoDTensor>() &&
      in_vars[1]->IsType<framework::LoDTensor>()) {
    auto &in_0 = in_vars[0]->Get<framework::LoDTensor>();
    auto &in_1 = in_vars[1]->Get<framework::LoDTensor>();
    int64_t length_0 = in_0.numel();
    int64_t length_1 = in_1.numel();
    if (length_0 && length_1 && in_0.IsInitialized() && in_1.IsInitialized()) {
      auto result = EigenVector<T>::Flatten(*out);
      auto &place = *dev_ctx.eigen_device();
      auto in_0_e = EigenVector<T>::Flatten(in_0);
      auto in_1_e = EigenVector<T>::Flatten(in_1);
      result.device(place) = in_0_e + in_1_e;
    } else if (length_0 && in_0.IsInitialized()) {
      auto result = EigenVector<T>::Flatten(*out);
      auto &place = *dev_ctx.eigen_device();
      result.device(place) = EigenVector<T>::Flatten(in_0);
    } else if (length_1 && in_1.IsInitialized()) {
      auto result = EigenVector<T>::Flatten(*out);
      auto &place = *dev_ctx.eigen_device();
      result.device(place) = EigenVector<T>::Flatten(in_1);
    }
    return;
  }

  int start = in_place ? 1 : 0;
  if (!in_place) {
    phi::funcs::SetConstant<phi::GPUContext, T> constant_functor;
    constant_functor(context.template device_context<phi::GPUContext>(),
                     out,
                     static_cast<T>(0));
  }

  std::vector<const T *> in_data;
  std::vector<int> selectrow_index;
  int64_t lod_length = 0;
  bool dst_write = false;
  for (int i = start; i < in_num; ++i) {
    if (in_vars[i]->IsType<framework::LoDTensor>()) {
      auto &in_i = in_vars[i]->Get<framework::LoDTensor>();
      lod_length = in_i.numel();
      if (lod_length && in_i.IsInitialized()) {
        in_data.emplace_back(in_i.data<T>());
      }
    } else if (in_vars[i]->IsType<phi::SelectedRows>()) {
      selectrow_index.push_back(i);
    }
  }

  // compute select rows separately.
  if (!selectrow_index.empty()) {
    std::vector<const T *> sr_in_out_data;
    size_t rows = 0;
    int64_t length = 0;
    for (auto index : selectrow_index) {
      auto &sr = in_vars[index]->Get<phi::SelectedRows>();
      auto &sr_value = sr.value();
      auto &sr_rows = sr.rows();

      auto row_numel = sr_value.numel() / sr_rows.size();
      auto out_dims = out->dims();

      PADDLE_ENFORCE_EQ(sr.height(),
                        out_dims[0],
                        platform::errors::InvalidArgument(
                            "The table height of input must be same as output, "
                            "but received input height is %d"
                            ", output height is %d",
                            sr.height(),
                            out_dims[0]));
      PADDLE_ENFORCE_EQ(row_numel,
                        out->numel() / sr.height(),
                        platform::errors::InvalidArgument(
                            "The table width of input must be same as output, "
                            "but received input width is %d"
                            ", output width is %d",
                            row_numel,
                            out->numel() / sr.height()));

      auto *sr_data = sr_value.data<T>();
      auto *sr_out_data = out->data<T>();
      rows += sr_rows.size();
      length = row_numel;

      for (size_t i = 0; i < sr_rows.size(); ++i) {
        sr_in_out_data.emplace_back(&sr_data[i * row_numel]);
        sr_in_out_data.emplace_back(&sr_out_data[sr_rows[i] * row_numel]);
      }
    }
    if (!sr_in_out_data.empty()) {
      auto tmp_sr_in_out_array = memory::Alloc(
          dev_ctx.GetPlace(),
          sr_in_out_data.size() * sizeof(T *),
          phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));

      memory::Copy(dev_ctx.GetPlace(),
                   tmp_sr_in_out_array->ptr(),
                   platform::CPUPlace(),
                   reinterpret_cast<void *>(sr_in_out_data.data()),
                   sr_in_out_data.size() * sizeof(T *),
                   dev_ctx.stream());

      T **sr_in_out_array_data =
          reinterpret_cast<T **>(tmp_sr_in_out_array->ptr());

      ComputeKernelParameter(length);
      SumSelectedRowsCUDAKernel<T>
          <<<grids, blocks, 0, stream>>>(sr_in_out_array_data, length, rows);
      dst_write = true;
    }
  }
  // if indata not null, merge into one kernel call.
  if (!in_data.empty()) {
    auto tmp_in_array = memory::Alloc(
        dev_ctx.GetPlace(),
        in_data.size() * sizeof(T *),
        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));

    memory::Copy(dev_ctx.GetPlace(),
                 tmp_in_array->ptr(),
                 platform::CPUPlace(),
                 reinterpret_cast<void *>(in_data.data()),
                 in_data.size() * sizeof(T *),
                 dev_ctx.stream());

    T **in_array_data = reinterpret_cast<T **>(tmp_in_array->ptr());
    ComputeKernelParameter(lod_length);
    SumArrayCUDAKernel<T><<<grids, blocks, 0, stream>>>(in_array_data,
                                                        out->data<T>(),
                                                        lod_length,
                                                        in_data.size(),
                                                        dst_write | in_place);
  }
}

template <typename T>
class SumKernel<phi::GPUContext, T> : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &context) const override {
    auto out_var = context.OutputVar("Out");

    if (out_var->IsType<framework::LoDTensor>()) {
      SumToLoDTensor<T>(context);
    } else if (out_var->IsType<phi::SelectedRows>()) {
      SelectedRowsCompute<phi::GPUContext, T>(context);
    } else if (out_var->IsType<framework::LoDTensorArray>()) {
      LodTensorArrayCompute<phi::GPUContext, T>(context);
    } else {
      PADDLE_THROW(platform::errors::InvalidArgument(
          "Expected type of Output(out) must be Tensor, SelectedRows or "
          "LodTensorArray. But got "
          "unsupport type: %s.",
          framework::ToTypeName(out_var->Type())));
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_CUDA_KERNEL(sum,
                        ops::SumKernel<phi::GPUContext, float>,
                        ops::SumKernel<phi::GPUContext, double>,
                        ops::SumKernel<phi::GPUContext, int>,
                        ops::SumKernel<phi::GPUContext, int64_t>,
                        ops::SumKernel<phi::GPUContext, plat::float16>,
                        ops::SumKernel<phi::GPUContext, plat::bfloat16>);
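Note: the heart of the deleted file is SumArrayCUDAKernel, a grid-stride loop over an array of device pointers: each thread accumulates one flattened offset across every non-null input, seeding from the existing output when read_dst is set (in-place first operand, or a prior SelectedRows pass already wrote into it). A host-side reference of the same contract (hypothetical sketch, not part of the commit):

#include <cstdint>
#include <vector>

// Reference semantics of SumArrayCUDAKernel:
//   out[id] = (read_dst ? out[id] : 0) + sum over non-null in[i] of in[i][id]
void SumArrayReference(const std::vector<const float*>& in, float* out,
                       std::int64_t n, bool read_dst) {
  for (std::int64_t id = 0; id < n; ++id) {
    float total = read_dst ? out[id] : 0.0f;
    for (const float* src : in) {
      if (src != nullptr) total += src[id];
    }
    out[id] = total;
  }
}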
paddle/fluid/operators/sum_op.h
deleted 100644 → 0
View file @ b5499578

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <vector>

#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/phi/kernels/funcs/math_function.h"

namespace paddle {
namespace operators {

using Tensor = framework::Tensor;
using SelectedRows = phi::SelectedRows;
using LoDTensor = framework::LoDTensor;
template <typename T,
          int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;

template <typename DeviceContext, typename T>
void SelectedRowsCompute(const framework::ExecutionContext &context) {
  auto in_vars = context.MultiInputVar("X");
  auto out_var = context.OutputVar("Out");
  bool in_place = out_var == in_vars[0];

  if (in_place && in_vars.size() < 2) {
    return;
  }

  std::vector<const phi::SelectedRows *> inputs;
  SelectedRows temp_in0;

  if (in_place) {
    auto &in0 = in_vars[0]->Get<phi::SelectedRows>();
    temp_in0.set_height(in0.height());
    temp_in0.set_rows(in0.rows());
    framework::TensorCopy(in0.value(),
                          in0.place(),
                          context.device_context(),
                          temp_in0.mutable_value());
    inputs.push_back(&temp_in0);
    for (size_t i = 1; i < in_vars.size(); ++i) {
      auto &in = in_vars[i]->Get<phi::SelectedRows>();
      if (in.rows().size() > 0) {
        inputs.push_back(&in);
      }
    }
  } else {
    for (auto &in_var : in_vars) {
      auto &in = in_var->Get<phi::SelectedRows>();
      if (in.rows().size() > 0) {
        inputs.push_back(&in_var->Get<phi::SelectedRows>());
      }
    }
  }

  auto *out = context.Output<phi::SelectedRows>("Out");
  out->mutable_rows()->clear();

  bool has_data = false;
  for (auto &in : inputs) {
    if (in->rows().size() > 0) {
      has_data = true;
      break;
    }
  }
  if (has_data) {
    math::scatter::MergeAdd<DeviceContext, T> merge_add;
    merge_add(context.template device_context<DeviceContext>(), inputs, out);

    out->SyncIndex();
  } else {
    // no data, just set a empty out tensor.
    out->mutable_value()->mutable_data<T>(phi::make_ddim({0}),
                                          context.GetPlace());
  }
}

template <typename DeviceContext, typename T>
void LodTensorArrayCompute(const framework::ExecutionContext &context) {
  auto in_vars = context.MultiInputVar("X");
  auto out_var = context.OutputVar("Out");
  bool in_place = out_var == in_vars[0];
  auto &out_array = *out_var->GetMutable<framework::LoDTensorArray>();
  for (size_t i = in_place ? 1 : 0; i < in_vars.size(); ++i) {
    PADDLE_ENFORCE_EQ(in_vars[i]->IsType<framework::LoDTensorArray>(),
                      true,
                      platform::errors::InvalidArgument(
                          "Only support all inputs are TensorArray, "
                          "but inputs[%d] is not TensorArray.",
                          i));
    auto &in_array = in_vars[i]->Get<framework::LoDTensorArray>();

    for (size_t i = 0; i < in_array.size(); ++i) {
      if (in_array[i].IsInitialized() && (in_array[i].numel() != 0)) {
        if (i >= out_array.size()) {
          out_array.resize(i + 1);
        }
        if (!out_array[i].IsInitialized() || (out_array[i].numel() == 0)) {
          framework::TensorCopy(in_array[i],
                                in_array[i].place(),
                                context.device_context(),
                                &out_array[i]);
          out_array[i].set_lod(in_array[i].lod());
        } else {
          PADDLE_ENFORCE_EQ(
              out_array[i].lod(),
              in_array[i].lod(),
              platform::errors::InvalidArgument(
                  "The lod message between inputs[%d] and"
                  " outputs[%d] must be same, but now is not same.",
                  i,
                  i));
          auto in = EigenVector<T>::Flatten(in_array[i]);
          auto result = EigenVector<T>::Flatten(out_array[i]);
          result.device(*context.template device_context<DeviceContext>()
                             .eigen_device()) = result + in;
        }
      }
    }
  }
}

template <typename DeviceContext, typename T>
class SumKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &context) const override {
    VLOG(10) << "start sum kernel";
    auto in_vars = context.MultiInputVar("X");
    size_t in_num = in_vars.size();
    auto out_var = context.OutputVar("Out");

    bool in_place = out_var == in_vars[0];

    if (out_var->IsType<framework::LoDTensor>()) {
      auto *out = out_var->GetMutable<framework::LoDTensor>();
      auto *out_ptr = out->mutable_data<T>(context.GetPlace());
      if (in_num >= 1 && in_vars[0]->IsType<framework::LoDTensor>() &&
          in_vars[0]->Get<framework::LoDTensor>().IsInitialized()) {
        auto &in_0_tensor = in_vars[0]->Get<framework::LoDTensor>();
        if (in_0_tensor.numel() > 0) {
          in_place = (in_0_tensor.data<T>() == out_ptr);
        }
      }

      auto result = EigenVector<T>::Flatten(*out);
      auto &place =
          *context.template device_context<DeviceContext>().eigen_device();
      int start = in_place ? 1 : 0;
      if (!in_place) {
        if ((in_num >= 2) && in_vars[0]->IsType<framework::LoDTensor>() &&
            in_vars[1]->IsType<framework::LoDTensor>() &&
            in_vars[0]->Get<framework::LoDTensor>().IsInitialized() &&
            in_vars[1]->Get<framework::LoDTensor>().IsInitialized()) {
          auto &in_0 = in_vars[0]->Get<framework::LoDTensor>();
          auto &in_1 = in_vars[1]->Get<framework::LoDTensor>();
          if (in_0.numel() && in_1.numel()) {
            auto in_0_e = EigenVector<T>::Flatten(in_0);
            auto in_1_e = EigenVector<T>::Flatten(in_1);
            result.device(place) = in_0_e + in_1_e;
            start = 2;
          }
        }
        if (start != 2) {
          VLOG(10) << "Fill with constant = 0 in sum kernel.";
          phi::funcs::SetConstant<DeviceContext, T> constant_functor;
          constant_functor(context.template device_context<DeviceContext>(),
                           out,
                           static_cast<T>(0));
        }
      }

      math::SelectedRowsAddToTensor<DeviceContext, T> functor;
      // If in_place, just skip the first tensor
      for (size_t i = start; i < in_num; i++) {
        if (in_vars[i]->IsType<framework::LoDTensor>()) {
          auto &in_t = in_vars[i]->Get<framework::LoDTensor>();
          if (!in_t.IsInitialized() || in_t.numel() == 0) {
            continue;
          }
          auto in = EigenVector<T>::Flatten(in_t);
          result.device(place) = result + in;
        } else if (in_vars[i]->IsType<phi::SelectedRows>()) {
          auto &in_t = in_vars[i]->Get<phi::SelectedRows>();
          functor(context.template device_context<DeviceContext>(), in_t, out);
        } else {
          PADDLE_THROW(platform::errors::InvalidArgument(
              "Expected type of Input(X) of %d-th must be Tensor, "
              "SelectedRows. But got "
              "unsupport type: %s.",
              framework::ToTypeName(in_vars[i]->Type())));
        }
      }
    } else if (out_var->IsType<phi::SelectedRows>()) {
      SelectedRowsCompute<DeviceContext, T>(context);
    } else if (out_var->IsType<framework::LoDTensorArray>()) {
      LodTensorArrayCompute<DeviceContext, T>(context);
    } else {
      PADDLE_THROW(platform::errors::InvalidArgument(
          "Expected type of Output(out) must be Tensor, SelectedRows, "
          "LoDTensorArray. But got "
          "unsupport type: %s.",
          framework::ToTypeName(out_var->Type())));
    }
    VLOG(10) << "end sum kernel";
  }
};
}  // namespace operators
}  // namespace paddle
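Note: SelectedRowsCompute above delegates the actual reduction to math::scatter::MergeAdd, which sums value rows that share a row index across all inputs. A toy model of that semantics (hypothetical sketch using a std::map instead of Paddle's SelectedRows):

#include <cstddef>
#include <cstdint>
#include <map>
#include <utility>
#include <vector>

// Each input is a sparse list of (row index, row values); rows appearing in
// several inputs are accumulated into one output row, as MergeAdd does.
using SparseRows = std::vector<std::pair<std::int64_t, std::vector<float>>>;

std::map<std::int64_t, std::vector<float>> MergeAddReference(
    const std::vector<SparseRows>& inputs, std::size_t row_width) {
  std::map<std::int64_t, std::vector<float>> merged;
  for (const SparseRows& in : inputs) {
    for (const auto& kv : in) {
      // Create a zero row on first sight of this index, then accumulate.
      auto& acc = merged.try_emplace(kv.first, row_width, 0.0f).first->second;
      for (std::size_t j = 0; j < row_width; ++j) acc[j] += kv.second[j];
    }
  }
  return merged;
}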
paddle/fluid/operators/sum_op_mlu.cc
View file @ dbe08e9b
...
@@ -12,13 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/mlu/mlu_baseop.h"
-#include "paddle/fluid/operators/sum_op.h"
 
 namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
+using SelectedRows = phi::SelectedRows;
+using LoDTensor = framework::LoDTensor;
 
 template <typename DeviceContext, typename T>
 class SumMLUKernel : public framework::OpKernel<T> {
...
paddle/fluid/operators/sum_op_npu.cc
View file @ dbe08e9b
...
@@ -16,13 +16,16 @@ limitations under the License. */
 #include <string>
 #include <vector>
 
-#include "paddle/fluid/operators/sum_op.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
+using SelectedRows = phi::SelectedRows;
+using LoDTensor = framework::LoDTensor;
 
 template <typename DeviceContext, typename T>
 class SumNPUKernel : public framework::OpKernel<T> {
...
paddle/fluid/operators/sum_op_xpu.cc
View file @ dbe08e9b
...
@@ -13,14 +13,16 @@ limitations under the License. */
 #include <vector>
 
-#include "paddle/fluid/operators/sum_op.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
 #include "paddle/fluid/platform/device/xpu/xpu_header.h"
 
 namespace paddle {
 namespace operators {
 
 using framework::Tensor;
+using SelectedRows = phi::SelectedRows;
+using LoDTensor = framework::LoDTensor;
 
 template <typename DeviceContext, typename T>
 class SumXPUKernel : public framework::OpKernel<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
...
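Note: the MLU, NPU and XPU sum kernels all make the same mechanical adjustment: with sum_op.h deleted, each file includes lod_tensor_array.h and op_registry.h directly and redeclares the SelectedRows and LoDTensor aliases that the deleted header used to provide.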