ModelZoo / MLPerf_BERT_paddle · Commits

Commit 581b8d15, authored Apr 10, 2023 by liangjing

version 1

Pipeline #169 failed with stages in 0 seconds
Changes: 161 · Pipelines: 1

Showing 20 changed files with 2701 additions and 0 deletions (+2701 −0)
external_ops/dist.bak/custom_setup_ops-0.0.0-py3.6-linux-x86_64.egg       +0 −0
external_ops/dist/custom_setup_ops-0.0.0-py3.6-linux-x86_64.egg           +0 −0
external_ops/env.sh                                                       +77 −0
external_ops/fmhalib/README.md                                            +9 −0
external_ops/fmhalib/fmha_cuda.cc                                         +142 −0
external_ops/fmhalib/fmha_cuda.cu                                         +0 −0
external_ops/fmhalib/test_fmha.py                                         +55 −0
external_ops/fused_dense_op/README.md                                     +12 −0
external_ops/fused_dense_op/fused_dense_cuda.cc                           +73 −0
external_ops/fused_dense_op/fused_dense_cuda.cu                           +879 −0
external_ops/fused_dense_op/test_fused_dense_op.py                        +296 −0
external_ops/fused_dense_op/test_fused_dense_perf.py                      +331 −0
external_ops/fused_dropout_residual_ln/README.md                          +16 −0
external_ops/fused_dropout_residual_ln/fused_dropout_residual_ln_cuda.cc  +78 −0
external_ops/fused_dropout_residual_ln/fused_dropout_residual_ln_cuda.cu  +154 −0
external_ops/fused_dropout_residual_ln/test_fused_dropout_op.py           +198 −0
external_ops/fused_dropout_residual_ln/test_fused_dropout_perf.py         +207 −0
external_ops/lr_op/lr_op_cuda.cc                                          +38 −0
external_ops/lr_op/lr_op_cuda.cu                                          +85 −0
external_ops/setup.py                                                     +51 −0
external_ops/dist.bak/custom_setup_ops-0.0.0-py3.6-linux-x86_64.egg · 0 → 100644
File added

external_ops/dist/custom_setup_ops-0.0.0-py3.6-linux-x86_64.egg · 0 → 100644
File added

external_ops/env.sh · 0 → 100644
#!/bin/bash
################################################################################
# Copyright 2016-2021 by SW Group, Chengdu Hygon IC Design Co., Ltd.
# All rights reserved. See COPYRIGHT for detailed information.
#
# @file set_env.sh
# @brief set env variables for running test.
#
# @author wangmingliang <wangmingliang@hygon.cn>
# @date 2022/03/23
# @history 1.0
################################################################################
export HYGON_ROCM_INSTALL=/opt/dtk-21.04

# library path
# fix LD_LIBRARY_PATH begin or end with colon
export LD_LIBRARY_PATH=$(echo ${LD_LIBRARY_PATH} | sed 's/:$//; s/^://;')
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/lib:${HYGON_ROCM_INSTALL}/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/hip/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/hipblas/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/hipcub/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/hipfft/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/hiprand/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/hipsolver/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/hipsparse/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/hsa/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/llvm/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/miopen/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/miopengemm/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/oam/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/opencl/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rccl/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocalution/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocblas/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocclr/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocfft/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocm_smi/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocprim/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocprofiler/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocrand/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocsolver/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocsparse/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocthrust/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/roctracer/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocblas/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export C_INCLUDE_PATH=${HYGON_ROCM_INSTALL}/rocrand/include:$C_INCLUDE_PATH
export CPLUS_INCLUDE_PATH=${HYGON_ROCM_INSTALL}/rocrand/include:$CPLUS_INCLUDE_PATH
export C_INCLUDE_PATH=${HYGON_ROCM_INSTALL}/hiprand/include:$C_INCLUDE_PATH
export CPLUS_INCLUDE_PATH=${HYGON_ROCM_INSTALL}/hiprand/include:$CPLUS_INCLUDE_PATH

# executable path
# fix PATH begin or end with colon
export PATH=$(echo ${PATH} | sed 's/:$//; s/^://;')
export PATH=${HYGON_ROCM_INSTALL}/bin${PATH:+:${PATH}}
export PATH=${HYGON_ROCM_INSTALL}/hip/bin${PATH:+:${PATH}}
export PATH=${HYGON_ROCM_INSTALL}/llvm/bin${PATH:+:${PATH}}
export PATH=${HYGON_ROCM_INSTALL}/llvm/lib/clang/13.0.0/bin${PATH:+:${PATH}}
export PATH=${HYGON_ROCM_INSTALL}/miopen/bin${PATH:+:${PATH}}
export PATH=${HYGON_ROCM_INSTALL}/opencl/bin${PATH:+:${PATH}}
export PATH=${HYGON_ROCM_INSTALL}/rocprofiler/bin${PATH:+:${PATH}}
export PATH=/public/home/zhangqha/bladisc/hmmer/bin${PATH:+:${PATH}}
export PATH=/public/home/zhangqha/bladisc/hh-suite-master/build/bin${PATH:+:${PATH}}
export PATH=/public/home/zhangqha/bladisc/kalign/build/bin${PATH:+:${PATH}}

# component path
export ROCM_PATH=${HYGON_ROCM_INSTALL}
export HSA_PATH=${HYGON_ROCM_INSTALL}/hsa
export HIP_PATH=${HYGON_ROCM_INSTALL}/hip
export HIP_ROCCLR_HOME=${HYGON_ROCM_INSTALL}/hip
export HIP_LIB_PATH=${HYGON_ROCM_INSTALL}/hip/lib
export DEVICE_LIB_PATH=${HYGON_ROCM_INSTALL}/amdgcn/bitcode
export HIP_CLANG_PATH=${HYGON_ROCM_INSTALL}/llvm/bin
export HIP_RUNTIME="rocclr"
export HIP_COMPILER="clang"

#export CPLUS_INCLUDE_PATH=/public/software/apps/DeepLearning/PyTorch_Lib/gflags-2.1.2-build/include:/public/home/zhangqha/openssl_install/include:/public/home/zhangqha/openssl_install/include/openssl:/public/software/apps/DeepLearning/PyTorch_Lib/glog-build/include:${HIP_PATH}/include:$CPLUS_INCLUDE_PATH
#export INCLUDE=/public/software/apps/DeepLearning/PyTorch_Lib/gflags-2.1.2-build/include:/public/home/zhangqha/openssl_install/include:/public/home/zhangqha/openssl_install/include/openssl:/public/software/apps/DeepLearning/PyTorch_Lib/glog-build/include:${HIP_PATH}/include:$INCLUDE
#export C_INCLUDE_PATH=/public/software/apps/DeepLearning/PyTorch_Lib/gflags-2.1.2-build/include:/public/home/zhangqha/openssl_install/include:/public/home/zhangqha/openssl_install/include/openssl:/public/software/apps/DeepLearning/PyTorch_Lib/glog-build/include:${HIP_PATH}/include:$C_INCLUDE_PATH
external_ops/fmhalib/README.md · 0 → 100644

# Preparation
Follow the PR below to produce fmhalib.so:
https://github.com/sneaxiy/apex/pull/1
# Build the fmha op
python setup.py install
# Unittest (needs improvement)
python test_fmha.py
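For orientation, a small sketch of the variable-length layout the op consumes — illustrative only, with made-up sequence lengths; the shapes follow the comment in fmha_cuda.cc below:

```python
# Illustrative only: the packed (no padding) QKV layout used by custom_fmha.
# Per the shape comment in fmha_cuda.cc: qkv is [total_tokens, 3, num_heads,
# head_size] (fp16) and cu_seqlen is [batch_size + 1] (int32).
import numpy as np

seq_lens = np.array([3, 5, 4], dtype=np.int32)  # hypothetical lengths
cu_seqlen = np.concatenate(([0], np.cumsum(seq_lens))).astype(np.int32)
# cu_seqlen == [0, 3, 8, 12]: cumulative token offsets; total_tokens == 12,
# so qkv would be packed into shape [12, 3, num_heads, head_size].
```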
external_ops/fmhalib/fmha_cuda.cc · 0 → 100644

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <vector>

#include "paddle/extension.h"

#define CHECK_INPUT(x)                                      \
  PD_CHECK(x.place().GetType() == phi::AllocationType::GPU, \
           #x " must be a GPU Tensor.")

std::vector<paddle::Tensor> fmha_cuda_forward(const paddle::Tensor& qkv,
                                              const paddle::Tensor& cu_seqlen,
                                              const paddle::Tensor& host_seqlen,
                                              bool is_test,
                                              float dropout_rate,
                                              bool zero_tensors,
                                              bool use_fmha_mke_opt);

std::vector<paddle::Tensor> fmha_cuda_backward(
    const paddle::Tensor& qkv,
    const paddle::Tensor& cu_seqlen,
    const paddle::Tensor& host_seqlen,
    const paddle::Tensor& softmax_input,
    const paddle::Tensor& d_ctx_out,
    bool is_test,
    float dropout_rate,
    bool zero_tensors,
    bool use_fmha_mke_opt);

/*
 * x_shape(fp16) = [total_tokens, 3, num_heads, head_size]
 * y_shape(int32) = [batch_size + 1]
 */
std::vector<std::vector<int64_t>> FmhaInferShape(
    const std::vector<int64_t>& x_shape,
    const std::vector<int64_t>& y_shape,
    const std::vector<int64_t>& host_y_shape,
    const bool& is_test,
    const float& dropout_rate,
    const bool& zero_tensors,
    const bool& use_fmha_mke_opt) {
  int total = x_shape[0];
  int num_heads = x_shape[2];
  int head_size = x_shape[3];
  int batch_size = y_shape[0] - 1;
  if (x_shape[1] != 3) {
    PD_THROW(
        "The shape for input QKV should be [total_tokens, 3, num_heads, "
        "head_size].");
  }
  int max_seq_len = 512;
  std::vector<int64_t> ctx_out_shape = {total, num_heads, head_size};
  std::vector<int64_t> s_out_shape = {batch_size, num_heads, max_seq_len,
                                      max_seq_len};
  return {ctx_out_shape, s_out_shape};
}

std::vector<paddle::DataType> FmhaInferDtype(paddle::DataType x_dtype,
                                             paddle::DataType y_dtype,
                                             paddle::DataType host_y_dtype) {
  return {x_dtype, x_dtype};
}

std::vector<paddle::Tensor> FmhaCUDAForward(const paddle::Tensor& qkv,
                                            const paddle::Tensor& cu_seqlen,
                                            const paddle::Tensor& host_seqlen,
                                            bool is_test,
                                            float dropout_rate,
                                            bool zero_tensors,
                                            bool use_fmha_mke_opt) {
  CHECK_INPUT(qkv);
  CHECK_INPUT(cu_seqlen);
  // Note: should not use CHECK_INPUT(max_seq_len_host),
  // because it will enforce this input to be GPU tensor
  return fmha_cuda_forward(qkv, cu_seqlen, host_seqlen, is_test, dropout_rate,
                           zero_tensors, use_fmha_mke_opt);
}

std::vector<paddle::Tensor> FmhaCUDABackward(
    const paddle::Tensor& qkv,
    const paddle::Tensor& cu_seqlen,
    const paddle::Tensor& host_seqlen,
    const paddle::Tensor& softmax_input,
    const paddle::Tensor& d_ctx_out,
    bool is_test,
    float dropout_rate,
    bool zero_tensors,
    bool use_fmha_mke_opt) {
  CHECK_INPUT(qkv);
  CHECK_INPUT(cu_seqlen);
  CHECK_INPUT(softmax_input);
  CHECK_INPUT(d_ctx_out);
  return fmha_cuda_backward(qkv, cu_seqlen, host_seqlen, softmax_input,
                            d_ctx_out, is_test, dropout_rate, zero_tensors,
                            use_fmha_mke_opt);
}

PD_BUILD_OP(custom_fmha)
    .Inputs({"QKV", "CuSeqLen", "HostSeqLen"})
    .Outputs({"CtxOut", "SOut"})
    .Attrs({"is_test: bool",
            "dropout_rate: float",
            "zero_tensors: bool",
            "use_fmha_mke_opt: bool"})
    .SetKernelFn(PD_KERNEL(FmhaCUDAForward))
    .SetInferShapeFn(PD_INFER_SHAPE(FmhaInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(FmhaInferDtype));

PD_BUILD_GRAD_OP(custom_fmha)
    .Inputs({"QKV", "CuSeqLen", "HostSeqLen", "SOut", paddle::Grad("CtxOut")})
    .Outputs({paddle::Grad("QKV")})
    .Attrs({"is_test: bool",
            "dropout_rate: float",
            "zero_tensors: bool",
            "use_fmha_mke_opt: bool"})
    .SetKernelFn(PD_KERNEL(FmhaCUDABackward));
external_ops/fmhalib/fmha_cuda.cu · 0 → 100644
external_ops/fmhalib/test_fmha.py · 0 → 100644

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from custom_setup_ops import custom_fmha
import numpy as np

total = 2
batch_size = 56
num_heads = 16
head_size = 64
is_training = True
max_seq_len = 512
dropout_rate = 0.1

cu_seqlen = np.arange(batch_size + 1)
cu_seqlen = np.cumsum(cu_seqlen)
total = cu_seqlen[-1]
#print("cu_seqlen", cu_seqlen)
#print("cu_seqlen[-1]", cu_seqlen[-1])
cu_seqlen = paddle.to_tensor(cu_seqlen)
cu_seqlen = paddle.cast(cu_seqlen, 'int32')

qkv = np.random.random((total, 3, num_heads, head_size)).astype(np.float16)
#print("qkv:", qkv)
qkv = paddle.to_tensor(qkv, stop_gradient=False)

max_seq_len_host = [max_seq_len]
max_seq_len_host = paddle.to_tensor(max_seq_len_host, dtype='int32',
                                    place=paddle.CPUPlace())

ctx_out, s_out = custom_fmha(qkv, cu_seqlen, max_seq_len_host, is_training,
                             dropout_rate, False)
print("print ctx_out and s_out: ")
print(ctx_out)
print(s_out)

# backward.
print("print qkv.grad: ")
grad_ctx_dout = np.random.random((total, num_heads, head_size)).astype(np.float16)
grad_ctx_dout = paddle.to_tensor(grad_ctx_dout)
paddle.autograd.backward([ctx_out], [grad_ctx_dout], retain_graph=True)
print(qkv.grad)
external_ops/fused_dense_op/README.md · 0 → 100644

# Function
Support gemm_nn/nt + bias with float16, float32 and float64 data types.
# Correctness test
`python test_fused_dense_op.py`
# Performance test
`python test_fused_dense_perf.py`
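A condensed usage sketch, mirroring the shapes exercised by the tests below; it assumes the ops have already been built and installed as `custom_setup_ops`:

```python
# Sketch: out = x @ weight^T + bias via the custom op (gemm_nt, fp16).
# transx must be False; transy=True means weight is [out_feature, in_feature].
import numpy as np
import paddle
from custom_setup_ops import custom_fused_dense

x = paddle.to_tensor(
    np.random.random((56, 1024)).astype(np.float16), stop_gradient=False)
weight = paddle.to_tensor(
    np.random.random((4096, 1024)).astype(np.float16), stop_gradient=False)
bias = paddle.to_tensor(
    np.random.random(4096).astype(np.float16), stop_gradient=False)

out = custom_fused_dense(x, weight, bias, False, True)  # shape [56, 4096]
```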
external_ops/fused_dense_op/fused_dense_cuda.cc · 0 → 100644

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <vector>

#include "paddle/extension.h"

// @x: [x, in_feature] or [xx, xx, in_feature]
// @y: [out_feature, in_feature]
// @out: [x, out_feature] or [xx, xx, out_feature]
// support transx=false, transy=true/false.
std::vector<std::vector<int64_t>> FusedDenseInferShape(
    const std::vector<int64_t>& x_shape,
    const std::vector<int64_t>& y_shape,
    const std::vector<int64_t>& bias_shape,
    const bool& transx,
    const bool& transy,
    const bool& use_addto) {
  int x_size = x_shape.size();
  int x_m = 1;
  for (int i = 0; i < (x_size - 1); i++) {
    x_m *= x_shape[i];
  }
  int x_k = x_shape[x_size - 1];
  int y_k = y_shape[0];
  int y_n = y_shape[1];
  if (transy) {
    y_k = y_shape[1];
    y_n = y_shape[0];
  }
  if (x_k != y_k) {
    PD_THROW("The reduce dim of A and B in matmul is not equal.");
  }
  if (transx) {
    PD_THROW("Only support cases: transx is False, transy is True/False.");
  }
  std::vector<int64_t> out_shape(x_shape);
  out_shape[x_size - 1] = y_n;
  return {out_shape};
}

std::vector<paddle::DataType> FusedDenseInferDtype(paddle::DataType x_dtype,
                                                   paddle::DataType y_dtype,
                                                   paddle::DataType bias_dtype) {
  return {x_dtype};
}

PD_BUILD_OP(custom_fused_dense)
    .Inputs({"X", "Y", "Bias"})
    .Outputs({"Out"})
    .Attrs({"transx: bool", "transy: bool", "use_addto: bool"})
    .SetInferShapeFn(PD_INFER_SHAPE(FusedDenseInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(FusedDenseInferDtype));

PD_BUILD_GRAD_OP(custom_fused_dense)
    .Inputs({"X", "Y", "Bias", paddle::Grad("Out")})
    .Outputs({paddle::Grad("X"), paddle::Grad("Y"), paddle::Grad("Bias")})
    .Attrs({"transx: bool", "transy: bool"});
external_ops/fused_dense_op/fused_dense_cuda.cu · 0 → 100644

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#define CUBLAS_VERSION 13000

/* Includes, cuda */
#include <cublas_v2.h>
#include <cuda_runtime.h>

#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11000
// includes cublaslt
#include <cublasLt.h>
#endif

#include "paddle/extension.h"
#include "paddle/fluid/framework/custom_raw_op_kernel_func.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"

#define CHECK_CUBLAS_ERR(error_code)                 \
  do {                                               \
    if (error_code != CUBLAS_STATUS_SUCCESS) {       \
      PD_THROW("cublas error code is ", error_code); \
    }                                                \
  } while (0)

// todo: allocate 4MB. (the following code looks like 4MB * sizeof(T)?)
constexpr auto kWorkspaceSize = (1 << 22);

// FP64 wrapper around cublas GEMMEx
// TODO(limin): in fact, alpha and beta are double type.
cublasStatus_t gemm_bias(cublasHandle_t handle, cublasOperation_t transa,
                         cublasOperation_t transb, int m, int n, int k,
                         const float* alpha, const double* A, int lda,
                         const double* B, int ldb, const float* beta,
                         double* C, int ldc) {
  return cublasGemmEx(handle, transa, transb, m, n, k, alpha, A, CUDA_R_64F,
                      lda, B, CUDA_R_64F, ldb, beta, C, CUDA_R_64F, ldc,
                      CUDA_R_64F, CUBLAS_GEMM_DEFAULT);
}

// FP32 wrapper around cublas GEMMEx
cublasStatus_t gemm_bias(cublasHandle_t handle, cublasOperation_t transa,
                         cublasOperation_t transb, int m, int n, int k,
                         const float* alpha, const float* A, int lda,
                         const float* B, int ldb, const float* beta,
                         float* C, int ldc) {
  return cublasGemmEx(handle, transa, transb, m, n, k, alpha, A, CUDA_R_32F,
                      lda, B, CUDA_R_32F, ldb, beta, C, CUDA_R_32F, ldc,
                      CUDA_R_32F, CUBLAS_GEMM_DEFAULT);
}

// FP16 tensor core wrapper around cublas GEMMEx
cublasStatus_t gemm_bias(cublasHandle_t handle, cublasOperation_t transa,
                         cublasOperation_t transb, int m, int n, int k,
                         const float* alpha, const paddle::float16* A, int lda,
                         const paddle::float16* B, int ldb, const float* beta,
                         paddle::float16* C, int ldc) {
  return cublasGemmEx(handle, transa, transb, m, n, k, alpha, A, CUDA_R_16F,
                      lda, B, CUDA_R_16F, ldb, beta, C, CUDA_R_16F, ldc,
                      CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
}

#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11600
// float16 and float32
template <typename T>
cublasStatus_t cublaslt_matmul_desc_init(
    cublasLtMatmulDescOpaque_t* operationDesc) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
  status = cublasLtMatmulDescInit(operationDesc, CUBLAS_COMPUTE_32F,
                                  CUDA_R_32F);
  return status;
}

// float64
template <>
cublasStatus_t cublaslt_matmul_desc_init<double>(
    cublasLtMatmulDescOpaque_t* operationDesc) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
  status = cublasLtMatmulDescInit(operationDesc, CUBLAS_COMPUTE_64F,
                                  CUDA_R_64F);
  return status;
}

// float16
template <typename T>
cublasStatus_t set_cublaslt_matrix_layout_init(
    cublasLtMatrixLayoutOpaque_t* Adesc, cublasLtMatrixLayoutOpaque_t* Bdesc,
    cublasLtMatrixLayoutOpaque_t* Cdesc, cublasOperation_t transa,
    cublasOperation_t transb, int m, int n, int k, int lda, int ldb, int ldc) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
  status = cublasLtMatrixLayoutInit(Adesc, CUDA_R_16F,
                                    transa == CUBLAS_OP_N ? m : k,
                                    transa == CUBLAS_OP_N ? k : m, lda);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(Bdesc, CUDA_R_16F,
                                    transb == CUBLAS_OP_N ? k : n,
                                    transb == CUBLAS_OP_N ? n : k, ldb);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(Cdesc, CUDA_R_16F, m, n, ldc);
CLEANUP:
  return status;
}

template <>
cublasStatus_t set_cublaslt_matrix_layout_init<float>(
    cublasLtMatrixLayoutOpaque_t* Adesc, cublasLtMatrixLayoutOpaque_t* Bdesc,
    cublasLtMatrixLayoutOpaque_t* Cdesc, cublasOperation_t transa,
    cublasOperation_t transb, int m, int n, int k, int lda, int ldb, int ldc) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
  status = cublasLtMatrixLayoutInit(Adesc, CUDA_R_32F,
                                    transa == CUBLAS_OP_N ? m : k,
                                    transa == CUBLAS_OP_N ? k : m, lda);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(Bdesc, CUDA_R_32F,
                                    transb == CUBLAS_OP_N ? k : n,
                                    transb == CUBLAS_OP_N ? n : k, ldb);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(Cdesc, CUDA_R_32F, m, n, ldc);
CLEANUP:
  return status;
}

template <>
cublasStatus_t set_cublaslt_matrix_layout_init<double>(
    cublasLtMatrixLayoutOpaque_t* Adesc, cublasLtMatrixLayoutOpaque_t* Bdesc,
    cublasLtMatrixLayoutOpaque_t* Cdesc, cublasOperation_t transa,
    cublasOperation_t transb, int m, int n, int k, int lda, int ldb, int ldc) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
  status = cublasLtMatrixLayoutInit(Adesc, CUDA_R_64F,
                                    transa == CUBLAS_OP_N ? m : k,
                                    transa == CUBLAS_OP_N ? k : m, lda);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(Bdesc, CUDA_R_64F,
                                    transb == CUBLAS_OP_N ? k : n,
                                    transb == CUBLAS_OP_N ? n : k, ldb);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(Cdesc, CUDA_R_64F, m, n, ldc);
CLEANUP:
  return status;
}
#endif

#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11000
template <typename T>
int gemm_bias_lt(cublasLtHandle_t ltHandle, cublasOperation_t transa,
                 cublasOperation_t transb, int m, int n, int k,
                 const float* alpha, /* host pointer */
                 const T* A, int lda, const T* B, int ldb,
                 const float* beta, /* host pointer */
                 T* C, int ldc, void* workspace, size_t workspaceSize,
                 cudaStream_t stream, bool use_bias, const void* bias) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;

  cublasLtMatmulDescOpaque_t operationDesc = {};
  cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
  cublasLtMatmulPreferenceOpaque_t preference = {};

  int returnedResults = 0;
  cublasLtMatmulHeuristicResult_t heuristicResult = {};
  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DEFAULT;

  // Create operation descriptor; see cublasLtMatmulDescAttributes_t
  // for details about defaults; here we just set the transforms for
  // A and B.
  status = cublaslt_matmul_desc_init<T>(&operationDesc);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(
      &operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(
      &operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  if (use_bias) {
    status = cublasLtMatmulDescSetAttribute(
        &operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias,
        sizeof(bias));
    if (status != CUBLAS_STATUS_SUCCESS) {
      goto CLEANUP;
    }
    epilogue = CUBLASLT_EPILOGUE_BIAS;
  }
  status = cublasLtMatmulDescSetAttribute(
      &operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue,
      sizeof(epilogue));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }

  // Create matrix descriptors. Not setting any extra attributes.
  status = set_cublaslt_matrix_layout_init<T>(&Adesc, &Bdesc, &Cdesc, transa,
                                              transb, m, n, k, lda, ldb, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // Create preference handle; In general, extra attributes can be
  // used here to disable tensor ops or to make sure algo selected
  // will work with badly aligned A, B, C. However, for simplicity
  // here we assume A,B,C are always well aligned (e.g., directly
  // come from cudaMalloc)
  status = cublasLtMatmulPreferenceInit(&preference);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulPreferenceSetAttribute(
      &preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize,
      sizeof(workspaceSize));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // We just need the best available heuristic to try and run matmul.
  // There is no guarantee that this will work. For example, if A is
  // badly aligned, you can request more (e.g. 32) algos and try to
  // run them one by one until something works.
  status = cublasLtMatmulAlgoGetHeuristic(
      ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference,
      1, &heuristicResult, &returnedResults);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (returnedResults == 0) {
    status = CUBLAS_STATUS_NOT_SUPPORTED;
    goto CLEANUP;
  }
  status = cublasLtMatmul(ltHandle, &operationDesc, alpha, A, &Adesc, B,
                          &Bdesc, beta, C, &Cdesc,
                          reinterpret_cast<void*>(C), &Cdesc,
                          // &heuristicResult.algo,
                          NULL, workspace, workspaceSize, stream);

CLEANUP:
  // Descriptors are no longer needed as all GPU work was already
  // enqueued.
  PADDLE_ENFORCE_GPU_SUCCESS(status);
  return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}
#endif

template <typename T>
static int linear_bias_cuda_forward_impl(
    const paddle::platform::CUDADeviceContext& dev_ctx, const T* input_data,
    const T* weight_data, const T* bias_data, bool transx, bool transy,
    int in_features, int batch_size, int out_features, T* output_data,
    void* lt_workspace) {
  auto handle = dev_ctx.cublas_handle();
  auto stream = dev_ctx.stream();
  const float alpha = 1.0;
  const float beta_zero = 0.0;
  const float beta_one = 1.0;
  int status = 1;
  // nt
  cublasOperation_t transpose_x = CUBLAS_OP_T;
  cublasOperation_t transpose_y = CUBLAS_OP_N;
  if (transy) {
#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11000
    status = gemm_bias_lt((cublasLtHandle_t)handle, transpose_x, transpose_y,
                          out_features, batch_size, in_features, &alpha,
                          /* host pointer */
                          weight_data, in_features, input_data, in_features,
                          &beta_zero,
                          /* host pointer */
                          output_data, out_features, lt_workspace,
                          kWorkspaceSize, stream, true, bias_data);
#endif
    if (status != 0) {
      PD_THROW("cublaslt gemm_bias failed with error code ", status);
#if 0
      output.copy_(bias);
      status = gemm_bias(
          handle,
          transpose_x,
          transpose_y,
          out_features,
          batch_size,
          in_features,
          &alpha,
          weight,
          in_features,
          input_data,
          in_features,
          &beta_one,
          output_data,
          out_features);
#endif
    }
  } else {
    // nn
    transpose_x = CUBLAS_OP_N;
    transpose_y = CUBLAS_OP_N;
#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11000
    status = gemm_bias_lt((cublasLtHandle_t)handle, transpose_x, transpose_y,
                          out_features, batch_size, in_features, &alpha,
                          /* host pointer */
                          weight_data, out_features, input_data, in_features,
                          &beta_zero,
                          /* host pointer */
                          output_data, out_features, lt_workspace,
                          kWorkspaceSize, stream, true, bias_data);
#endif
    if (status != 0) {
      PD_THROW("cublaslt gemm_bias failed with error code ", status);
    }
  }
  return status;
}

template <typename T>
int gemm_bgradb_lt(cublasLtHandle_t ltHandle, cublasOperation_t transa,
                   cublasOperation_t transb, int m, int n, int k,
                   const float* alpha, /* host pointer */
                   const T* A, int lda, const T* B, int ldb,
                   const float* beta, /* host pointer */
                   T* C, int ldc, void* workspace, size_t workspaceSize,
                   cudaStream_t stream, bool use_bias, const void* bgrad,
                   cublasLtEpilogue_t epilogue) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;

  cublasLtMatmulDescOpaque_t operationDesc = {};
  cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
  cublasLtMatmulPreferenceOpaque_t preference = {};

  int returnedResults = 0;
  cublasLtMatmulHeuristicResult_t heuristicResult = {};
  // cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DEFAULT;

  // Create operation descriptor; see cublasLtMatmulDescAttributes_t
  // for details about defaults; here we just set the transforms for
  // A and B.
  status = cublaslt_matmul_desc_init<T>(&operationDesc);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(
      &operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(
      &operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  if (use_bias) {
    status = cublasLtMatmulDescSetAttribute(
        &operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bgrad,
        sizeof(bgrad));
    if (status != CUBLAS_STATUS_SUCCESS) {
      goto CLEANUP;
    }
  }
  status = cublasLtMatmulDescSetAttribute(
      &operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue,
      sizeof(epilogue));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }

  // Create matrix descriptors. Not setting any extra attributes.
  status = set_cublaslt_matrix_layout_init<T>(&Adesc, &Bdesc, &Cdesc, transa,
                                              transb, m, n, k, lda, ldb, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // Create preference handle; In general, extra attributes can be
  // used here to disable tensor ops or to make sure algo selected
  // will work with badly aligned A, B, C. However, for simplicity
  // here we assume A,B,C are always well aligned (e.g., directly
  // come from cudaMalloc)
  status = cublasLtMatmulPreferenceInit(&preference);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulPreferenceSetAttribute(
      &preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize,
      sizeof(workspaceSize));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // We just need the best available heuristic to try and run matmul.
  // There is no guarantee that this will work. For example, if A is
  // badly aligned, you can request more (e.g. 32) algos and try to
  // run them one by one until something works.
  status = cublasLtMatmulAlgoGetHeuristic(
      ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference,
      1, &heuristicResult, &returnedResults);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (returnedResults == 0) {
    status = CUBLAS_STATUS_NOT_SUPPORTED;
    goto CLEANUP;
  }
  status = cublasLtMatmul(ltHandle, &operationDesc, alpha, A, &Adesc, B,
                          &Bdesc, beta, C, &Cdesc, C, &Cdesc,
                          // &heuristicResult.algo,
                          NULL, workspace, workspaceSize, stream);

CLEANUP:
  // Descriptors are no longer needed as all GPU work was already
  // enqueued.
  return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}

template <typename T>
int linear_bias_cuda_backward_impl(
    const paddle::platform::CUDADeviceContext& dev_ctx, const T* input,
    const T* weight, const T* d_output, bool transx, bool transy,
    bool use_addto, int in_features, int batch_size, int out_features,
    T* d_weight, T* d_bias, T* d_input, void* lt_workspace) {
  auto handle = dev_ctx.cublas_handle();
  auto stream = dev_ctx.stream();
  const float alpha = 1.0;
  const float beta_zero = 0.0;
  const float beta_one = 1.0;
  int status = 1;
  if (transy) {
#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11600
    // cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DEFAULT;
    cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_BGRADB;
    status = gemm_bgradb_lt((cublasLtHandle_t)handle, CUBLAS_OP_N, CUBLAS_OP_T,
                            in_features, out_features, batch_size, &alpha,
                            /* host pointer */
                            input, in_features, d_output, out_features,
                            &beta_zero,
                            /* host pointer */
                            d_weight, in_features, lt_workspace,
                            kWorkspaceSize, stream, true,
                            static_cast<const void*>(d_bias), epilogue);
#endif
    if (status != 0) {
      PD_THROW("cublaslt gemm_bias failed with error code ", status);
#if 0
      status = gemm_bias(
          handle,
          CUBLAS_OP_N,
          CUBLAS_OP_T,
          in_features,
          out_features,
          batch_size,
          &alpha,
          input,
          in_features,
          d_output,
          out_features,
          &beta_zero,
          d_weight,
          in_features);
#endif
    }
  } else {
#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11600
    cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_BGRADA;
    status = gemm_bgradb_lt((cublasLtHandle_t)handle, CUBLAS_OP_N, CUBLAS_OP_T,
                            out_features, in_features, batch_size, &alpha,
                            /* host pointer */
                            d_output, out_features, input, in_features,
                            &beta_zero,
                            /* host pointer */
                            d_weight, out_features, lt_workspace,
                            kWorkspaceSize, stream, true,
                            static_cast<const void*>(d_bias), epilogue);
#endif
    if (status != 0) {
      PD_THROW("cublaslt gemm_bias failed with error code ", status);
    }
  }
  cublasOperation_t transpose_x = CUBLAS_OP_N;
  cublasOperation_t transpose_y = CUBLAS_OP_N;
  const float beta_dinput = (use_addto ? beta_one : beta_zero);
  if (transy) {
    status = gemm_bias(handle, transpose_x, transpose_y, in_features,
                       batch_size, out_features, &alpha, weight, in_features,
                       d_output, out_features, &beta_dinput, d_input,
                       in_features);
  } else {
    transpose_x = CUBLAS_OP_T;
    transpose_y = CUBLAS_OP_N;
    status = gemm_bias(handle, transpose_x, transpose_y, in_features,
                       batch_size, out_features, &alpha, weight, out_features,
                       d_output, out_features, &beta_dinput, d_input,
                       in_features);
  }
  return status;
}

__PD_DEFINE_RAW_OP_KERNEL_FUNC(custom_fused_dense, ctx) {
  namespace f = paddle::framework;
  const auto* x = ctx.Input<f::Tensor>("X");
  const auto* y = ctx.Input<f::Tensor>("Y");
  const auto* bias = ctx.Input<f::Tensor>("Bias");
  auto* out = ctx.Output<f::Tensor>("Out");
  bool transx = ctx.Attr<bool>("transx");
  bool transy = ctx.Attr<bool>("transy");
  auto& dev_ctx = ctx.cuda_device_context();
  auto place = dev_ctx.GetPlace();
  if (transx) {
    PD_THROW("Attr(transx) must be False currently.");
  }
  const auto& x_dims = x->dims();
  int x_m = 1;
  for (int i = 0; i < x_dims.size() - 1; i++) {
    x_m *= x_dims[i];
  }
  int x_k = x_dims[x_dims.size() - 1];
  const auto& y_dims = y->dims();
  int y_k = y_dims[0];
  int y_n = y_dims[1];
  if (transy) {
    y_k = y_dims[1];
    y_n = y_dims[0];
  }
  if (x_k != y_k) {
    PD_THROW("The reduce dim of A and B in matmul is not equal.");
  }
  auto out_dims = x_dims;
  out_dims[x_dims.size() - 1] = y_n;
  out->Resize(out_dims);
  f::Tensor lt_workspace;
  lt_workspace.Resize({kWorkspaceSize});
  PD_DISPATCH_FLOATING_AND_HALF_TYPES(
      x->dtype(), "linear_bias_cuda_forward_impl", ([&] {
        linear_bias_cuda_forward_impl<data_t>(
            dev_ctx, x->data<data_t>(), y->data<data_t>(),
            bias->data<data_t>(), transx, transy, x_k, x_m, y_n,
            out->mutable_data<data_t>(place),
            lt_workspace.mutable_data<data_t>(place));
      }));
}

__PD_DEFINE_RAW_OP_KERNEL_FUNC(custom_fused_dense_grad, ctx) {
  namespace f = paddle::framework;
  const auto* x = ctx.Input<f::Tensor>("X");
  const auto* y = ctx.Input<f::Tensor>("Y");
  const auto* grad_out = ctx.Input<f::Tensor>(f::GradVarName("Out"));
  auto* grad_x = ctx.Output<f::Tensor>(f::GradVarName("X"));
  auto* grad_y = ctx.Output<f::Tensor>(f::GradVarName("Y"));
  auto* grad_bias = ctx.Output<f::Tensor>(f::GradVarName("Bias"));
  bool transx = ctx.Attr<bool>("transx");
  bool transy = ctx.Attr<bool>("transy");
  bool use_addto = ctx.Attr<bool>("use_addto");
  auto& dev_ctx = ctx.cuda_device_context();
  auto place = dev_ctx.GetPlace();
  if (transx) {
    PD_THROW("Attr(transx) must be False currently.");
  }
  const auto& x_dims = x->dims();
  int x_m = 1;
  for (int i = 0; i < x_dims.size() - 1; i++) {
    x_m *= x_dims[i];
  }
  int x_k = x_dims[x_dims.size() - 1];
  const auto& y_dims = y->dims();
  int y_k = y_dims[0];
  int y_n = y_dims[1];
  if (transy) {
    y_k = y_dims[1];
    y_n = y_dims[0];
  }
  if (x_k != y_k) {
    PD_THROW("The reduce dim of A and B in matmul is not equal.");
  }
  grad_x->Resize(x_dims);
  grad_y->Resize(y_dims);
  grad_bias->Resize({y_n});
  f::Tensor lt_workspace;
  lt_workspace.Resize({kWorkspaceSize});
#if defined(CUDA_VERSION) && CUDA_VERSION < 11000
  PD_THROW("fused_dense_cuda_backward is not supported on cuda_version < 11000");
#endif
  PD_DISPATCH_FLOATING_AND_HALF_TYPES(
      x->dtype(), "linear_bias_cuda_backward_impl", ([&] {
        linear_bias_cuda_backward_impl<data_t>(
            dev_ctx, x->data<data_t>(), y->data<data_t>(),
            grad_out->data<data_t>(), transx, transy, use_addto, x_k, x_m,
            y_n, grad_y->mutable_data<data_t>(place),
            grad_bias->mutable_data<data_t>(place),
            grad_x->mutable_data<data_t>(place),
            lt_workspace.mutable_data<data_t>(place));
      }));
}
external_ops/fused_dense_op/test_fused_dense_op.py · 0 → 100644

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from custom_setup_ops import custom_fused_dense
import numpy as np

iters = 1


def test_fused_dense_op(x, weight, bias, transx, transy, grad_out, atol):
    # x * weight^t + bias
    def run_paddle_mm_bias(x, weight, bias, transx, transy, grad_out):
        pp_x = paddle.to_tensor(x, stop_gradient=False)
        pp_weight = paddle.to_tensor(weight, stop_gradient=False)
        pp_bias = paddle.to_tensor(bias, stop_gradient=False)
        pp_out = paddle.matmul(pp_x, pp_weight, transx, transy)
        pp_bias_out = paddle.add(pp_out, pp_bias)
        pp_grad_out = paddle.to_tensor(grad_out)
        paddle.autograd.backward([pp_bias_out], [pp_grad_out],
                                 retain_graph=True)
        return pp_bias_out, pp_x.grad, pp_weight.grad, pp_bias.grad

    def run_custom_fuse_dense(x, weight, bias, transx, transy, grad_out):
        x_tensor = paddle.to_tensor(x, stop_gradient=False)
        weight_tensor = paddle.to_tensor(weight, stop_gradient=False)
        bias_tensor = paddle.to_tensor(bias, stop_gradient=False)
        out_tensor = custom_fused_dense(x_tensor, weight_tensor, bias_tensor,
                                        transx, transy)
        grad_out_tensor = paddle.to_tensor(grad_out)
        paddle.autograd.backward([out_tensor], [grad_out_tensor],
                                 retain_graph=True)
        return out_tensor, x_tensor.grad, weight_tensor.grad, bias_tensor.grad

    '''
    def run_numpy_mm_bias(x, weight, bias, transx, transy):
        out = np.matmul(x, weight.transpose(1, 0))
        out = out + bias
        return out
    '''

    def run_ref_backward(x, weight, bias, transx, transy, grad_out):
        x_tensor = paddle.to_tensor(x, stop_gradient=False)
        weight_tensor = paddle.to_tensor(weight, stop_gradient=False)
        grad_out_tensor = paddle.to_tensor(grad_out)
        # d_weight: x * grad_out^t (nt)
        # d_input: weight * grad_out (nn)
        if transy:
            ref_grad_weight = paddle.matmul(grad_out_tensor, x_tensor,
                                            True, False)
            ref_grad_input = paddle.matmul(grad_out_tensor, weight_tensor,
                                           False, False)
        else:
            ref_grad_weight = paddle.matmul(x_tensor, grad_out_tensor,
                                            True, False)
            ref_grad_input = paddle.matmul(grad_out_tensor, weight_tensor,
                                           False, True)
        ref_grad_bias = paddle.sum(grad_out_tensor, axis=0)
        return ref_grad_input, ref_grad_weight, ref_grad_bias

    '''
    ref_out = run_numpy_mm_bias(x, weight, bias, transx, transy)
    '''
    ref_pp_out, ref_pp_x_grad, ref_pp_weight_grad, ref_pp_bias_grad = \
        run_paddle_mm_bias(x, weight, bias, transx, transy, grad_out)
    #print("ref_pp_x_grad shape: ", ref_pp_x_grad.shape)
    custom_out, x_grad, weight_grad, bias_grad = run_custom_fuse_dense(
        x, weight, bias, transx, transy, grad_out)
    #print("x_grad shape: ", x_grad.shape)
    new_ref_grad_input, new_ref_grad_weight, new_ref_grad_bias = \
        run_ref_backward(x, weight, bias, transx, transy, grad_out)

    # check out
    '''
    np.testing.assert_allclose(
        ref_out, custom_out.numpy(),
        rtol=1e-5, atol=atol)
    '''
    np.testing.assert_allclose(ref_pp_out.numpy(), custom_out.numpy(),
                               rtol=1e-5, atol=atol)
    # check grad
    np.testing.assert_allclose(ref_pp_x_grad.numpy(), x_grad.numpy(),
                               rtol=1e-5, atol=atol)
    np.testing.assert_allclose(new_ref_grad_weight.numpy(),
                               weight_grad.numpy(), rtol=1e-5, atol=atol)
    np.testing.assert_allclose(new_ref_grad_bias.numpy(), bias_grad.numpy(),
                               rtol=1e-5, atol=atol)


def generate_input_data(m, dtype=np.float16):
    # index = np.random.randint(0, 5, (m))
    data = np.random.random((m)).astype(dtype)
    for i in range(m):
        #index[i] = 1
        #data[i] = 1.0/(np.exp2(index[i]))
        if i % 2 == 0:
            data[i] = 0.25
        elif i % 3 == 0:
            data[i] = 0.5
        else:
            data[i] = 0.0625
    return data


def generate_fixed_input(x_m, in_feature, out_feature, transy, dtype):
    x = generate_input_data(x_m * in_feature, dtype)
    x = x.reshape(x_m, in_feature)
    weight = generate_input_data(out_feature * in_feature, dtype)
    if transy:
        weight = weight.reshape(out_feature, in_feature)
    else:
        weight = weight.reshape(in_feature, out_feature)
    bias = generate_input_data(out_feature, dtype)
    grad_out = generate_input_data(x_m * out_feature, dtype)
    grad_out = grad_out.reshape(x_m, out_feature)
    return x, weight, bias, grad_out


def generate_ones_input(x_m, in_feature, out_feature, transy, dtype):
    x = np.ones(x_m * in_feature).astype(dtype)
    x = x.reshape(x_m, in_feature)
    weight = np.ones(out_feature * in_feature).astype(dtype)
    if transy:
        weight = weight.reshape(out_feature, in_feature)
    else:
        weight = weight.reshape(in_feature, out_feature)
    bias = np.ones(out_feature).astype(dtype)
    # bias = np.zeros(out_feature).astype(dtype)
    grad_out = np.ones(x_m * out_feature).astype(dtype)
    grad_out = grad_out.reshape(x_m, out_feature)
    return x, weight, bias, grad_out


def test_driver(i=0, x_m=56, in_feature=4, out_feature=8, transx=False,
                transy=True, dtype=np.float16, atol=1e-2):
    for i in range(iters):
        if i == 0:
            x, weight, bias, grad_out = generate_ones_input(
                x_m, in_feature, out_feature, transy, dtype)
        elif i == 1:
            x, weight, bias, grad_out = generate_fixed_input(
                x_m, in_feature, out_feature, transy, dtype)
        else:
            x = np.random.random((x_m, in_feature)).astype(dtype)
            if transy:
                weight = np.random.random(
                    (out_feature, in_feature)).astype(dtype)
            else:
                weight = np.random.random(
                    (in_feature, out_feature)).astype(dtype)
            bias = np.random.random((out_feature)).astype(dtype)
            grad_out = np.random.random((x_m, out_feature)).astype(dtype)
        test_fused_dense_op(x, weight, bias, transx, transy, grad_out, atol)


## Note: mlperf config: x_m from xx to 28672, in_feature is 1024, out_feature is 1024/4096.
for i in range(3):
    print("Begin Test ", i)
    if i == 0 or i == 1:
        fp16_atol = 1e-5
        atol = 1e-5
    else:
        fp16_atol = 0.3
        atol = 1e-3
    #####################################################
    ## nt
    ## random input: 0.2 is not right, should set to 0.3
    print("gemm_nt + bias test: ")
    test_driver(i, x_m=56, in_feature=1024, out_feature=1024, transx=False,
                transy=True, atol=fp16_atol)
    test_driver(i, x_m=56, in_feature=1024, out_feature=4096, transx=False,
                transy=True, atol=fp16_atol)
    test_driver(i, x_m=1000, in_feature=1024, out_feature=1024, transx=False,
                transy=True, atol=fp16_atol)
    ## for 0.0625 input, fp16 type's max error is 0.03125
    test_driver(i, x_m=2000, in_feature=1024, out_feature=1024, transx=False,
                transy=True, dtype=np.float32, atol=atol)
    test_driver(i, x_m=28672, in_feature=1024, out_feature=1024, transx=False,
                transy=True, dtype=np.float32, atol=atol)
    test_driver(i, x_m=28672, in_feature=1024, out_feature=4096, transx=False,
                transy=True, dtype=np.float32, atol=atol)
    #####################################################
    ## nn
    print("gemm_nn + bias test: ")
    test_driver(i, x_m=2, in_feature=1, out_feature=4, transx=False,
                transy=False, dtype=np.float32, atol=atol)
    test_driver(i, x_m=56, in_feature=1024, out_feature=1024, transx=False,
                transy=False, dtype=np.float32, atol=atol)
    test_driver(i, x_m=56, in_feature=1024, out_feature=4096, transx=False,
                transy=False, dtype=np.float32, atol=atol)
    test_driver(i, x_m=28672, in_feature=1024, out_feature=1024, transx=False,
                transy=False, dtype=np.float32, atol=atol)
    test_driver(i, x_m=28672, in_feature=1024, out_feature=4096, transx=False,
                transy=False, dtype=np.float32, atol=atol)
external_ops/fused_dense_op/test_fused_dense_perf.py · 0 → 100644

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from custom_setup_ops import custom_fused_dense
import numpy as np
import time

iters = 100
place = paddle.CUDAPlace(0)


def test_fused_dense_op(x, weight, bias, transx, transy, grad_out, atol):
    # x * weight^t + bias
    def run_paddle_mm_bias(x, weight, bias, transx, transy, grad_out):
        pp_x = paddle.to_tensor(x, stop_gradient=False)
        pp_weight = paddle.to_tensor(weight, stop_gradient=False)
        pp_bias = paddle.to_tensor(bias, stop_gradient=False)
        pp_grad_out = paddle.to_tensor(grad_out)
        # warmup
        pp_out = paddle.matmul(pp_x, pp_weight, transx, transy)
        pp_bias_out = paddle.add(pp_out, pp_bias)
        paddle.autograd.backward([pp_bias_out], [pp_grad_out],
                                 retain_graph=True)
        paddle.device.cuda.synchronize(place)
        t1 = time.time()
        for i in range(iters):
            pp_out = paddle.matmul(pp_x, pp_weight, transx, transy)
            pp_bias_out = paddle.add(pp_out, pp_bias)
            paddle.autograd.backward([pp_bias_out], [pp_grad_out],
                                     retain_graph=True)
        paddle.device.cuda.synchronize(place)
        t2 = time.time()
        print("paddle gemm time is: (ms)", (t2 - t1) * 1000)
        return pp_bias_out, pp_x.grad, pp_weight.grad, pp_bias.grad

    def run_custom_fuse_dense(x, weight, bias, transx, transy, grad_out):
        x_tensor = paddle.to_tensor(x, stop_gradient=False)
        weight_tensor = paddle.to_tensor(weight, stop_gradient=False)
        bias_tensor = paddle.to_tensor(bias, stop_gradient=False)
        grad_out_tensor = paddle.to_tensor(grad_out)
        # warmup
        out_tensor = custom_fused_dense(x_tensor, weight_tensor, bias_tensor,
                                        transx, transy)
        paddle.autograd.backward([out_tensor], [grad_out_tensor],
                                 retain_graph=True)
        # synchronize after warmup so the timed region matches the paddle path
        paddle.device.cuda.synchronize(place)
        t3 = time.time()
        for i in range(iters):
            out_tensor = custom_fused_dense(x_tensor, weight_tensor,
                                            bias_tensor, transx, transy)
            paddle.autograd.backward([out_tensor], [grad_out_tensor],
                                     retain_graph=True)
        paddle.device.cuda.synchronize(place)
        t4 = time.time()
        print("cublaslt gemm time is: (ms)", (t4 - t3) * 1000)
        return out_tensor, x_tensor.grad, weight_tensor.grad, bias_tensor.grad

    def run_ref_backward(x, weight, bias, transx, transy, grad_out):
        x_tensor = paddle.to_tensor(x, stop_gradient=False)
        weight_tensor = paddle.to_tensor(weight, stop_gradient=False)
        grad_out_tensor = paddle.to_tensor(grad_out)
        # d_weight: x * grad_out^t (nt)
        # d_input: weight * grad_out (nn)
        if transy:
            ref_grad_weight = paddle.matmul(grad_out_tensor, x_tensor,
                                            True, False)
            ref_grad_input = paddle.matmul(grad_out_tensor, weight_tensor,
                                           False, False)
        else:
            ref_grad_weight = paddle.matmul(x_tensor, grad_out_tensor,
                                            True, False)
            ref_grad_input = paddle.matmul(grad_out_tensor, weight_tensor,
                                           False, True)
        ref_grad_bias = paddle.sum(grad_out_tensor, axis=0)
        return ref_grad_input, ref_grad_weight, ref_grad_bias

    #print("ref_pp_x_grad shape: ", ref_pp_x_grad.shape)
    custom_out, x_grad, weight_grad, bias_grad = run_custom_fuse_dense(
        x, weight, bias, transx, transy, grad_out)
    ref_pp_out, ref_pp_x_grad, ref_pp_weight_grad, ref_pp_bias_grad = \
        run_paddle_mm_bias(x, weight, bias, transx, transy, grad_out)


def generate_input_data(m, dtype=np.float16):
    # index = np.random.randint(0, 5, (m))
    data = np.random.random((m)).astype(dtype)
    for i in range(m):
        #index[i] = 1
        #data[i] = 1.0/(np.exp2(index[i]))
        if i % 2 == 0:
            data[i] = 0.25
        elif i % 3 == 0:
            data[i] = 0.5
        else:
            data[i] = 0.0625
    return data


def generate_fixed_input(x_m, in_feature, out_feature, transy, dtype):
    x = generate_input_data(x_m * in_feature, dtype)
    x = x.reshape(x_m, in_feature)
    weight = generate_input_data(out_feature * in_feature, dtype)
    if transy:
        weight = weight.reshape(out_feature, in_feature)
    else:
        weight = weight.reshape(in_feature, out_feature)
    bias = generate_input_data(out_feature, dtype)
    grad_out = generate_input_data(x_m * out_feature, dtype)
    grad_out = grad_out.reshape(x_m, out_feature)
    return x, weight, bias, grad_out


def generate_ones_input(x_m, in_feature, out_feature, transy, dtype):
    x = np.ones(x_m * in_feature).astype(dtype)
    x = x.reshape(x_m, in_feature)
    weight = np.ones(out_feature * in_feature).astype(dtype)
    if transy:
        weight = weight.reshape(out_feature, in_feature)
    else:
        weight = weight.reshape(in_feature, out_feature)
    bias = np.ones(out_feature).astype(dtype)
    # bias = np.zeros(out_feature).astype(dtype)
    grad_out = np.ones(x_m * out_feature).astype(dtype)
    grad_out = grad_out.reshape(x_m, out_feature)
    return x, weight, bias, grad_out


def test_driver(i=0, x_m=56, in_feature=4, out_feature=8, transx=False,
                transy=True, dtype=np.float16, atol=1e-2):
    print("m, in_feature, out_feature = ", x_m, in_feature, out_feature)
    if i == 0:
        x, weight, bias, grad_out = generate_ones_input(
            x_m, in_feature, out_feature, transy, dtype)
    elif i == 1:
        x, weight, bias, grad_out = generate_fixed_input(
            x_m, in_feature, out_feature, transy, dtype)
    else:
        x = np.random.random((x_m, in_feature)).astype(dtype)
        if transy:
            weight = np.random.random((out_feature, in_feature)).astype(dtype)
        else:
            weight = np.random.random((in_feature, out_feature)).astype(dtype)
        bias = np.random.random((out_feature)).astype(dtype)
        grad_out = np.random.random((x_m, out_feature)).astype(dtype)
    test_fused_dense_op(x, weight, bias, transx, transy, grad_out, atol)
    print("\n")


## Note: mlperf config: x_m from xx to 28672, in_feature is 1024, out_feature is 1024/4096.
i = 0
fp16_atol = 1e-5
atol = 1e-5
data_type = np.float16

###############################################################################
## nt
## random input: 0.2 is not right, should set to 0.3
print("gemm_nt + bias test: ")
test_driver(i, x_m=56, in_feature=1024, out_feature=1024, transx=False,
            transy=True, dtype=data_type, atol=atol)
test_driver(i, x_m=1000, in_feature=1024, out_feature=1024, transx=False,
            transy=True, dtype=data_type, atol=atol)
test_driver(i, x_m=2000, in_feature=1024, out_feature=1024, transx=False,
            transy=True, dtype=data_type, atol=atol)
test_driver(i, x_m=4000, in_feature=1024, out_feature=1024, transx=False,
            transy=True, dtype=data_type, atol=atol)
test_driver(i, x_m=8000, in_feature=1024, out_feature=1024, transx=False,
            transy=True, dtype=data_type, atol=atol)
test_driver(i, x_m=16000, in_feature=1024, out_feature=1024, transx=False,
            transy=True, dtype=data_type, atol=atol)
test_driver(i, x_m=28672, in_feature=1024, out_feature=1024, transx=False,
            transy=True, dtype=data_type, atol=atol)
test_driver(i, x_m=28672, in_feature=1024, out_feature=4096, transx=False,
            transy=True, dtype=data_type, atol=atol)

###############################################################################
## nn
print("gemm_nn + bias test: ")
test_driver(i, x_m=56, in_feature=1024, out_feature=1024, transx=False,
            transy=False, dtype=data_type, atol=atol)
test_driver(i, x_m=1000, in_feature=1024, out_feature=1024, transx=False,
            transy=False, dtype=data_type, atol=atol)
test_driver(i, x_m=2000, in_feature=1024, out_feature=1024, transx=False,
            transy=False, dtype=data_type, atol=atol)
test_driver(i, x_m=4000, in_feature=1024, out_feature=1024, transx=False,
            transy=False, dtype=data_type, atol=atol)
test_driver(i, x_m=8000, in_feature=1024, out_feature=1024, transx=False,
            transy=False, dtype=data_type, atol=atol)
test_driver(i, x_m=16000, in_feature=1024, out_feature=1024, transx=False,
            transy=False, dtype=data_type, atol=atol)
test_driver(i, x_m=28672, in_feature=1024, out_feature=1024, transx=False,
            transy=False, dtype=data_type, atol=atol)
test_driver(i, x_m=28672, in_feature=1024, out_feature=4096, transx=False,
            transy=False, dtype=data_type, atol=atol)
external_ops/fused_dropout_residual_ln/README.md
# Function
This fused op implements the following logic (a minimal reference sketch is given at the end of this README):
```
layer_norm(residual + dropout(input))
```
# Accuracy test
python test_fused_dropout_op.py
# Perf test
python test_fused_dropout_perf.py
# TODO
1. In the unittest, we only test accuracy when dropout_rate is 0.
2. How do we set is_test to true for dropout in the eval phase?
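For reference, here is a minimal NumPy sketch of the fused computation for the dropout_rate=0 case covered by the unittest (`fused_ref` is an illustrative name, not part of the op's API):
```python
import numpy as np

# layer_norm(residual + dropout(input)) with dropout_rate=0,
# so dropout reduces to the identity.
def fused_ref(x, residual, ln_scale, ln_bias, epsilon=1e-5):
    add_out = x + residual
    mean = add_out.mean(axis=-1, keepdims=True)
    var = add_out.var(axis=-1, keepdims=True)
    normed = (add_out - mean) / np.sqrt(var + epsilon)
    return normed * ln_scale + ln_bias
```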
external_ops/fused_dropout_residual_ln/fused_dropout_residual_ln_cuda.cc
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "paddle/extension.h"
std::vector<std::vector<int64_t>> FusedDropoutResidualLnInferShape(
    const std::vector<int64_t>& x_shape,
    const std::vector<int64_t>& residual_shape,
    const std::vector<int64_t>& ln_scale_shape,
    const std::vector<int64_t>& ln_bias_shape) {
  int x_m = 1;
  for (int i = 0; i < x_shape.size() - 1; i++) {
    x_m *= x_shape[i];
  }
  const std::vector<int64_t> ln_out_shape = {x_m};
  return {x_shape, x_shape, ln_out_shape, ln_out_shape, x_shape};
}

// TODO: ln_scale is fp16 for now; how to set ln_mean and ln_var to fp32?
std::vector<paddle::DataType> FusedDropoutResidualLnInferDtype(
    paddle::DataType x_dtype,
    paddle::DataType residual_dtype,
    paddle::DataType ln_scale_dtype,
    paddle::DataType ln_bias_dtype) {
  // The type of ln_mean/ln_var is the same as ln_scale.
  return {x_dtype,
          paddle::DataType::UINT8,
          paddle::DataType::FLOAT32,
          paddle::DataType::FLOAT32,
          // ln_scale_dtype,
          // ln_scale_dtype,
          x_dtype};
}

PD_BUILD_OP(custom_fused_dropout_residual_ln)
    .Inputs({"X", "Residual", "LnScale", "LnBias"})
    .Outputs({"Out", "DropoutMask", "LnMean", "LnVar", "DropoutResidualOut"})
    .Attrs({"ln_epsilon: float",
            "is_test: bool",
            "fix_seed: bool",
            "seed_val: int",
            "is_upscale_in_train: bool",
            "dropout_rate: float"})
    .SetInferShapeFn(PD_INFER_SHAPE(FusedDropoutResidualLnInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(FusedDropoutResidualLnInferDtype));

PD_BUILD_GRAD_OP(custom_fused_dropout_residual_ln)
    .Inputs({"X", "Residual", "LnScale", "LnBias", "DropoutMask", "LnMean",
             "LnVar", "DropoutResidualOut", paddle::Grad("Out")})
    .Outputs({paddle::Grad("X"), paddle::Grad("Residual"),
              paddle::Grad("LnScale"), paddle::Grad("LnBias")})
    .Attrs({"ln_epsilon: float",
            "is_test: bool",
            "fix_seed: bool",
            "seed_val: int",
            "is_upscale_in_train: bool",
            "dropout_rate: float"});
external_ops/fused_dropout_residual_ln/fused_dropout_residual_ln_cuda.cu
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/extension.h"
#include "paddle/fluid/framework/custom_raw_op_kernel_func.h"
#include "paddle/fluid/operators/fused/fused_dropout_helper.h"
__PD_DEFINE_RAW_OP_KERNEL_FUNC(custom_fused_dropout_residual_ln, ctx) {
  namespace f = paddle::framework;
  const auto& x = *ctx.Input<f::Tensor>("X");
  const auto& residual = *ctx.Input<f::Tensor>("Residual");
  const auto& ln_scale = *ctx.Input<f::Tensor>("LnScale");
  const auto& ln_bias = *ctx.Input<f::Tensor>("LnBias");
  auto& final_out = *ctx.Output<f::Tensor>("Out");
  auto& dropout_mask_out = *ctx.Output<f::Tensor>("DropoutMask");
  auto& ln_mean = *ctx.Output<f::Tensor>("LnMean");
  auto& ln_var = *ctx.Output<f::Tensor>("LnVar");
  auto& dropout_residual_out = *ctx.Output<f::Tensor>("DropoutResidualOut");

  auto ln_epsilon = ctx.Attr<float>("ln_epsilon");
  auto is_test = ctx.Attr<bool>("is_test");
  auto fix_seed = ctx.Attr<bool>("fix_seed");
  auto seed_val = ctx.Attr<int>("seed_val");
  auto is_upscale_in_train = ctx.Attr<bool>("is_upscale_in_train");
  auto dropout_rate = ctx.Attr<float>("dropout_rate");

  auto& dev_ctx = ctx.cuda_device_context();
  auto place = dev_ctx.GetPlace();

  // inputs
  const auto& x_dims = x.dims();
  int x_m = 1;
  for (int i = 0; i < x_dims.size() - 1; i++) {
    x_m *= x_dims[i];
  }
  int x_n = x_dims[x_dims.size() - 1];

  // outputs
  final_out.Resize(x_dims);
  dropout_mask_out.Resize(x_dims);
  dropout_residual_out.Resize(x_dims);
  ln_mean.Resize({x_m});
  ln_var.Resize({x_m});

  paddle::operators::DropoutParam dropout_param(
      fix_seed, 0, is_test, is_upscale_in_train, dropout_rate, nullptr,
      seed_val);
  PD_DISPATCH_FLOATING_AND_HALF_TYPES(
      x.dtype(), "LayernormResidualDropoutBias", ([&] {
        paddle::operators::FusedDropoutLayerNormHelper<data_t, uint8_t>
            fused_dropout_layernorm_helper(dev_ctx, x_m, x_n, dropout_param,
                                           ln_epsilon);
        fused_dropout_layernorm_helper.LayernormResidualDropoutBias<data_t, true>(
            dev_ctx,
            x.data<data_t>(),         // out_linear_out_data
            residual.data<data_t>(),  // residual_data
            nullptr,                  // bias_data
            ln_scale.data<data_t>(),
            ln_bias.data<data_t>(),
            dev_ctx.Alloc<data_t>(&dropout_residual_out),
            dev_ctx.Alloc<uint8_t>(&dropout_mask_out),
            dev_ctx.Alloc<data_t>(&final_out),
            dev_ctx.Alloc<paddle::operators::LayerNormParamType<data_t>>(&ln_mean),
            dev_ctx.Alloc<paddle::operators::LayerNormParamType<data_t>>(&ln_var));
      }));
}

__PD_DEFINE_RAW_OP_KERNEL_FUNC(custom_fused_dropout_residual_ln_grad, ctx) {
  namespace f = paddle::framework;
  const auto& ln_scale = *ctx.Input<f::Tensor>("LnScale");
  const auto& dropout_mask_out = *ctx.Input<f::Tensor>("DropoutMask");
  const auto& ln_mean = *ctx.Input<f::Tensor>("LnMean");
  const auto& ln_var = *ctx.Input<f::Tensor>("LnVar");
  const auto& dropout_residual_out = *ctx.Input<f::Tensor>("DropoutResidualOut");
  const auto& grad_out = *ctx.Input<f::Tensor>(f::GradVarName("Out"));
  auto& grad_x = *ctx.Output<f::Tensor>(f::GradVarName("X"));
  auto& grad_residual = *ctx.Output<f::Tensor>(f::GradVarName("Residual"));
  auto& grad_ln_scale = *ctx.Output<f::Tensor>(f::GradVarName("LnScale"));
  auto& grad_ln_bias = *ctx.Output<f::Tensor>(f::GradVarName("LnBias"));
  f::Tensor grad_dropout_residual_out;

  auto ln_epsilon = ctx.Attr<float>("ln_epsilon");
  auto is_test = ctx.Attr<bool>("is_test");
  auto fix_seed = ctx.Attr<bool>("fix_seed");
  auto seed_val = ctx.Attr<int>("seed_val");
  auto is_upscale_in_train = ctx.Attr<bool>("is_upscale_in_train");
  auto dropout_rate = ctx.Attr<float>("dropout_rate");

  auto& dev_ctx = ctx.cuda_device_context();
  auto place = dev_ctx.GetPlace();

  const auto& x_dims = grad_out.dims();
  int x_m = 1;
  for (int i = 0; i < x_dims.size() - 1; i++) {
    x_m *= x_dims[i];
  }
  int x_n = x_dims[x_dims.size() - 1];

  // outputs
  grad_x.Resize(x_dims);
  grad_residual.Resize(x_dims);
  grad_dropout_residual_out.Resize(x_dims);
  grad_ln_scale.Resize(ln_scale.dims());
  grad_ln_bias.Resize(ln_scale.dims());

  paddle::operators::DropoutParam dropout_param(
      fix_seed, 0, is_test, is_upscale_in_train, dropout_rate, nullptr,
      seed_val);
  PD_DISPATCH_FLOATING_AND_HALF_TYPES(
      grad_out.dtype(), "LayernormResidualDropoutBiasGrad", ([&] {
        paddle::operators::FusedDropoutLayerNormHelper<data_t, uint8_t>
            fused_dropout_layernorm_helper(dev_ctx, x_m, x_n, dropout_param,
                                           ln_epsilon);
        fused_dropout_layernorm_helper.LayernormResidualDropoutBiasGrad<data_t, true>(
            dev_ctx,
            grad_out.data<data_t>(),
            dropout_residual_out.data<data_t>(),
            dropout_mask_out.data<uint8_t>(),
            ln_scale.data<data_t>(),
            ln_mean.data<paddle::operators::LayerNormParamType<data_t>>(),
            ln_var.data<paddle::operators::LayerNormParamType<data_t>>(),
            dev_ctx.Alloc<data_t>(&grad_dropout_residual_out),
            dev_ctx.Alloc<data_t>(&grad_ln_scale),
            dev_ctx.Alloc<data_t>(&grad_ln_bias),
            dev_ctx.Alloc<data_t>(&grad_x),  // d_out_linear_out_data
            nullptr,                         // d_out_linear_bias_data
            dev_ctx.Alloc<data_t>(&grad_residual));
      }));
}
external_ops/fused_dropout_residual_ln/test_fused_dropout_op.py
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from custom_setup_ops import custom_fused_dropout_residual_ln
import numpy as np

iters = 2


def test_fused_dropout_op(x, residual, ln_scale, ln_bias, epsilon, dropout_rate, grad_out, atol):
    # layer_norm(dropout(x) + residual)
    def run_paddle_op(x, residual, ln_scale, ln_bias, epsilon, dropout_rate, grad_out):
        pp_x = paddle.to_tensor(x, stop_gradient=False)
        pp_residual = paddle.to_tensor(residual, stop_gradient=False)
        pp_ln_scale = paddle.to_tensor(ln_scale, stop_gradient=False)
        pp_ln_bias = paddle.to_tensor(ln_bias, stop_gradient=False)
        pp_grad_out = paddle.to_tensor(grad_out)
        if dropout_rate > 0:
            pp_out = paddle.nn.functional.dropout(pp_x, dropout_rate)
            pp_add_out = paddle.add(pp_out, pp_residual)
        else:
            pp_add_out = paddle.add(pp_x, pp_residual)
        pp_out = paddle.nn.functional.layer_norm(
            pp_add_out, pp_add_out.shape[1:], pp_ln_scale, pp_ln_bias, epsilon)
        paddle.autograd.backward([pp_out], [pp_grad_out], retain_graph=True)
        return pp_out, pp_add_out, pp_x.grad, pp_residual.grad, pp_ln_scale.grad, pp_ln_bias.grad

    def run_custom_fuse_dropout_op(x, residual, ln_scale, ln_bias, epsilon, dropout_rate, grad_out):
        x_tensor = paddle.to_tensor(x, stop_gradient=False)
        residual_tensor = paddle.to_tensor(residual, stop_gradient=False)
        ln_scale_tensor = paddle.to_tensor(ln_scale, stop_gradient=False)
        ln_bias_tensor = paddle.to_tensor(ln_bias, stop_gradient=False)
        grad_out_tensor = paddle.to_tensor(grad_out)
        # Note: use the default config of dropout.
        is_test = False
        fix_seed = True
        is_upscale_in_train = True
        seed_val = 0
        out_tensor, dropout_mask, ln_mean, ln_var, dropout_residual_out = custom_fused_dropout_residual_ln(
            x_tensor, residual_tensor, ln_scale_tensor, ln_bias_tensor, epsilon,
            is_test, fix_seed, seed_val, is_upscale_in_train, dropout_rate)
        paddle.autograd.backward([out_tensor], [grad_out_tensor], retain_graph=True)
        return out_tensor, dropout_residual_out, x_tensor.grad, residual_tensor.grad, ln_scale_tensor.grad, ln_bias_tensor.grad

    pp_out, pp_add_out, pp_x_grad, pp_residual_grad, pp_ln_scale_grad, pp_ln_bias_grad = run_paddle_op(
        x, residual, ln_scale, ln_bias, epsilon, dropout_rate, grad_out)
    out, add_out, x_grad, residual_grad, ln_scale_grad, ln_bias_grad = run_custom_fuse_dropout_op(
        x, residual, ln_scale, ln_bias, epsilon, dropout_rate, grad_out)

    # check out
    np.testing.assert_allclose(pp_out.numpy(), out.numpy(), rtol=1e-5, atol=atol)
    np.testing.assert_allclose(pp_add_out.numpy(), add_out.numpy(), rtol=1e-5, atol=atol)
    # check grad
    np.testing.assert_allclose(pp_x_grad.numpy(), x_grad.numpy(), rtol=1e-5, atol=atol)
    np.testing.assert_allclose(pp_residual_grad.numpy(), residual_grad.numpy(), rtol=1e-5, atol=atol)
    np.testing.assert_allclose(pp_ln_scale_grad.numpy(), ln_scale_grad.numpy(), rtol=1e-5, atol=atol)
    np.testing.assert_allclose(pp_ln_bias_grad.numpy(), ln_bias_grad.numpy(), rtol=1e-5, atol=atol)


def generate_input_data(m, dtype=np.float16):
    data = np.random.random((m)).astype(dtype)
    for i in range(m):
        if i % 2 == 0:
            data[i] = 0.25
        elif i % 3 == 0:
            data[i] = 0.5
        else:
            data[i] = 0.0625
    return data


def generate_fixed_input(x_m, in_feature, dtype):
    x = generate_input_data(x_m * in_feature, dtype)
    x = x.reshape(x_m, in_feature)
    residual = generate_input_data(x_m * in_feature, dtype)
    residual = residual.reshape(x_m, in_feature)
    ln_scale = generate_input_data(in_feature, dtype)
    ln_bias = generate_input_data(in_feature, dtype)
    grad_out = generate_input_data(x_m * in_feature, dtype)
    grad_out = grad_out.reshape(x_m, in_feature)
    return x, residual, ln_scale, ln_bias, grad_out


def generate_ones_input(x_m, in_feature, dtype):
    x = np.ones(x_m * in_feature).astype(dtype)
    x = x.reshape(x_m, in_feature)
    residual = np.ones(x_m * in_feature).astype(dtype)
    residual = residual.reshape(x_m, in_feature)
    ln_scale = np.ones(in_feature).astype(dtype)
    ln_bias = np.ones(in_feature).astype(dtype)
    grad_out = np.ones(x_m * in_feature).astype(dtype)
    grad_out = grad_out.reshape(x_m, in_feature)
    return x, residual, ln_scale, ln_bias, grad_out


def test_driver(i=0, x_m=56, in_feature=4, epsilon=1e-5, dropout_rate=0, dtype=np.float16, atol=1e-2):
    for j in range(iters):
        if i == 0:
            x, residual, ln_scale, ln_bias, grad_out = generate_ones_input(x_m, in_feature, dtype)
        elif i == 1:
            x, residual, ln_scale, ln_bias, grad_out = generate_fixed_input(x_m, in_feature, dtype)
        else:
            x = np.random.random((x_m, in_feature)).astype(dtype)
            residual = np.random.random((x_m, in_feature)).astype(dtype)
            ln_scale = np.random.random((in_feature)).astype(dtype)
            ln_bias = np.random.random((in_feature)).astype(dtype)
            grad_out = np.random.random((x_m, in_feature)).astype(dtype)
        test_fused_dropout_op(x, residual, ln_scale, ln_bias, epsilon, dropout_rate, grad_out, atol)


## Note: mlperf config: x_m is xx to 28672, in_feature is 1024.
## Note: only dropout_rate=0 is tested.
for i in range(3):
    print("Begin Test ", i)
    atol = 1e-5
    if i == 0 or i == 1:
        fp16_atol = 1e-5
    else:
        fp16_atol = 1e-2
        atol = 1e-3
    # fp16
    test_driver(i, x_m=56, in_feature=1024, epsilon=1e-5, dropout_rate=0, dtype=np.float16, atol=fp16_atol)
    test_driver(i, x_m=28672, in_feature=1024, epsilon=1e-5, dropout_rate=0, dtype=np.float16, atol=fp16_atol)
    # fp32
    test_driver(i, x_m=28672, in_feature=1024, epsilon=1e-5, dropout_rate=0, dtype=np.float32, atol=atol)
external_ops/fused_dropout_residual_ln/test_fused_dropout_perf.py
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from custom_setup_ops import custom_fused_dropout_residual_ln
import numpy as np
import time

iters = 100
place = paddle.CUDAPlace(0)


def test_fused_dropout_op(x, residual, ln_scale, ln_bias, epsilon, dropout_rate, grad_out, atol):
    # layer_norm(dropout(x) + residual)
    def run_paddle_op(x, residual, ln_scale, ln_bias, epsilon, dropout_rate, grad_out):
        pp_x = paddle.to_tensor(x, stop_gradient=False)
        pp_residual = paddle.to_tensor(residual, stop_gradient=False)
        pp_ln_scale = paddle.to_tensor(ln_scale, stop_gradient=False)
        pp_ln_bias = paddle.to_tensor(ln_bias, stop_gradient=False)
        pp_grad_out = paddle.to_tensor(grad_out)
        # warmup
        if dropout_rate > 0:
            pp_out = paddle.nn.functional.dropout(pp_x, dropout_rate)
            pp_add_out = paddle.add(pp_out, pp_residual)
        else:
            pp_add_out = paddle.add(pp_x, pp_residual)
        pp_out = paddle.nn.functional.layer_norm(
            pp_add_out, pp_add_out.shape[1:], pp_ln_scale, pp_ln_bias, epsilon)
        paddle.autograd.backward([pp_out], [pp_grad_out], retain_graph=True)

        paddle.device.cuda.synchronize(place)
        t1 = time.time()
        for i in range(iters):
            if dropout_rate > 0:
                pp_out = paddle.nn.functional.dropout(pp_x, dropout_rate)
                pp_add_out = paddle.add(pp_out, pp_residual)
            else:
                pp_add_out = paddle.add(pp_x, pp_residual)
            pp_out = paddle.nn.functional.layer_norm(
                pp_add_out, pp_add_out.shape[1:], pp_ln_scale, pp_ln_bias, epsilon)
            paddle.autograd.backward([pp_out], [pp_grad_out], retain_graph=True)
        paddle.device.cuda.synchronize(place)
        t2 = time.time()
        print("paddle dropout+add+ln time is: (ms)", (t2 - t1) * 1000)
        return pp_out, pp_add_out, pp_x.grad, pp_residual.grad, pp_ln_scale.grad, pp_ln_bias.grad

    def run_custom_fuse_dropout_op(x, residual, ln_scale, ln_bias, epsilon, dropout_rate, grad_out):
        x_tensor = paddle.to_tensor(x, stop_gradient=False)
        residual_tensor = paddle.to_tensor(residual, stop_gradient=False)
        ln_scale_tensor = paddle.to_tensor(ln_scale, stop_gradient=False)
        ln_bias_tensor = paddle.to_tensor(ln_bias, stop_gradient=False)
        grad_out_tensor = paddle.to_tensor(grad_out)
        # Note: use the default config of dropout.
        is_test = False
        fix_seed = True
        is_upscale_in_train = True
        seed_val = 0
        # warmup
        out_tensor, dropout_mask, ln_mean, ln_var, dropout_residual_out = custom_fused_dropout_residual_ln(
            x_tensor, residual_tensor, ln_scale_tensor, ln_bias_tensor, epsilon,
            is_test, fix_seed, seed_val, is_upscale_in_train, dropout_rate)
        paddle.autograd.backward([out_tensor], [grad_out_tensor], retain_graph=True)

        paddle.device.cuda.synchronize(place)
        t3 = time.time()
        for i in range(iters):
            out_tensor, dropout_mask, ln_mean, ln_var, dropout_residual_out = custom_fused_dropout_residual_ln(
                x_tensor, residual_tensor, ln_scale_tensor, ln_bias_tensor, epsilon,
                is_test, fix_seed, seed_val, is_upscale_in_train, dropout_rate)
            paddle.autograd.backward([out_tensor], [grad_out_tensor], retain_graph=True)
        paddle.device.cuda.synchronize(place)
        t4 = time.time()
        print("fused_dropout_residual_ln op time is: (ms)", (t4 - t3) * 1000)
        return out_tensor, dropout_residual_out, x_tensor.grad, residual_tensor.grad, ln_scale_tensor.grad, ln_bias_tensor.grad

    pp_out, pp_add_out, pp_x_grad, pp_residual_grad, pp_ln_scale_grad, pp_ln_bias_grad = run_paddle_op(
        x, residual, ln_scale, ln_bias, epsilon, dropout_rate, grad_out)
    out, add_out, x_grad, residual_grad, ln_scale_grad, ln_bias_grad = run_custom_fuse_dropout_op(
        x, residual, ln_scale, ln_bias, epsilon, dropout_rate, grad_out)


def generate_input_data(m, dtype=np.float16):
    data = np.random.random((m)).astype(dtype)
    for i in range(m):
        if i % 2 == 0:
            data[i] = 0.25
        elif i % 3 == 0:
            data[i] = 0.5
        else:
            data[i] = 0.0625
    return data


def generate_fixed_input(x_m, in_feature, dtype):
    x = generate_input_data(x_m * in_feature, dtype)
    x = x.reshape(x_m, in_feature)
    residual = generate_input_data(x_m * in_feature, dtype)
    residual = residual.reshape(x_m, in_feature)
    ln_scale = generate_input_data(in_feature, dtype)
    ln_bias = generate_input_data(in_feature, dtype)
    grad_out = generate_input_data(x_m * in_feature, dtype)
    grad_out = grad_out.reshape(x_m, in_feature)
    return x, residual, ln_scale, ln_bias, grad_out


def generate_ones_input(x_m, in_feature, dtype):
    x = np.ones(x_m * in_feature).astype(dtype)
    x = x.reshape(x_m, in_feature)
    residual = np.ones(x_m * in_feature).astype(dtype)
    residual = residual.reshape(x_m, in_feature)
    ln_scale = np.ones(in_feature).astype(dtype)
    ln_bias = np.ones(in_feature).astype(dtype)
    grad_out = np.ones(x_m * in_feature).astype(dtype)
    grad_out = grad_out.reshape(x_m, in_feature)
    return x, residual, ln_scale, ln_bias, grad_out


def test_driver(i=0, x_m=56, in_feature=4, epsilon=1e-5, dropout_rate=0, dtype=np.float16, atol=1e-2):
    print("nrows, ncols, dtype = ", x_m, in_feature, dtype)
    if i == 0:
        x, residual, ln_scale, ln_bias, grad_out = generate_ones_input(x_m, in_feature, dtype)
    elif i == 1:
        x, residual, ln_scale, ln_bias, grad_out = generate_fixed_input(x_m, in_feature, dtype)
    else:
        x = np.random.random((x_m, in_feature)).astype(dtype)
        residual = np.random.random((x_m, in_feature)).astype(dtype)
        ln_scale = np.random.random((in_feature)).astype(dtype)
        ln_bias = np.random.random((in_feature)).astype(dtype)
        grad_out = np.random.random((x_m, in_feature)).astype(dtype)
    test_fused_dropout_op(x, residual, ln_scale, ln_bias, epsilon, dropout_rate, grad_out, atol)


## Note: mlperf config: x_m is xx to 28672, in_feature is 1024.
## Note: this perf test runs with dropout_rate=0.1; accuracy is not checked here.
i = 0
atol = 1e-5
fp16_atol = 1e-5
# fp16
print("***************fp16 performance: ")
x_m_values = [56, 200, 500, 1000, 2000, 4000, 8000, 16000, 18000, 20000, 25000, 28000]
for x_m in x_m_values:
    test_driver(i, x_m=x_m, in_feature=1024, epsilon=1e-5, dropout_rate=0.1, dtype=np.float16, atol=fp16_atol)
# fp32
print("**************fp32 performance: ")
for x_m in x_m_values:
    test_driver(i, x_m=x_m, in_feature=1024, epsilon=1e-5, dropout_rate=0.1, dtype=np.float32, atol=atol)
external_ops/lr_op/lr_op_cuda.cc
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "paddle/extension.h"
std::vector<std::vector<int64_t>> LrInferShape(std::vector<int64_t> x_shape) {
  return {x_shape};
}

// input: [1], int64
// output: [1], float
std::vector<paddle::DataType> LrInferDtype(paddle::DataType x_dtype) {
  return {paddle::DataType::FLOAT32};
}

PD_BUILD_OP(custom_lr)
    .Inputs({"X"})
    .Outputs({"Out"})
    .Attrs({"base_lr: float",
            "end_lr: float",
            "degree: float",
            "warmup_step: int64_t",
            "start_warmup_step: int64_t",
            "max_step: int64_t"})
    .SetInferShapeFn(PD_INFER_SHAPE(LrInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(LrInferDtype));
external_ops/lr_op/lr_op_cuda.cu
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/extension.h"
#include "paddle/fluid/framework/custom_raw_op_kernel_func.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
// x_data[0]: current step which is numbered from 0.
// Note: when computing, we should use x_data[0] + 1.
// y_data[0]: the lr var of this step
__global__ void compute_lr_fwd_kernel(const int64_t* x_data, float* y_data,
                                      float base_lr, float end_lr, float degree,
                                      int64_t start_warmup_step,
                                      int64_t warmup_step, int64_t max_step) {
  int64_t step = x_data[0] + 1;
  int64_t offset_step = (start_warmup_step == 0 ? 1 : 0);
  int64_t mod_step = step - offset_step - start_warmup_step;
  double y;
  if (mod_step < warmup_step) {
    auto p = mod_step / (warmup_step + 1e-6);
    y = base_lr * p;
  } else {
    auto p = (step - offset_step) / static_cast<double>(max_step);
    p = (p >= 1 ? 0 : (::pow(1 - p, degree)));
    y = (base_lr - end_lr) * p + end_lr;
  }
  y_data[0] = static_cast<float>(y);
  // y_data[0] = base_lr * (static_cast<float>(max_step - x_data[0]) /
  // max_step);
}

__PD_DEFINE_RAW_OP_KERNEL_FUNC(custom_lr, ctx) {
  namespace f = paddle::framework;
  const auto* x = ctx.Input<f::Tensor>("X");
  auto* out = ctx.Output<f::Tensor>("Out");
  auto& dev_ctx = ctx.cuda_device_context();
  auto place = dev_ctx.GetPlace();
  auto stream = dev_ctx.stream();
  float base_lr = ctx.Attr<float>("base_lr");
  float end_lr = ctx.Attr<float>("end_lr");
  float degree = ctx.Attr<float>("degree");
  int64_t start_warmup_step = ctx.Attr<int64_t>("start_warmup_step");
  int64_t warmup_step = ctx.Attr<int64_t>("warmup_step");
  int64_t max_step = ctx.Attr<int64_t>("max_step");
  const auto& x_dims = x->dims();
  if (x_dims.size() != 1 || x_dims[0] != 1) {
    PD_THROW("The shape of input x must be [1].");
  }
  auto out_dims = x_dims;
  out->Resize(out_dims);
  const int64_t* x_data = x->data<int64_t>();
  float* out_data = out->mutable_data<float>(x->place());
  compute_lr_fwd_kernel<<<1, 1, 0, stream>>>(x_data, out_data, base_lr, end_lr,
                                             degree, start_warmup_step,
                                             warmup_step, max_step);
}
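For reference, here is a minimal host-side Python sketch of the schedule the kernel computes: linear warmup toward base_lr, then polynomial decay down to end_lr. The function name `poly_warmup_lr` is illustrative only, not part of the op's API:
```python
# Host-side sketch of the schedule computed by compute_lr_fwd_kernel.
# `poly_warmup_lr` is an illustrative name, not part of the op's API.
def poly_warmup_lr(cur_step, base_lr, end_lr, degree,
                   start_warmup_step, warmup_step, max_step):
    step = cur_step + 1  # steps are numbered from 0; the kernel adds 1
    offset_step = 1 if start_warmup_step == 0 else 0
    mod_step = step - offset_step - start_warmup_step
    if mod_step < warmup_step:
        # linear warmup toward base_lr
        return base_lr * (mod_step / (warmup_step + 1e-6))
    p = (step - offset_step) / float(max_step)
    p = 0.0 if p >= 1 else (1.0 - p) ** degree
    # polynomial decay from base_lr down to end_lr
    return (base_lr - end_lr) * p + end_lr
```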
external_ops/setup.py
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import paddle.fluid.core as core
from paddle.utils.cpp_extension import CUDAExtension, setup

# Bug fix: os.environ.get() was being passed the build path as the variable
# *name*, so it always returned the default. The env var name PADDLE_BUILD_DIR
# below is an assumed placeholder; the original path is kept as the default.
compile_dir = os.environ.get('PADDLE_BUILD_DIR',
                             '/public/home/zhangqha/for_baidu/Paddle-develop/build')

define_macros = []
if core.is_compiled_with_mkldnn():
    define_macros.append(('PADDLE_WITH_MKLDNN', None))
if core.is_compiled_with_nccl():
    define_macros.append(('PADDLE_WITH_NCCL', None))
define_macros.append(('PADDLE_WITH_HIP', None))
define_macros.append(('PADDLE_WITH_RCCL', None))

setup(
    name='custom_setup_ops',
    ext_modules=CUDAExtension(
        sources=[
            #'./fused_dense_op/fused_dense_cuda.cc',
            #'./fused_dense_op/fused_dense_cuda.cu',
            ###'./fused_dropout_residual_ln/fused_dropout_residual_ln_cuda.cc',
            ###'./fused_dropout_residual_ln/fused_dropout_residual_ln_cuda.cu',
            #'./fmhalib/fmha_cuda.cc',
            #'./fmhalib/fmha_cuda.cu',
            './sort_bert_inputs_across_devices/sort_bert_inputs_across_devices.cc',
            './sort_bert_inputs_across_devices/sort_bert_inputs_across_devices.cu',
            './lr_op/lr_op_cuda.cc',
            './lr_op/lr_op_cuda.cu',
            './acc_merge/acc_merge.cc',
            './acc_merge/acc_merge.cu',
        ],
        # extra_objects=[os.path.join(apex_lib_dir, 'libfmha.so')],
        # include_dirs=[apex_dir],
        # library_dirs=[apex_lib_dir],
        # extra_link_args=['-lfmha', '-ldl', '-lcublas'],
        _compile_dir=compile_dir,
        define_macros=define_macros))
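Assuming a Paddle build with custom-op support at `compile_dir`, the extensions are typically built and installed with `python setup.py install`; afterwards the generated module is imported the way the tests above do:
```python
# Hypothetical usage after `python setup.py install`; only the ops listed in
# `sources` above (e.g. custom_lr) are actually compiled by this setup.py.
from custom_setup_ops import custom_lr
```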