#!/bin/bash
################################################################################
# Copyright 2016-2021 by SW Group, Chengdu Hygon IC Design Co., Ltd.
# All rights reserved. See COPYRIGHT for detailed information.
#
# @file set_env.sh
# @brief set env variables for running test.
#
# @author wangmingliang <wangmingliang@hygon.cn>
# @date 2022/03/23
# @history 1.0
################################################################################
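# Usage: source this script before building or running the tests, e.g.
#   source ./set_env.sh
# Note: this assumes the DTK toolkit is installed at /opt/dtk-21.04; adjust
# HYGON_ROCM_INSTALL below if your installation path differs.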
export HYGON_ROCM_INSTALL=/opt/dtk-21.04
# library path
# strip a leading or trailing colon from LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$(echo ${LD_LIBRARY_PATH} | sed 's/:$//; s/^://;')
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/lib:${HYGON_ROCM_INSTALL}/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/hip/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/hipblas/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/hipcub/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/hipfft/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/hiprand/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/hipsolver/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/hipsparse/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/hsa/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/llvm/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/miopen/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/miopengemm/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/oam/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/opencl/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rccl/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocalution/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocblas/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocclr/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocfft/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocm_smi/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocprim/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocprofiler/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocrand/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocsolver/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocsparse/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocthrust/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/roctracer/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export C_INCLUDE_PATH=${HYGON_ROCM_INSTALL}/rocrand/include:$C_INCLUDE_PATH
export CPLUS_INCLUDE_PATH=${HYGON_ROCM_INSTALL}/rocrand/include:$CPLUS_INCLUDE_PATH
export C_INCLUDE_PATH=${HYGON_ROCM_INSTALL}/hiprand/include:$C_INCLUDE_PATH
export CPLUS_INCLUDE_PATH=${HYGON_ROCM_INSTALL}/hiprand/include:$CPLUS_INCLUDE_PATH
# executable path
# strip a leading or trailing colon from PATH
export PATH=$(echo ${PATH} | sed 's/:$//; s/^://;')
export PATH=${HYGON_ROCM_INSTALL}/bin${PATH:+:${PATH}}
export PATH=${HYGON_ROCM_INSTALL}/hip/bin${PATH:+:${PATH}}
export PATH=${HYGON_ROCM_INSTALL}/llvm/bin${PATH:+:${PATH}}
export PATH=${HYGON_ROCM_INSTALL}/llvm/lib/clang/13.0.0/bin${PATH:+:${PATH}}
export PATH=${HYGON_ROCM_INSTALL}/miopen/bin${PATH:+:${PATH}}
export PATH=${HYGON_ROCM_INSTALL}/opencl/bin${PATH:+:${PATH}}
export PATH=${HYGON_ROCM_INSTALL}/rocprofiler/bin${PATH:+:${PATH}}
export PATH=/public/home/zhangqha/bladisc/hmmer/bin${PATH:+:${PATH}}
export PATH=/public/home/zhangqha/bladisc/hh-suite-master/build/bin${PATH:+:${PATH}}
export PATH=/public/home/zhangqha/bladisc/kalign/build/bin${PATH:+:${PATH}}
# component path
export ROCM_PATH=${HYGON_ROCM_INSTALL}
export HSA_PATH=${HYGON_ROCM_INSTALL}/hsa
export HIP_PATH=${HYGON_ROCM_INSTALL}/hip
export HIP_ROCCLR_HOME=${HYGON_ROCM_INSTALL}/hip
export HIP_LIB_PATH=${HYGON_ROCM_INSTALL}/hip/lib
export DEVICE_LIB_PATH=${HYGON_ROCM_INSTALL}/amdgcn/bitcode
export HIP_CLANG_PATH=${HYGON_ROCM_INSTALL}/llvm/bin
export HIP_RUNTIME="rocclr"
export HIP_COMPILER="clang"
#export CPLUS_INCLUDE_PATH=/public/software/apps/DeepLearning/PyTorch_Lib/gflags-2.1.2-build/include:/public/home/zhangqha/openssl_install/include:/public/home/zhangqha/openssl_install/include/openssl:/public/software/apps/DeepLearning/PyTorch_Lib/glog-build/include:${HIP_PATH}/include:$CPLUS_INCLUDE_PATH
#export INCLUDE=/public/software/apps/DeepLearning/PyTorch_Lib/gflags-2.1.2-build/include:/public/home/zhangqha/openssl_install/include:/public/home/zhangqha/openssl_install/include/openssl:/public/software/apps/DeepLearning/PyTorch_Lib/glog-build/include:${HIP_PATH}/include:$INCLUDE
#export C_INCLUDE_PATH=/public/software/apps/DeepLearning/PyTorch_Lib/gflags-2.1.2-build/include:/public/home/zhangqha/openssl_install/include:/public/home/zhangqha/openssl_install/include/openssl:/public/software/apps/DeepLearning/PyTorch_Lib/glog-build/include:${HIP_PATH}/include:$C_INCLUDE_PATH
# preparation
Build fmhalib.so according to the following PR (a minimal call sketch follows the commands below):
https://github.com/sneaxiy/apex/pull/1
# build the fmha op
python setup.py install
# unittest (needs improvement)
python test_fmha.py
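A minimal forward-call sketch, assuming `custom_setup_ops` has been installed by the step above; the shapes are illustrative and the argument order mirrors `test_fmha.py`:
```python
import numpy as np
import paddle
from custom_setup_ops import custom_fmha

# Two sequences of 64 tokens each: cu_seqlen holds cumulative lengths.
cu_seqlen = paddle.to_tensor(np.array([0, 64, 128]), dtype='int32')
# qkv is packed as [total_tokens, 3, num_heads, head_size] in fp16.
qkv = paddle.to_tensor(
    np.random.random((128, 3, 16, 64)).astype(np.float16), stop_gradient=False)
# max_seq_len must stay on the CPU (see the CHECK_INPUT note in the op source).
max_seq_len_host = paddle.to_tensor([512], dtype='int32', place=paddle.CPUPlace())
ctx_out, s_out = custom_fmha(qkv, cu_seqlen, max_seq_len_host, True, 0.1, False)
```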
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "paddle/extension.h"
#define CHECK_INPUT(x) \
PD_CHECK(x.place().GetType() == phi::AllocationType::GPU, \
#x " must be a GPU Tensor.")
std::vector<paddle::Tensor> fmha_cuda_forward(const paddle::Tensor& qkv,
const paddle::Tensor& cu_seqlen,
const paddle::Tensor& host_seqlen,
bool is_test,
float dropout_rate,
bool zero_tensors,
bool use_fmha_mke_opt);
std::vector<paddle::Tensor> fmha_cuda_backward(
const paddle::Tensor& qkv,
const paddle::Tensor& cu_seqlen,
const paddle::Tensor& host_seqlen,
const paddle::Tensor& softmax_input,
const paddle::Tensor& d_ctx_out,
bool is_test,
float dropout_rate,
bool zero_tensors,
bool use_fmha_mke_opt);
/*
*x_shape(fp16) = [total_tokens, 3, num_heads, head_size]
*y_shape(int32) = [batch_size + 1]
*/
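// e.g. with the unittest config (num_heads=16, head_size=64, max_seq_len=512):
// x = [total_tokens, 3, 16, 64], y = [batch_size + 1]
// -> ctx_out = [total_tokens, 16, 64], s_out = [batch_size, 16, 512, 512]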
std::vector<std::vector<int64_t>> FmhaInferShape(
const std::vector<int64_t>& x_shape,
const std::vector<int64_t>& y_shape,
const std::vector<int64_t>& host_y_shape,
const bool& is_test,
const float& dropout_rate,
const bool& zero_tensors,
const bool& use_fmha_mke_opt) {
int total = x_shape[0];
int num_heads = x_shape[2];
int head_size = x_shape[3];
int batch_size = y_shape[0] - 1;
if (x_shape[1] != 3) {
PD_THROW(
"The shape for input QKV should be [total_tokens, 3, num_heas, "
"head_size].");
}
int max_seq_len = 512;
std::vector<int64_t> ctx_out_shape = {total, num_heads, head_size};
std::vector<int64_t> s_out_shape = {
batch_size, num_heads, max_seq_len, max_seq_len};
return {ctx_out_shape, s_out_shape};
}
std::vector<paddle::DataType> FmhaInferDtype(paddle::DataType x_dtype,
paddle::DataType y_dtype,
paddle::DataType host_y_dtype) {
return {x_dtype, x_dtype};
}
std::vector<paddle::Tensor> FmhaCUDAForward(const paddle::Tensor& qkv,
const paddle::Tensor& cu_seqlen,
const paddle::Tensor& host_seqlen,
bool is_test,
float dropout_rate,
bool zero_tensors,
bool use_fmha_mke_opt) {
CHECK_INPUT(qkv);
CHECK_INPUT(cu_seqlen);
// Note: do not use CHECK_INPUT on host_seqlen (max_seq_len_host),
// because that would enforce this input to be a GPU tensor.
return fmha_cuda_forward(qkv,
cu_seqlen,
host_seqlen,
is_test,
dropout_rate,
zero_tensors,
use_fmha_mke_opt);
}
std::vector<paddle::Tensor> FmhaCUDABackward(
const paddle::Tensor& qkv,
const paddle::Tensor& cu_seqlen,
const paddle::Tensor& host_seqlen,
const paddle::Tensor& softmax_input,
const paddle::Tensor& d_ctx_out,
bool is_test,
float dropout_rate,
bool zero_tensors,
bool use_fmha_mke_opt) {
CHECK_INPUT(qkv);
CHECK_INPUT(cu_seqlen);
CHECK_INPUT(softmax_input);
CHECK_INPUT(d_ctx_out);
return fmha_cuda_backward(qkv,
cu_seqlen,
host_seqlen,
softmax_input,
d_ctx_out,
is_test,
dropout_rate,
zero_tensors,
use_fmha_mke_opt);
}
PD_BUILD_OP(custom_fmha)
.Inputs({"QKV", "CuSeqLen", "HostSeqLen"})
.Outputs({"CtxOut", "SOut"})
.Attrs({"is_test: bool",
"dropout_rate: float",
"zero_tensors: bool",
"use_fmha_mke_opt: bool"})
.SetKernelFn(PD_KERNEL(FmhaCUDAForward))
.SetInferShapeFn(PD_INFER_SHAPE(FmhaInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(FmhaInferDtype));
PD_BUILD_GRAD_OP(custom_fmha)
.Inputs({"QKV", "CuSeqLen", "HostSeqLen", "SOut", paddle::Grad("CtxOut")})
.Outputs({paddle::Grad("QKV")})
.Attrs({"is_test: bool",
"dropout_rate: float",
"zero_tensors: bool",
"use_fmha_mke_opt: bool"})
.SetKernelFn(PD_KERNEL(FmhaCUDABackward));
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from custom_setup_ops import custom_fmha
import numpy as np
total = 2
batch_size = 56
num_heads = 16
head_size = 64
is_training = True
max_seq_len = 512
dropout_rate = 0.1
cu_seqlen = np.arange(batch_size + 1)
cu_seqlen = np.cumsum(cu_seqlen)
total = cu_seqlen[-1]
#print("cu_seqlen", cu_seqlen)
#print("cu_seqlen[-1]", cu_seqlen[-1])
cu_seqlen = paddle.to_tensor(cu_seqlen)
cu_seqlen = paddle.cast(cu_seqlen, 'int32')
qkv = np.random.random((total, 3, num_heads, head_size)).astype(np.float16)
#print("qkv:", qkv)
qkv = paddle.to_tensor(qkv, stop_gradient=False)
max_seq_len_host = [max_seq_len]
max_seq_len_host = paddle.to_tensor(
max_seq_len_host, dtype='int32', place=paddle.CPUPlace())
ctx_out, s_out = custom_fmha(qkv, cu_seqlen, max_seq_len_host, is_training,
dropout_rate, False)
print("print ctx_out and s_out: ")
print(ctx_out)
print(s_out)
# backward.
print("print qkv.grad: ")
grad_ctx_dout = np.random.random(
(total, num_heads, head_size)).astype(np.float16)
grad_ctx_dout = paddle.to_tensor(grad_ctx_dout)
paddle.autograd.backward([ctx_out], [grad_ctx_dout], retain_graph=True)
print(qkv.grad)
# Function
Support gemm_nn/nt + bias with float16, float32, and float64 data types (a usage sketch follows the commands below).
# Correctness test
```
python test_fused_dense_op.py
```
# Performance test
```
python test_fused_dense_perf.py
```
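A minimal usage sketch, assuming `custom_setup_ops` has been built and installed; the shapes are illustrative and the call signature mirrors `test_fused_dense_op.py`:
```python
import numpy as np
import paddle
from custom_setup_ops import custom_fused_dense

x = paddle.to_tensor(
    np.random.random((56, 1024)).astype(np.float16), stop_gradient=False)
# transy=True, so the weight is stored as [out_feature, in_feature].
weight = paddle.to_tensor(
    np.random.random((4096, 1024)).astype(np.float16), stop_gradient=False)
bias = paddle.to_tensor(
    np.random.random((4096,)).astype(np.float16), stop_gradient=False)
# out = x @ weight.T + bias, shape [56, 4096]
out = custom_fused_dense(x, weight, bias, False, True)  # transx=False, transy=True
```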
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "paddle/extension.h"
// @x: [x, in_feature] or [xx, xx, in_feature]
// @y: [out_feature, in_feature]
// @out: [x, out_feature] or [xx, xx, out_feature]
// support transx=false, transy=true/false.
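// e.g. x: [56, 1024], y: [4096, 1024] with transy=true -> out: [56, 4096]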
std::vector<std::vector<int64_t>> FusedDenseInferShape(
const std::vector<int64_t>& x_shape,
const std::vector<int64_t>& y_shape,
const std::vector<int64_t>& bias_shape,
const bool& transx,
const bool& transy,
const bool& use_addto) {
int x_size = x_shape.size();
int x_m = 1;
for (int i = 0; i < (x_size - 1); i++) {
x_m *= x_shape[i];
}
int x_k = x_shape[x_size - 1];
int y_k = y_shape[0];
int y_n = y_shape[1];
if (transy) {
y_k = y_shape[1];
y_n = y_shape[0];
}
if (x_k != y_k) {
PD_THROW("The reudce dim of A and B in matmul is not equal.");
}
if (transx) {
PD_THROW("Only support cases: transx is False, transy are True/False.");
}
std::vector<int64_t> out_shape(x_shape);
out_shape[x_size - 1] = y_n;
return {out_shape};
}
std::vector<paddle::DataType> FusedDenseInferDtype(
paddle::DataType x_dtype,
paddle::DataType y_dtype,
paddle::DataType bias_dtype) {
return {x_dtype};
}
PD_BUILD_OP(custom_fused_dense)
.Inputs({"X", "Y", "Bias"})
.Outputs({"Out"})
.Attrs({"transx: bool", "transy: bool", "use_addto: bool"})
.SetInferShapeFn(PD_INFER_SHAPE(FusedDenseInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(FusedDenseInferDtype));
PD_BUILD_GRAD_OP(custom_fused_dense)
.Inputs({"X", "Y", "Bias", paddle::Grad("Out")})
.Outputs({paddle::Grad("X"), paddle::Grad("Y"), paddle::Grad("Bias")})
.Attrs({"transx: bool", "transy: bool"});
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#define CUBLAS_VERSION 13000
/* Includes, cuda */
#include <cublas_v2.h>
#include <cuda_runtime.h>
#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11000
// includes cublaslt
#include <cublasLt.h>
#endif
#include "paddle/extension.h"
#include "paddle/fluid/framework/custom_raw_op_kernel_func.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#define CHECK_CUBLAS_ERR(error_code) \
do { \
if (error_code != CUBLAS_STATUS_SUCCESS) { \
PD_THROW("cublas error code is ", error_code); \
} \
} while (0)
// TODO: the workspace is 4M elements; since it is allocated as a tensor of
// type T, the actual allocation is 4M * sizeof(T) bytes.
constexpr auto kWorkspaceSize = (1 << 22);
// FP64 wrapper around cublasGemmEx.
// TODO(limin): for FP64 compute, alpha and beta should in fact be double.
cublasStatus_t gemm_bias(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const float* alpha,
const double* A,
int lda,
const double* B,
int ldb,
const float* beta,
double* C,
int ldc) {
return cublasGemmEx(handle,
transa,
transb,
m,
n,
k,
alpha,
A,
CUDA_R_64F,
lda,
B,
CUDA_R_64F,
ldb,
beta,
C,
CUDA_R_64F,
ldc,
CUDA_R_64F,
CUBLAS_GEMM_DEFAULT);
}
// FP32 Wrapper around cublas GEMMEx
cublasStatus_t gemm_bias(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const float* alpha,
const float* A,
int lda,
const float* B,
int ldb,
const float* beta,
float* C,
int ldc) {
return cublasGemmEx(handle,
transa,
transb,
m,
n,
k,
alpha,
A,
CUDA_R_32F,
lda,
B,
CUDA_R_32F,
ldb,
beta,
C,
CUDA_R_32F,
ldc,
CUDA_R_32F,
CUBLAS_GEMM_DEFAULT);
}
// FP16 Tensor core wrapper around cublas GEMMEx
cublasStatus_t gemm_bias(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const float* alpha,
const paddle::float16* A,
int lda,
const paddle::float16* B,
int ldb,
const float* beta,
paddle::float16* C,
int ldc) {
return cublasGemmEx(handle,
transa,
transb,
m,
n,
k,
alpha,
A,
CUDA_R_16F,
lda,
B,
CUDA_R_16F,
ldb,
beta,
C,
CUDA_R_16F,
ldc,
CUDA_R_32F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP);
}
#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11600
// float16 and float32
template <typename T>
cublasStatus_t cublaslt_matmul_desc_init(
cublasLtMatmulDescOpaque_t* operationDesc) {
cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
status =
cublasLtMatmulDescInit(operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
return status;
}
// float64
template <>
cublasStatus_t cublaslt_matmul_desc_init<double>(
cublasLtMatmulDescOpaque_t* operationDesc) {
cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
status =
cublasLtMatmulDescInit(operationDesc, CUBLAS_COMPUTE_64F, CUDA_R_64F);
return status;
}
// float16 (primary template; float32/float64 specializations follow)
template <typename T>
cublasStatus_t set_cublaslt_matrix_layout_init(
cublasLtMatrixLayoutOpaque_t* Adesc,
cublasLtMatrixLayoutOpaque_t* Bdesc,
cublasLtMatrixLayoutOpaque_t* Cdesc,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
int lda,
int ldb,
int ldc) {
cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
status = cublasLtMatrixLayoutInit(Adesc,
CUDA_R_16F,
transa == CUBLAS_OP_N ? m : k,
transa == CUBLAS_OP_N ? k : m,
lda);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
status = cublasLtMatrixLayoutInit(Bdesc,
CUDA_R_16F,
transb == CUBLAS_OP_N ? k : n,
transb == CUBLAS_OP_N ? n : k,
ldb);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
status = cublasLtMatrixLayoutInit(Cdesc, CUDA_R_16F, m, n, ldc);
CLEANUP:
return status;
}
template <>
cublasStatus_t set_cublaslt_matrix_layout_init<float>(
cublasLtMatrixLayoutOpaque_t* Adesc,
cublasLtMatrixLayoutOpaque_t* Bdesc,
cublasLtMatrixLayoutOpaque_t* Cdesc,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
int lda,
int ldb,
int ldc) {
cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
status = cublasLtMatrixLayoutInit(Adesc,
CUDA_R_32F,
transa == CUBLAS_OP_N ? m : k,
transa == CUBLAS_OP_N ? k : m,
lda);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
status = cublasLtMatrixLayoutInit(Bdesc,
CUDA_R_32F,
transb == CUBLAS_OP_N ? k : n,
transb == CUBLAS_OP_N ? n : k,
ldb);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
status = cublasLtMatrixLayoutInit(Cdesc, CUDA_R_32F, m, n, ldc);
CLEANUP:
return status;
}
template <>
cublasStatus_t set_cublaslt_matrix_layout_init<double>(
cublasLtMatrixLayoutOpaque_t* Adesc,
cublasLtMatrixLayoutOpaque_t* Bdesc,
cublasLtMatrixLayoutOpaque_t* Cdesc,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
int lda,
int ldb,
int ldc) {
cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
status = cublasLtMatrixLayoutInit(Adesc,
CUDA_R_64F,
transa == CUBLAS_OP_N ? m : k,
transa == CUBLAS_OP_N ? k : m,
lda);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
status = cublasLtMatrixLayoutInit(Bdesc,
CUDA_R_64F,
transb == CUBLAS_OP_N ? k : n,
transb == CUBLAS_OP_N ? n : k,
ldb);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
status = cublasLtMatrixLayoutInit(Cdesc, CUDA_R_64F, m, n, ldc);
CLEANUP:
return status;
}
#endif
#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11000
template <typename T>
int gemm_bias_lt(cublasLtHandle_t ltHandle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const float* alpha, /* host pointer */
const T* A,
int lda,
const T* B,
int ldb,
const float* beta, /* host pointer */
T* C,
int ldc,
void* workspace,
size_t workspaceSize,
cudaStream_t stream,
bool use_bias,
const void* bias) {
cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
cublasLtMatmulDescOpaque_t operationDesc = {};
cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
cublasLtMatmulPreferenceOpaque_t preference = {};
int returnedResults = 0;
cublasLtMatmulHeuristicResult_t heuristicResult = {};
cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DEFAULT;
// Create operation descriptor; see cublasLtMatmulDescAttributes_t
// for details about defaults; here we just set the transforms for
// A and B.
status = cublaslt_matmul_desc_init<T>(&operationDesc);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
status = cublasLtMatmulDescSetAttribute(
&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
status = cublasLtMatmulDescSetAttribute(
&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transa));
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
if (use_bias) {
status = cublasLtMatmulDescSetAttribute(
&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(bias));
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
epilogue = CUBLASLT_EPILOGUE_BIAS;
}
status = cublasLtMatmulDescSetAttribute(&operationDesc,
CUBLASLT_MATMUL_DESC_EPILOGUE,
&epilogue,
sizeof(epilogue));
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
// Create matrix descriptors. Not setting any extra attributes.
status = set_cublaslt_matrix_layout_init<T>(
&Adesc, &Bdesc, &Cdesc, transa, transb, m, n, k, lda, ldb, ldc);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
// Create preference handle; In general, extra attributes can be
// used here to disable tensor ops or to make sure algo selected
// will work with badly aligned A, B, C. However, for simplicity
// here we assume A,B,C are always well aligned (e.g., directly
// come from cudaMalloc)
status = cublasLtMatmulPreferenceInit(&preference);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
status = cublasLtMatmulPreferenceSetAttribute(
&preference,
CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
&workspaceSize,
sizeof(workspaceSize));
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
// We just need the best available heuristic to try and run matmul.
// There is no guarantee that this will work. For example, if A is
// badly aligned, you can request more (e.g. 32) algos and try to
// run them one by one until something works.
status = cublasLtMatmulAlgoGetHeuristic(ltHandle,
&operationDesc,
&Adesc,
&Bdesc,
&Cdesc,
&Cdesc,
&preference,
1,
&heuristicResult,
&returnedResults);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
if (returnedResults == 0) {
status = CUBLAS_STATUS_NOT_SUPPORTED;
goto CLEANUP;
}
status = cublasLtMatmul(ltHandle,
&operationDesc,
alpha,
A,
&Adesc,
B,
&Bdesc,
beta,
C,
&Cdesc,
reinterpret_cast<void*>(C),
&Cdesc,
// &heuristicResult.algo,
NULL,
workspace,
workspaceSize,
stream);
CLEANUP:
// Descriptors are no longer needed as all GPU work was already
// enqueued.
PADDLE_ENFORCE_GPU_SUCCESS(status);
return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}
#endif
template <typename T>
static int linear_bias_cuda_forward_impl(
const paddle::platform::CUDADeviceContext& dev_ctx,
const T* input_data,
const T* weight_data,
const T* bias_data,
bool transx,
bool transy,
int in_features,
int batch_size,
int out_features,
T* output_data,
void* lt_workspace) {
auto handle = dev_ctx.cublas_handle();
auto stream = dev_ctx.stream();
const float alpha = 1.0;
const float beta_zero = 0.0;
const float beta_one = 1.0;
int status = 1;
// nt
cublasOperation_t transpose_x = CUBLAS_OP_T;
cublasOperation_t transpose_y = CUBLAS_OP_N;
if (transy) {
#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11000
status = gemm_bias_lt((cublasLtHandle_t)handle,
transpose_x,
transpose_y,
out_features,
batch_size,
in_features,
&alpha, /* host pointer */
weight_data,
in_features,
input_data,
in_features,
&beta_zero, /* host pointer */
output_data,
out_features,
lt_workspace,
kWorkspaceSize,
stream,
true,
bias_data);
#endif
if (status != 0) {
PD_THROW("cublaslt gemm_bias failed with error code ", status);
#if 0
output.copy_(bias);
status = gemm_bias(
handle,
transpose_x,
transpose_y,
out_features,
batch_size,
in_features,
&alpha,
weight,
in_features,
input_data,
in_features,
&beta_one,
output_data,
out_features);
#endif
}
} else {
// nn
transpose_x = CUBLAS_OP_N;
transpose_y = CUBLAS_OP_N;
#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11000
status = gemm_bias_lt((cublasLtHandle_t)handle,
transpose_x,
transpose_y,
out_features,
batch_size,
in_features,
&alpha, /* host pointer */
weight_data,
out_features,
input_data,
in_features,
&beta_zero, /* host pointer */
output_data,
out_features,
lt_workspace,
kWorkspaceSize,
stream,
true,
bias_data);
#endif
if (status != 0) {
PD_THROW("cublaslt gemm_bias failed with error code ", status);
}
}
return status;
}
template <typename T>
int gemm_bgradb_lt(cublasLtHandle_t ltHandle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const float* alpha, /* host pointer */
const T* A,
int lda,
const T* B,
int ldb,
const float* beta, /* host pointer */
T* C,
int ldc,
void* workspace,
size_t workspaceSize,
cudaStream_t stream,
bool use_bias,
const void* bgrad,
cublasLtEpilogue_t epilogue) {
cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
cublasLtMatmulDescOpaque_t operationDesc = {};
cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
cublasLtMatmulPreferenceOpaque_t preference = {};
int returnedResults = 0;
cublasLtMatmulHeuristicResult_t heuristicResult = {};
// cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DEFAULT;
// Create operation descriptor; see cublasLtMatmulDescAttributes_t
// for details about defaults; here we just set the transforms for
// A and B.
status = cublaslt_matmul_desc_init<T>(&operationDesc);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
status = cublasLtMatmulDescSetAttribute(
&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
status = cublasLtMatmulDescSetAttribute(
&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transa));
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
if (use_bias) {
status = cublasLtMatmulDescSetAttribute(&operationDesc,
CUBLASLT_MATMUL_DESC_BIAS_POINTER,
&bgrad,
sizeof(bgrad));
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
}
status = cublasLtMatmulDescSetAttribute(&operationDesc,
CUBLASLT_MATMUL_DESC_EPILOGUE,
&epilogue,
sizeof(epilogue));
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
// Create matrix descriptors. Not setting any extra attributes.
status = set_cublaslt_matrix_layout_init<T>(
&Adesc, &Bdesc, &Cdesc, transa, transb, m, n, k, lda, ldb, ldc);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
// Create preference handle; In general, extra attributes can be
// used here to disable tensor ops or to make sure algo selected
// will work with badly aligned A, B, C. However, for simplicity
// here we assume A,B,C are always well aligned (e.g., directly
// come from cudaMalloc)
status = cublasLtMatmulPreferenceInit(&preference);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
status = cublasLtMatmulPreferenceSetAttribute(
&preference,
CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
&workspaceSize,
sizeof(workspaceSize));
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
// We just need the best available heuristic to try and run matmul.
// There is no guarantee that this will work. For example, if A is
// badly aligned, you can request more (e.g. 32) algos and try to
// run them one by one until something works.
status = cublasLtMatmulAlgoGetHeuristic(ltHandle,
&operationDesc,
&Adesc,
&Bdesc,
&Cdesc,
&Cdesc,
&preference,
1,
&heuristicResult,
&returnedResults);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
if (returnedResults == 0) {
status = CUBLAS_STATUS_NOT_SUPPORTED;
goto CLEANUP;
}
status = cublasLtMatmul(ltHandle,
&operationDesc,
alpha,
A,
&Adesc,
B,
&Bdesc,
beta,
C,
&Cdesc,
C,
&Cdesc,
// &heuristicResult.algo,
NULL,
workspace,
workspaceSize,
stream);
CLEANUP:
// Descriptors are no longer needed as all GPU work was already
// enqueued.
return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}
template <typename T>
int linear_bias_cuda_backward_impl(
const paddle::platform::CUDADeviceContext& dev_ctx,
const T* input,
const T* weight,
const T* d_output,
bool transx,
bool transy,
bool use_addto,
int in_features,
int batch_size,
int out_features,
T* d_weight,
T* d_bias,
T* d_input,
void* lt_workspace) {
auto handle = dev_ctx.cublas_handle();
auto stream = dev_ctx.stream();
const float alpha = 1.0;
const float beta_zero = 0.0;
const float beta_one = 1.0;
int status = 1;
if (transy) {
#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11600
// cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DEFAULT;
cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_BGRADB;
status = gemm_bgradb_lt((cublasLtHandle_t)handle,
CUBLAS_OP_N,
CUBLAS_OP_T,
in_features,
out_features,
batch_size,
&alpha, /* host pointer */
input,
in_features,
d_output,
out_features,
&beta_zero, /* host pointer */
d_weight,
in_features,
lt_workspace,
kWorkspaceSize,
stream,
true,
static_cast<const void*>(d_bias),
epilogue);
#endif
if (status != 0) {
PD_THROW("cublaslt gemm_bias failed with error code ", status);
#if 0
status = gemm_bias(
handle,
CUBLAS_OP_N,
CUBLAS_OP_T,
in_features,
out_features,
batch_size,
&alpha,
input,
in_features,
d_output,
out_features,
&beta_zero,
d_weight,
in_features);
#endif
}
} else {
#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11600
cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_BGRADA;
status = gemm_bgradb_lt((cublasLtHandle_t)handle,
CUBLAS_OP_N,
CUBLAS_OP_T,
out_features,
in_features,
batch_size,
&alpha, /* host pointer */
d_output,
out_features,
input,
in_features,
&beta_zero, /* host pointer */
d_weight,
out_features,
lt_workspace,
kWorkspaceSize,
stream,
true,
static_cast<const void*>(d_bias),
epilogue);
#endif
if (status != 0) {
PD_THROW("cublaslt gemm_bias failed with error code ", status);
}
}
cublasOperation_t transpose_x = CUBLAS_OP_N;
cublasOperation_t transpose_y = CUBLAS_OP_N;
const float beta_dinput = (use_addto ? beta_one : beta_zero);
if (transy) {
status = gemm_bias(handle,
transpose_x,
transpose_y,
in_features,
batch_size,
out_features,
&alpha,
weight,
in_features,
d_output,
out_features,
&beta_dinput,
d_input,
in_features);
} else {
transpose_x = CUBLAS_OP_T;
transpose_y = CUBLAS_OP_N;
status = gemm_bias(handle,
transpose_x,
transpose_y,
in_features,
batch_size,
out_features,
&alpha,
weight,
out_features,
d_output,
out_features,
&beta_dinput,
d_input,
in_features);
}
return status;
}
__PD_DEFINE_RAW_OP_KERNEL_FUNC(custom_fused_dense, ctx) {
namespace f = paddle::framework;
const auto* x = ctx.Input<f::Tensor>("X");
const auto* y = ctx.Input<f::Tensor>("Y");
const auto* bias = ctx.Input<f::Tensor>("Bias");
auto* out = ctx.Output<f::Tensor>("Out");
bool transx = ctx.Attr<bool>("transx");
bool transy = ctx.Attr<bool>("transy");
auto& dev_ctx = ctx.cuda_device_context();
auto place = dev_ctx.GetPlace();
if (transx) {
PD_THROW("Attr(transx) must be False currently.");
}
const auto& x_dims = x->dims();
int x_m = 1;
for (int i = 0; i < x_dims.size() - 1; i++) {
x_m *= x_dims[i];
}
int x_k = x_dims[x_dims.size() - 1];
const auto& y_dims = y->dims();
int y_k = y_dims[0];
int y_n = y_dims[1];
if (transy) {
y_k = y_dims[1];
y_n = y_dims[0];
}
if (x_k != y_k) {
PD_THROW("The reudce dim of A and B in matmul is not equal.");
}
auto out_dims = x_dims;
out_dims[x_dims.size() - 1] = y_n;
out->Resize(out_dims);
f::Tensor lt_workspace;
lt_workspace.Resize({kWorkspaceSize});
PD_DISPATCH_FLOATING_AND_HALF_TYPES(
x->dtype(), "linear_bias_cuda_forward_impl", ([&] {
linear_bias_cuda_forward_impl<data_t>(
dev_ctx,
x->data<data_t>(),
y->data<data_t>(),
bias->data<data_t>(),
transx,
transy,
x_k,
x_m,
y_n,
out->mutable_data<data_t>(place),
lt_workspace.mutable_data<data_t>(place));
}));
}
__PD_DEFINE_RAW_OP_KERNEL_FUNC(custom_fused_dense_grad, ctx) {
namespace f = paddle::framework;
const auto* x = ctx.Input<f::Tensor>("X");
const auto* y = ctx.Input<f::Tensor>("Y");
const auto* grad_out = ctx.Input<f::Tensor>(f::GradVarName("Out"));
auto* grad_x = ctx.Output<f::Tensor>(f::GradVarName("X"));
auto* grad_y = ctx.Output<f::Tensor>(f::GradVarName("Y"));
auto* grad_bias = ctx.Output<f::Tensor>(f::GradVarName("Bias"));
bool transx = ctx.Attr<bool>("transx");
bool transy = ctx.Attr<bool>("transy");
bool use_addto = ctx.Attr<bool>("use_addto");
auto& dev_ctx = ctx.cuda_device_context();
auto place = dev_ctx.GetPlace();
if (transx) {
PD_THROW("Attr(transx) must be False currently.");
}
const auto& x_dims = x->dims();
int x_m = 1;
for (int i = 0; i < x_dims.size() - 1; i++) {
x_m *= x_dims[i];
}
int x_k = x_dims[x_dims.size() - 1];
const auto& y_dims = y->dims();
int y_k = y_dims[0];
int y_n = y_dims[1];
if (transy) {
y_k = y_dims[1];
y_n = y_dims[0];
}
if (x_k != y_k) {
PD_THROW("The reudce dim of A and B in matmul is not equal.");
}
grad_x->Resize(x_dims);
grad_y->Resize(y_dims);
grad_bias->Resize({y_n});
f::Tensor lt_workspace;
lt_workspace.Resize({kWorkspaceSize});
#if defined(CUDA_VERSION) && CUDA_VERSION < 11000
PD_THROW(
"fused_dense_cuda_backward is not supported on cuda_version < 11000");
#endif
PD_DISPATCH_FLOATING_AND_HALF_TYPES(
x->dtype(), "linear_bias_cuda_backward_impl", ([&] {
linear_bias_cuda_backward_impl<data_t>(
dev_ctx,
x->data<data_t>(),
y->data<data_t>(),
grad_out->data<data_t>(),
transx,
transy,
use_addto,
x_k,
x_m,
y_n,
grad_y->mutable_data<data_t>(place),
grad_bias->mutable_data<data_t>(place),
grad_x->mutable_data<data_t>(place),
lt_workspace.mutable_data<data_t>(place));
}));
}
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from custom_setup_ops import custom_fused_dense
import numpy as np
iters = 1
def test_fused_dense_op(x, weight, bias, transx, transy, grad_out, atol):
# x * weight^t + bias
def run_paddle_mm_bias(x, weight, bias, transx, transy, grad_out):
pp_x = paddle.to_tensor(x, stop_gradient=False)
pp_weight = paddle.to_tensor(weight, stop_gradient=False)
pp_bias = paddle.to_tensor(bias, stop_gradient=False)
pp_out = paddle.matmul(pp_x, pp_weight, transx, transy)
pp_bias_out = paddle.add(pp_out, pp_bias)
pp_grad_out = paddle.to_tensor(grad_out)
paddle.autograd.backward(
[pp_bias_out], [pp_grad_out], retain_graph=True)
return pp_bias_out, pp_x.grad, pp_weight.grad, pp_bias.grad
def run_custom_fuse_dense(x, weight, bias, transx, transy, grad_out):
x_tensor = paddle.to_tensor(x, stop_gradient=False)
weight_tensor = paddle.to_tensor(weight, stop_gradient=False)
bias_tensor = paddle.to_tensor(bias, stop_gradient=False)
out_tensor = custom_fused_dense(x_tensor, weight_tensor, bias_tensor,
transx, transy)
grad_out_tensor = paddle.to_tensor(grad_out)
paddle.autograd.backward(
[out_tensor], [grad_out_tensor], retain_graph=True)
return out_tensor, x_tensor.grad, weight_tensor.grad, bias_tensor.grad
'''
def run_numpy_mm_bias(x, weight, bias, transx, transy):
out = np.matmul(x, weight.transpose(1, 0))
out = out + bias
return out
'''
def run_ref_backward(x, weight, bias, transx, transy, grad_out):
x_tensor = paddle.to_tensor(x, stop_gradient=False)
weight_tensor = paddle.to_tensor(weight, stop_gradient=False)
grad_out_tensor = paddle.to_tensor(grad_out)
# d_weight: x * grad_out^t (nt)
# d_input: weight * grad_out (nn)
if transy:
ref_grad_weight = paddle.matmul(grad_out_tensor, x_tensor, True,
False)
ref_grad_input = paddle.matmul(grad_out_tensor, weight_tensor,
False, False)
else:
ref_grad_weight = paddle.matmul(x_tensor, grad_out_tensor, True,
False)
ref_grad_input = paddle.matmul(grad_out_tensor, weight_tensor,
False, True)
ref_grad_bias = paddle.sum(grad_out_tensor, axis=0)
return ref_grad_input, ref_grad_weight, ref_grad_bias
'''
ref_out = run_numpy_mm_bias(x, weight, bias, transx, transy)
'''
ref_pp_out, ref_pp_x_grad, ref_pp_weight_grad, ref_pp_bias_grad = run_paddle_mm_bias(
x, weight, bias, transx, transy, grad_out)
#print("ref_pp_x_grad shape: ", ref_pp_x_grad.shape)
custom_out, x_grad, weight_grad, bias_grad = run_custom_fuse_dense(
x, weight, bias, transx, transy, grad_out)
#print("x_grad shape: ", x_grad.shape)
new_ref_grad_input, new_ref_grad_weight, new_ref_grad_bias = run_ref_backward(
x, weight, bias, transx, transy, grad_out)
# check out
'''
np.testing.assert_allclose(
ref_out, custom_out.numpy(),
rtol=1e-5, atol=atol)
'''
np.testing.assert_allclose(
ref_pp_out.numpy(), custom_out.numpy(), rtol=1e-5, atol=atol)
# check grad
np.testing.assert_allclose(
ref_pp_x_grad.numpy(), x_grad.numpy(), rtol=1e-5, atol=atol)
np.testing.assert_allclose(
new_ref_grad_weight.numpy(), weight_grad.numpy(), rtol=1e-5, atol=atol)
np.testing.assert_allclose(
new_ref_grad_bias.numpy(), bias_grad.numpy(), rtol=1e-5, atol=atol)
def generate_input_data(m, dtype=np.float16):
# index = np.random.randint(0, 5, (m))
data = np.random.random((m)).astype(dtype)
for i in range(m):
#index[i] = 1
#data[i] = 1.0/(np.exp2(index[i]))
if i % 2 == 0:
data[i] = 0.25
elif i % 3 == 0:
data[i] = 0.5
else:
data[i] = 0.0625
return data
def generate_fixed_input(x_m, in_feature, out_feature, transy, dtype):
x = generate_input_data(x_m * in_feature, dtype)
x = x.reshape(x_m, in_feature)
weight = generate_input_data(out_feature * in_feature, dtype)
if transy:
weight = weight.reshape(out_feature, in_feature)
else:
weight = weight.reshape(in_feature, out_feature)
bias = generate_input_data(out_feature, dtype)
grad_out = generate_input_data(x_m * out_feature, dtype)
grad_out = grad_out.reshape(x_m, out_feature)
return x, weight, bias, grad_out
def generate_ones_input(x_m, in_feature, out_feature, transy, dtype):
x = np.ones(x_m * in_feature).astype(dtype)
x = x.reshape(x_m, in_feature)
weight = np.ones(out_feature * in_feature).astype(dtype)
if transy:
weight = weight.reshape(out_feature, in_feature)
else:
weight = weight.reshape(in_feature, out_feature)
bias = np.ones(out_feature).astype(dtype)
# bias = np.zeros(out_feature).astype(dtype)
grad_out = np.ones(x_m * out_feature).astype(dtype)
grad_out = grad_out.reshape(x_m, out_feature)
return x, weight, bias, grad_out
def test_driver(i=0,
x_m=56,
in_feature=4,
out_feature=8,
transx=False,
transy=True,
dtype=np.float16,
atol=1e-2):
for i in range(iters):
if i == 0:
x, weight, bias, grad_out = generate_ones_input(
x_m, in_feature, out_feature, transy, dtype)
elif i == 1:
x, weight, bias, grad_out = generate_fixed_input(
x_m, in_feature, out_feature, transy, dtype)
else:
x = np.random.random((x_m, in_feature)).astype(dtype)
if transy:
weight = np.random.random(
(out_feature, in_feature)).astype(dtype)
else:
weight = np.random.random(
(in_feature, out_feature)).astype(dtype)
bias = np.random.random((out_feature)).astype(dtype)
grad_out = np.random.random((x_m, out_feature)).astype(dtype)
test_fused_dense_op(x, weight, bias, transx, transy, grad_out, atol)
## Note: mlperf config: x_m from xx to 28672, in_feature is 1024, out_feature is 1024/4096.
for i in range(3):
print("Begin Test ", i)
if i == 0 or i == 1:
fp16_atol = 1e-5
atol = 1e-5
else:
fp16_atol = 0.3
atol = 1e-3
#####################################################
## nt
## random input: an atol of 0.2 is not enough here; it must be set to 0.3
print("gemm_nt + bias test: ")
test_driver(
i,
x_m=56,
in_feature=1024,
out_feature=1024,
transx=False,
transy=True,
atol=fp16_atol)
test_driver(
i,
x_m=56,
in_feature=1024,
out_feature=4096,
transx=False,
transy=True,
atol=fp16_atol)
test_driver(
i,
x_m=1000,
in_feature=1024,
out_feature=1024,
transx=False,
transy=True,
atol=fp16_atol)
## for the 0.0625 fixed input, fp16's max error is 0.03125
test_driver(
i,
x_m=2000,
in_feature=1024,
out_feature=1024,
transx=False,
transy=True,
dtype=np.float32,
atol=atol)
test_driver(
i,
x_m=28672,
in_feature=1024,
out_feature=1024,
transx=False,
transy=True,
dtype=np.float32,
atol=atol)
test_driver(
i,
x_m=28672,
in_feature=1024,
out_feature=4096,
transx=False,
transy=True,
dtype=np.float32,
atol=atol)
#####################################################
## nn
print("gemm_nn + bias test: ")
test_driver(
i,
x_m=2,
in_feature=1,
out_feature=4,
transx=False,
transy=False,
dtype=np.float32,
atol=atol)
test_driver(
i,
x_m=56,
in_feature=1024,
out_feature=1024,
transx=False,
transy=False,
dtype=np.float32,
atol=atol)
test_driver(
i,
x_m=56,
in_feature=1024,
out_feature=4096,
transx=False,
transy=False,
dtype=np.float32,
atol=atol)
test_driver(
i,
x_m=28672,
in_feature=1024,
out_feature=1024,
transx=False,
transy=False,
dtype=np.float32,
atol=atol)
test_driver(
i,
x_m=28672,
in_feature=1024,
out_feature=4096,
transx=False,
transy=False,
dtype=np.float32,
atol=atol)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from custom_setup_ops import custom_fused_dense
import numpy as np
import time
iters = 100
place = paddle.CUDAPlace(0)
def test_fused_dense_op(x, weight, bias, transx, transy, grad_out, atol):
# x * weight^t + bias
def run_paddle_mm_bias(x, weight, bias, transx, transy, grad_out):
pp_x = paddle.to_tensor(x, stop_gradient=False)
pp_weight = paddle.to_tensor(weight, stop_gradient=False)
pp_bias = paddle.to_tensor(bias, stop_gradient=False)
pp_grad_out = paddle.to_tensor(grad_out)
# warmup
pp_out = paddle.matmul(pp_x, pp_weight, transx, transy)
pp_bias_out = paddle.add(pp_out, pp_bias)
paddle.autograd.backward(
[pp_bias_out], [pp_grad_out], retain_graph=True)
paddle.device.cuda.synchronize(place)
t1 = time.time()
for i in range(iters):
pp_out = paddle.matmul(pp_x, pp_weight, transx, transy)
pp_bias_out = paddle.add(pp_out, pp_bias)
paddle.autograd.backward(
[pp_bias_out], [pp_grad_out], retain_graph=True)
paddle.device.cuda.synchronize(place)
t2 = time.time()
print("paddle gemm time is: (ms)", (t2 - t1) * 1000)
return pp_bias_out, pp_x.grad, pp_weight.grad, pp_bias.grad
def run_custom_fuse_dense(x, weight, bias, transx, transy, grad_out):
x_tensor = paddle.to_tensor(x, stop_gradient=False)
weight_tensor = paddle.to_tensor(weight, stop_gradient=False)
bias_tensor = paddle.to_tensor(bias, stop_gradient=False)
grad_out_tensor = paddle.to_tensor(grad_out)
# warmup
out_tensor = custom_fused_dense(x_tensor, weight_tensor, bias_tensor,
transx, transy)
paddle.autograd.backward(
[out_tensor], [grad_out_tensor], retain_graph=True)
paddle.device.cuda.synchronize(place)
t3 = time.time()
for i in range(iters):
out_tensor = custom_fused_dense(x_tensor, weight_tensor,
bias_tensor, transx, transy)
paddle.autograd.backward(
[out_tensor], [grad_out_tensor], retain_graph=True)
paddle.device.cuda.synchronize(place)
t4 = time.time()
print("cublaslt gemm time is: (ms)", (t4 - t3) * 1000)
return out_tensor, x_tensor.grad, weight_tensor.grad, bias_tensor.grad
def run_ref_backward(x, weight, bias, transx, transy, grad_out):
x_tensor = paddle.to_tensor(x, stop_gradient=False)
weight_tensor = paddle.to_tensor(weight, stop_gradient=False)
grad_out_tensor = paddle.to_tensor(grad_out)
# d_weight: x * grad_out^t (nt)
# d_input: weight * grad_out (nn)
if transy:
ref_grad_weight = paddle.matmul(grad_out_tensor, x_tensor, True,
False)
ref_grad_input = paddle.matmul(grad_out_tensor, weight_tensor,
False, False)
else:
ref_grad_weight = paddle.matmul(x_tensor, grad_out_tensor, True,
False)
ref_grad_input = paddle.matmul(grad_out_tensor, weight_tensor,
False, True)
ref_grad_bias = paddle.sum(grad_out_tensor, axis=0)
return ref_grad_input, ref_grad_weight, ref_grad_bias
#print("ref_pp_x_grad shape: ", ref_pp_x_grad.shape)
custom_out, x_grad, weight_grad, bias_grad = run_custom_fuse_dense(
x, weight, bias, transx, transy, grad_out)
ref_pp_out, ref_pp_x_grad, ref_pp_weight_grad, ref_pp_bias_grad = run_paddle_mm_bias(
x, weight, bias, transx, transy, grad_out)
def generate_input_data(m, dtype=np.float16):
# index = np.random.randint(0, 5, (m))
data = np.random.random((m)).astype(dtype)
for i in range(m):
#index[i] = 1
#data[i] = 1.0/(np.exp2(index[i]))
if i % 2 == 0:
data[i] = 0.25
elif i % 3 == 0:
data[i] = 0.5
else:
data[i] = 0.0625
return data
def generate_fixed_input(x_m, in_feature, out_feature, transy, dtype):
x = generate_input_data(x_m * in_feature, dtype)
x = x.reshape(x_m, in_feature)
weight = generate_input_data(out_feature * in_feature, dtype)
if transy:
weight = weight.reshape(out_feature, in_feature)
else:
weight = weight.reshape(in_feature, out_feature)
bias = generate_input_data(out_feature, dtype)
grad_out = generate_input_data(x_m * out_feature, dtype)
grad_out = grad_out.reshape(x_m, out_feature)
return x, weight, bias, grad_out
def generate_ones_input(x_m, in_feature, out_feature, transy, dtype):
x = np.ones(x_m * in_feature).astype(dtype)
x = x.reshape(x_m, in_feature)
weight = np.ones(out_feature * in_feature).astype(dtype)
if transy:
weight = weight.reshape(out_feature, in_feature)
else:
weight = weight.reshape(in_feature, out_feature)
bias = np.ones(out_feature).astype(dtype)
# bias = np.zeros(out_feature).astype(dtype)
grad_out = np.ones(x_m * out_feature).astype(dtype)
grad_out = grad_out.reshape(x_m, out_feature)
return x, weight, bias, grad_out
def test_driver(i=0,
x_m=56,
in_feature=4,
out_feature=8,
transx=False,
transy=True,
dtype=np.float16,
atol=1e-2):
print("m, in_feature, out_feature = ", x_m, in_feature, out_feature)
if i == 0:
x, weight, bias, grad_out = generate_ones_input(
x_m, in_feature, out_feature, transy, dtype)
elif i == 1:
x, weight, bias, grad_out = generate_fixed_input(
x_m, in_feature, out_feature, transy, dtype)
else:
x = np.random.random((x_m, in_feature)).astype(dtype)
if transy:
weight = np.random.random((out_feature, in_feature)).astype(dtype)
else:
weight = np.random.random((in_feature, out_feature)).astype(dtype)
bias = np.random.random((out_feature)).astype(dtype)
grad_out = np.random.random((x_m, out_feature)).astype(dtype)
test_fused_dense_op(x, weight, bias, transx, transy, grad_out, atol)
print("\n")
## Note: mlperf config: x_m from xx to 28672, in_feature is 1024, out_feature is 1024/4096.
i = 0
fp16_atol = 1e-5
atol = 1e-5
data_type = np.float16
###############################################################################################################
## nt
## random input: an atol of 0.2 is not enough here; it must be set to 0.3
print("gemm_nt + bias test: ")
test_driver(
i,
x_m=56,
in_feature=1024,
out_feature=1024,
transx=False,
transy=True,
dtype=data_type,
atol=atol)
test_driver(
i,
x_m=1000,
in_feature=1024,
out_feature=1024,
transx=False,
transy=True,
dtype=data_type,
atol=atol)
test_driver(
i,
x_m=2000,
in_feature=1024,
out_feature=1024,
transx=False,
transy=True,
dtype=data_type,
atol=atol)
test_driver(
i,
x_m=4000,
in_feature=1024,
out_feature=1024,
transx=False,
transy=True,
dtype=data_type,
atol=atol)
test_driver(
i,
x_m=8000,
in_feature=1024,
out_feature=1024,
transx=False,
transy=True,
dtype=data_type,
atol=atol)
test_driver(
i,
x_m=16000,
in_feature=1024,
out_feature=1024,
transx=False,
transy=True,
dtype=data_type,
atol=atol)
test_driver(
i,
x_m=28672,
in_feature=1024,
out_feature=1024,
transx=False,
transy=True,
dtype=data_type,
atol=atol)
test_driver(
i,
x_m=28672,
in_feature=1024,
out_feature=4096,
transx=False,
transy=True,
dtype=data_type,
atol=atol)
###############################################################################################################
## nn
print("gemm_nn + bias test: ")
test_driver(
i,
x_m=56,
in_feature=1024,
out_feature=1024,
transx=False,
transy=False,
dtype=data_type,
atol=atol)
test_driver(
i,
x_m=1000,
in_feature=1024,
out_feature=1024,
transx=False,
transy=False,
dtype=data_type,
atol=atol)
test_driver(
i,
x_m=2000,
in_feature=1024,
out_feature=1024,
transx=False,
transy=False,
dtype=data_type,
atol=atol)
test_driver(
i,
x_m=4000,
in_feature=1024,
out_feature=1024,
transx=False,
transy=False,
dtype=data_type,
atol=atol)
test_driver(
i,
x_m=8000,
in_feature=1024,
out_feature=1024,
transx=False,
transy=False,
dtype=data_type,
atol=atol)
test_driver(
i,
x_m=16000,
in_feature=1024,
out_feature=1024,
transx=False,
transy=False,
dtype=data_type,
atol=atol)
test_driver(
i,
x_m=28672,
in_feature=1024,
out_feature=1024,
transx=False,
transy=False,
dtype=data_type,
atol=atol)
test_driver(
i,
x_m=28672,
in_feature=1024,
out_feature=4096,
transx=False,
transy=False,
dtype=data_type,
atol=atol)
# function
This fused op implements the following logic (a usage sketch follows the TODO list below):
```
layer_norm(residual + dropout(input))
```
# Accuracy test
python test_fused_dropout_op.py
# Perf test
python test_fused_dropout_perf.py
# TODO
1. The unittest only checks accuracy when dropout_rate is 0.
2. How to set is_test to true for dropout in the eval phase?
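A minimal usage sketch, mirroring the call made in `test_fused_dropout_op.py`; the dropout settings below are the defaults that test uses:
```python
import numpy as np
import paddle
from custom_setup_ops import custom_fused_dropout_residual_ln

x = paddle.to_tensor(
    np.random.random((56, 1024)).astype(np.float16), stop_gradient=False)
residual = paddle.to_tensor(
    np.random.random((56, 1024)).astype(np.float16), stop_gradient=False)
ln_scale = paddle.to_tensor(np.ones(1024).astype(np.float16))
ln_bias = paddle.to_tensor(np.zeros(1024).astype(np.float16))
# attrs: ln_epsilon, is_test, fix_seed, seed_val, is_upscale_in_train, dropout_rate
out, mask, ln_mean, ln_var, dropout_residual_out = \
    custom_fused_dropout_residual_ln(
        x, residual, ln_scale, ln_bias, 1e-5, False, True, 0, True, 0.0)
```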
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "paddle/extension.h"
std::vector<std::vector<int64_t>> FusedDropoutResidualLnInferShape(
const std::vector<int64_t>& x_shape,
const std::vector<int64_t>& residual_shape,
const std::vector<int64_t>& ln_scale_shape,
const std::vector<int64_t>& ln_bias_shape) {
int x_m = 1;
for (int i = 0; i < x_shape.size() - 1; i++) {
x_m *= x_shape[i];
}
const std::vector<int64_t> ln_out_shape = {x_m};
return {x_shape, x_shape, ln_out_shape, ln_out_shape, x_shape};
}
// TODO: ln_scale is fp16 for now; how should ln_mean and ln_var be set to fp32?
std::vector<paddle::DataType> FusedDropoutResidualLnInferDtype(
paddle::DataType x_dtype,
paddle::DataType residual_dtype,
paddle::DataType ln_scale_dtype,
paddle::DataType ln_bias_dtype) {
// the type of ln_mean/ln_var is the same as ln_scale.
return {x_dtype,
paddle::DataType::UINT8,
paddle::DataType::FLOAT32,
paddle::DataType::FLOAT32,
// ln_scale_dtype,
// ln_scale_dtype,
x_dtype};
}
PD_BUILD_OP(custom_fused_dropout_residual_ln)
.Inputs({"X", "Residual", "LnScale", "LnBias"})
.Outputs({"Out", "DropoutMask", "LnMean", "LnVar", "DropoutResidualOut"})
.Attrs({"ln_epsilon: float",
"is_test: bool",
"fix_seed: bool",
"seed_val: int",
"is_upscale_in_train: bool",
"dropout_rate: float"})
.SetInferShapeFn(PD_INFER_SHAPE(FusedDropoutResidualLnInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(FusedDropoutResidualLnInferDtype));
PD_BUILD_GRAD_OP(custom_fused_dropout_residual_ln)
.Inputs({"X",
"Residual",
"LnScale",
"LnBias",
"DropoutMask",
"LnMean",
"LnVar",
"DropoutResidualOut",
paddle::Grad("Out")})
.Outputs({paddle::Grad("X"),
paddle::Grad("Residual"),
paddle::Grad("LnScale"),
paddle::Grad("LnBias")})
.Attrs({"ln_epsilon: float",
"is_test: bool",
"fix_seed: bool",
"seed_val: int",
"is_upscale_in_train: bool",
"dropout_rate: float"});
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/extension.h"
#include "paddle/fluid/framework/custom_raw_op_kernel_func.h"
#include "paddle/fluid/operators/fused/fused_dropout_helper.h"
__PD_DEFINE_RAW_OP_KERNEL_FUNC(custom_fused_dropout_residual_ln, ctx) {
namespace f = paddle::framework;
const auto &x = *ctx.Input<f::Tensor>("X");
const auto &residual = *ctx.Input<f::Tensor>("Residual");
const auto &ln_scale = *ctx.Input<f::Tensor>("LnScale");
const auto &ln_bias = *ctx.Input<f::Tensor>("LnBias");
auto &final_out = *ctx.Output<f::Tensor>("Out");
auto &dropout_mask_out = *ctx.Output<f::Tensor>("DropoutMask");
auto &ln_mean = *ctx.Output<f::Tensor>("LnMean");
auto &ln_var = *ctx.Output<f::Tensor>("LnVar");
auto &dropout_residual_out = *ctx.Output<f::Tensor>("DropoutResidualOut");
auto ln_epsilon = ctx.Attr<float>("ln_epsilon");
auto is_test = ctx.Attr<bool>("is_test");
auto fix_seed = ctx.Attr<bool>("fix_seed");
auto seed_val = ctx.Attr<int>("seed_val");
auto is_upscale_in_train = ctx.Attr<bool>("is_upscale_in_train");
auto dropout_rate = ctx.Attr<float>("dropout_rate");
auto &dev_ctx = ctx.cuda_device_context();
auto place = dev_ctx.GetPlace();
// inputs
const auto &x_dims = x.dims();
int x_m = 1;
for (int i = 0; i < x_dims.size() - 1; i++) {
x_m *= x_dims[i];
}
int x_n = x_dims[x_dims.size() - 1];
// outputs
final_out.Resize(x_dims);
dropout_mask_out.Resize(x_dims);
dropout_residual_out.Resize(x_dims);
ln_mean.Resize({x_m});
ln_var.Resize({x_m});
paddle::operators::DropoutParam dropout_param(fix_seed,
0,
is_test,
is_upscale_in_train,
dropout_rate,
nullptr,
seed_val);
PD_DISPATCH_FLOATING_AND_HALF_TYPES(
x.dtype(), "LayernormResidualDropoutBias", ([&] {
paddle::operators::FusedDropoutLayerNormHelper<data_t, uint8_t>
fused_dropout_layernorm_helper(
dev_ctx, x_m, x_n, dropout_param, ln_epsilon);
fused_dropout_layernorm_helper
.LayernormResidualDropoutBias<data_t, true>(
dev_ctx,
x.data<data_t>(), // out_linear_out_data,
residual.data<data_t>(), // residual_data
nullptr, // bias_data,
ln_scale.data<data_t>(),
ln_bias.data<data_t>(),
dev_ctx.Alloc<data_t>(&dropout_residual_out),
dev_ctx.Alloc<uint8_t>(&dropout_mask_out),
dev_ctx.Alloc<data_t>(&final_out),
dev_ctx.Alloc<paddle::operators::LayerNormParamType<data_t>>(
&ln_mean),
dev_ctx.Alloc<paddle::operators::LayerNormParamType<data_t>>(
&ln_var));
}));
}
__PD_DEFINE_RAW_OP_KERNEL_FUNC(custom_fused_dropout_residual_ln_grad, ctx) {
namespace f = paddle::framework;
const auto &ln_scale = *ctx.Input<f::Tensor>("LnScale");
const auto &dropout_mask_out = *ctx.Input<f::Tensor>("DropoutMask");
const auto &ln_mean = *ctx.Input<f::Tensor>("LnMean");
const auto &ln_var = *ctx.Input<f::Tensor>("LnVar");
const auto &dropout_residual_out =
*ctx.Input<f::Tensor>("DropoutResidualOut");
const auto &grad_out = *ctx.Input<f::Tensor>(f::GradVarName("Out"));
auto &grad_x = *ctx.Output<f::Tensor>(f::GradVarName("X"));
auto &grad_residual = *ctx.Output<f::Tensor>(f::GradVarName("Residual"));
auto &grad_ln_scale = *ctx.Output<f::Tensor>(f::GradVarName("LnScale"));
auto &grad_ln_bias = *ctx.Output<f::Tensor>(f::GradVarName("LnBias"));
f::Tensor grad_dropout_residual_out;
auto ln_epsilon = ctx.Attr<float>("ln_epsilon");
auto is_test = ctx.Attr<bool>("is_test");
auto fix_seed = ctx.Attr<bool>("fix_seed");
auto seed_val = ctx.Attr<int>("seed_val");
auto is_upscale_in_train = ctx.Attr<bool>("is_upscale_in_train");
auto dropout_rate = ctx.Attr<float>("dropout_rate");
auto &dev_ctx = ctx.cuda_device_context();
auto place = dev_ctx.GetPlace();
const auto &x_dims = grad_out.dims();
int x_m = 1;
for (int i = 0; i < x_dims.size() - 1; i++) {
x_m *= x_dims[i];
}
int x_n = x_dims[x_dims.size() - 1];
// output
grad_x.Resize(x_dims);
grad_residual.Resize(x_dims);
grad_dropout_residual_out.Resize(x_dims);
grad_ln_scale.Resize(ln_scale.dims());
grad_ln_bias.Resize(ln_scale.dims());
paddle::operators::DropoutParam dropout_param(fix_seed,
0,
is_test,
is_upscale_in_train,
dropout_rate,
nullptr,
seed_val);
PD_DISPATCH_FLOATING_AND_HALF_TYPES(
grad_out.dtype(), "LayernormResidualDropoutBiasGrad", ([&] {
paddle::operators::FusedDropoutLayerNormHelper<data_t, uint8_t>
fused_dropout_layernorm_helper(
dev_ctx, x_m, x_n, dropout_param, ln_epsilon);
fused_dropout_layernorm_helper
.LayernormResidualDropoutBiasGrad<data_t, true>(
dev_ctx,
grad_out.data<data_t>(),
dropout_residual_out.data<data_t>(),
dropout_mask_out.data<uint8_t>(),
ln_scale.data<data_t>(),
ln_mean.data<paddle::operators::LayerNormParamType<data_t>>(),
ln_var.data<paddle::operators::LayerNormParamType<data_t>>(),
dev_ctx.Alloc<data_t>(&grad_dropout_residual_out),
dev_ctx.Alloc<data_t>(&grad_ln_scale),
dev_ctx.Alloc<data_t>(&grad_ln_bias),
dev_ctx.Alloc<data_t>(&grad_x), // d_out_linear_out_data,
nullptr, // d_out_linear_bias_data,
dev_ctx.Alloc<data_t>(&grad_residual));
}));
}
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from custom_setup_ops import custom_fused_dropout_residual_ln
import numpy as np
iters = 2
def test_fused_dropout_op(x, residual, ln_scale, ln_bias, epsilon, dropout_rate,
grad_out, atol):
# layer_norm(dropout(x) + residual)
def run_paddle_op(x, residual, ln_scale, ln_bias, epsilon, dropout_rate,
grad_out):
pp_x = paddle.to_tensor(x, stop_gradient=False)
pp_residual = paddle.to_tensor(residual, stop_gradient=False)
pp_ln_scale = paddle.to_tensor(ln_scale, stop_gradient=False)
pp_ln_bias = paddle.to_tensor(ln_bias, stop_gradient=False)
pp_grad_out = paddle.to_tensor(grad_out)
if dropout_rate > 0:
pp_out = paddle.nn.functional.dropout(pp_x, dropout_rate)
pp_add_out = paddle.add(pp_out, pp_residual)
else:
pp_add_out = paddle.add(pp_x, pp_residual)
pp_out = paddle.nn.functional.layer_norm(
pp_add_out, pp_add_out.shape[1:], pp_ln_scale, pp_ln_bias, epsilon)
paddle.autograd.backward([pp_out], [pp_grad_out], retain_graph=True)
return pp_out, pp_add_out, pp_x.grad, pp_residual.grad, pp_ln_scale.grad, pp_ln_bias.grad
def run_custom_fuse_dropout_op(x, residual, ln_scale, ln_bias, epsilon,
dropout_rate, grad_out):
x_tensor = paddle.to_tensor(x, stop_gradient=False)
residual_tensor = paddle.to_tensor(residual, stop_gradient=False)
ln_scale_tensor = paddle.to_tensor(ln_scale, stop_gradient=False)
ln_bias_tensor = paddle.to_tensor(ln_bias, stop_gradient=False)
grad_out_tensor = paddle.to_tensor(grad_out)
# Note: use the default config of dropout.
is_test = False
fix_seed = True
is_upscale_in_train = True
seed_val = 0
out_tensor, dropout_mask, ln_mean, ln_var, dropout_residual_out = custom_fused_dropout_residual_ln(
x_tensor, residual_tensor, ln_scale_tensor, ln_bias_tensor, epsilon,
is_test, fix_seed, seed_val, is_upscale_in_train, dropout_rate)
paddle.autograd.backward(
[out_tensor], [grad_out_tensor], retain_graph=True)
return out_tensor, dropout_residual_out, x_tensor.grad, residual_tensor.grad, ln_scale_tensor.grad, ln_bias_tensor.grad
pp_out, pp_add_out, pp_x_grad, pp_residual_grad, pp_ln_scale_grad, pp_ln_bias_grad = run_paddle_op(
x, residual, ln_scale, ln_bias, epsilon, dropout_rate, grad_out)
out, add_out, x_grad, residual_grad, ln_scale_grad, ln_bias_grad = run_custom_fuse_dropout_op(
x, residual, ln_scale, ln_bias, epsilon, dropout_rate, grad_out)
# check out
np.testing.assert_allclose(
pp_out.numpy(), out.numpy(), rtol=1e-5, atol=atol)
np.testing.assert_allclose(
pp_add_out.numpy(), add_out.numpy(), rtol=1e-5, atol=atol)
# check grad
np.testing.assert_allclose(
pp_x_grad.numpy(), x_grad.numpy(), rtol=1e-5, atol=atol)
np.testing.assert_allclose(
pp_residual_grad.numpy(), residual_grad.numpy(), rtol=1e-5, atol=atol)
np.testing.assert_allclose(
pp_ln_scale_grad.numpy(), ln_scale_grad.numpy(), rtol=1e-5, atol=atol)
np.testing.assert_allclose(
pp_ln_bias_grad.numpy(), ln_bias_grad.numpy(), rtol=1e-5, atol=atol)
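# For reference, a minimal NumPy sketch of the computation checked above,
# assuming dropout_rate=0 (the only rate exercised below) so dropout is a
# no-op. numpy_ref is a hypothetical helper, not part of the op under test.
def numpy_ref(x, residual, ln_scale, ln_bias, epsilon):
    add_out = x + residual                       # residual add (no dropout)
    mean = add_out.mean(axis=-1, keepdims=True)  # per-row LayerNorm stats
    var = add_out.var(axis=-1, keepdims=True)
    normed = (add_out - mean) / np.sqrt(var + epsilon)
    return normed * ln_scale + ln_bias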
def generate_input_data(m, dtype=np.float16):
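    # Every element is overwritten below; 0.25, 0.5 and 0.0625 are exact
    # powers of two, presumably so the values are exactly representable in
    # float16 and the fused/unfused results can be compared tightly.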
data = np.random.random((m)).astype(dtype)
for i in range(m):
if i % 2 == 0:
data[i] = 0.25
elif i % 3 == 0:
data[i] = 0.5
else:
data[i] = 0.0625
return data
def generate_fixed_input(x_m, in_feature, dtype):
x = generate_input_data(x_m * in_feature, dtype)
x = x.reshape(x_m, in_feature)
residual = generate_input_data(x_m * in_feature, dtype)
residual = residual.reshape(x_m, in_feature)
ln_scale = generate_input_data(in_feature, dtype)
ln_bias = generate_input_data(in_feature, dtype)
grad_out = generate_input_data(x_m * in_feature, dtype)
grad_out = grad_out.reshape(x_m, in_feature)
return x, residual, ln_scale, ln_bias, grad_out
def generate_ones_input(x_m, in_feature, dtype):
x = np.ones(x_m * in_feature).astype(dtype)
x = x.reshape(x_m, in_feature)
residual = np.ones(x_m * in_feature).astype(dtype)
residual = residual.reshape(x_m, in_feature)
ln_scale = np.ones(in_feature).astype(dtype)
ln_bias = np.ones(in_feature).astype(dtype)
grad_out = np.ones(x_m * in_feature).astype(dtype)
grad_out = grad_out.reshape(x_m, in_feature)
return x, residual, ln_scale, ln_bias, grad_out
def test_driver(i=0,
x_m=56,
in_feature=4,
epsilon=1e-5,
dropout_rate=0,
dtype=np.float16,
atol=1e-2):
for j in range(iters):
if i == 0:
x, residual, ln_scale, ln_bias, grad_out = generate_ones_input(
x_m, in_feature, dtype)
elif i == 1:
x, residual, ln_scale, ln_bias, grad_out = generate_fixed_input(
x_m, in_feature, dtype)
else:
x = np.random.random((x_m, in_feature)).astype(dtype)
residual = np.random.random((x_m, in_feature)).astype(dtype)
ln_scale = np.random.random((in_feature)).astype(dtype)
ln_bias = np.random.random((in_feature)).astype(dtype)
grad_out = np.random.random((x_m, in_feature)).astype(dtype)
test_fused_dropout_op(x, residual, ln_scale, ln_bias, epsilon,
dropout_rate, grad_out, atol)
## Note: mlperf config: x_m ranges from xx to 28672, in_feature is 1024.
## Note: only dropout_rate=0 is tested.
for i in range(3):
print("Begin Test ", i)
atol = 1e-5
if i == 0 or i == 1:
fp16_atol = 1e-5
else:
fp16_atol = 1e-2
atol = 1e-3
# fp16
test_driver(
i,
x_m=56,
in_feature=1024,
epsilon=1e-5,
dropout_rate=0,
dtype=np.float16,
atol=fp16_atol)
test_driver(
i,
x_m=28672,
in_feature=1024,
epsilon=1e-5,
dropout_rate=0,
dtype=np.float16,
atol=fp16_atol)
# fp32
test_driver(
i,
x_m=28672,
in_feature=1024,
epsilon=1e-5,
dropout_rate=0,
dtype=np.float32,
atol=atol)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from custom_setup_ops import custom_fused_dropout_residual_ln
import numpy as np
import time
iters = 100
place = paddle.CUDAPlace(0)
def test_fused_dropout_op(x, residual, ln_scale, ln_bias, epsilon, dropout_rate,
grad_out, atol):
# layer_norm(dropout(x) + residual)
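    # Note: atol is kept for interface parity with the correctness test;
    # this file only measures time and performs no numerical checks.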
def run_paddle_op(x, residual, ln_scale, ln_bias, epsilon, dropout_rate,
grad_out):
pp_x = paddle.to_tensor(x, stop_gradient=False)
pp_residual = paddle.to_tensor(residual, stop_gradient=False)
pp_ln_scale = paddle.to_tensor(ln_scale, stop_gradient=False)
pp_ln_bias = paddle.to_tensor(ln_bias, stop_gradient=False)
pp_grad_out = paddle.to_tensor(grad_out)
# warmup
if dropout_rate > 0:
pp_out = paddle.nn.functional.dropout(pp_x, dropout_rate)
pp_add_out = paddle.add(pp_out, pp_residual)
else:
pp_add_out = paddle.add(pp_x, pp_residual)
pp_out = paddle.nn.functional.layer_norm(
pp_add_out, pp_add_out.shape[1:], pp_ln_scale, pp_ln_bias, epsilon)
paddle.autograd.backward([pp_out], [pp_grad_out], retain_graph=True)
paddle.device.cuda.synchronize(place)
t1 = time.time()
for i in range(iters):
if dropout_rate > 0:
pp_out = paddle.nn.functional.dropout(pp_x, dropout_rate)
pp_add_out = paddle.add(pp_out, pp_residual)
else:
pp_add_out = paddle.add(pp_x, pp_residual)
pp_out = paddle.nn.functional.layer_norm(
pp_add_out, pp_add_out.shape[1:], pp_ln_scale, pp_ln_bias,
epsilon)
paddle.autograd.backward([pp_out], [pp_grad_out], retain_graph=True)
paddle.device.cuda.synchronize(place)
t2 = time.time()
print("paddle dropout+add+ln time is: (ms)", (t2 - t1) * 1000)
return pp_out, pp_add_out, pp_x.grad, pp_residual.grad, pp_ln_scale.grad, pp_ln_bias.grad
def run_custom_fuse_dropout_op(x, residual, ln_scale, ln_bias, epsilon,
dropout_rate, grad_out):
x_tensor = paddle.to_tensor(x, stop_gradient=False)
residual_tensor = paddle.to_tensor(residual, stop_gradient=False)
ln_scale_tensor = paddle.to_tensor(ln_scale, stop_gradient=False)
ln_bias_tensor = paddle.to_tensor(ln_bias, stop_gradient=False)
grad_out_tensor = paddle.to_tensor(grad_out)
# Note: use the default config of dropout.
is_test = False
fix_seed = True
is_upscale_in_train = True
seed_val = 0
# warmup
out_tensor, dropout_mask, ln_mean, ln_var, dropout_residual_out = custom_fused_dropout_residual_ln(
x_tensor, residual_tensor, ln_scale_tensor, ln_bias_tensor, epsilon,
is_test, fix_seed, seed_val, is_upscale_in_train, dropout_rate)
paddle.autograd.backward(
[out_tensor], [grad_out_tensor], retain_graph=True)
paddle.device.cuda.synchronize(place)
t3 = time.time()
for i in range(iters):
out_tensor, dropout_mask, ln_mean, ln_var, dropout_residual_out = custom_fused_dropout_residual_ln(
x_tensor, residual_tensor, ln_scale_tensor, ln_bias_tensor,
epsilon, is_test, fix_seed, seed_val, is_upscale_in_train,
dropout_rate)
paddle.autograd.backward(
[out_tensor], [grad_out_tensor], retain_graph=True)
paddle.device.cuda.synchronize(place)
t4 = time.time()
print("fused_dropout_residual_ln op time is: (ms)", (t4 - t3) * 1000)
return out_tensor, dropout_residual_out, x_tensor.grad, residual_tensor.grad, ln_scale_tensor.grad, ln_bias_tensor.grad
pp_out, pp_add_out, pp_x_grad, pp_residual_grad, pp_ln_scale_grad, pp_ln_bias_grad = run_paddle_op(
x, residual, ln_scale, ln_bias, epsilon, dropout_rate, grad_out)
out, add_out, x_grad, residual_grad, ln_scale_grad, ln_bias_grad = run_custom_fuse_dropout_op(
x, residual, ln_scale, ln_bias, epsilon, dropout_rate, grad_out)
def generate_input_data(m, dtype=np.float16):
data = np.random.random((m)).astype(dtype)
for i in range(m):
if i % 2 == 0:
data[i] = 0.25
elif i % 3 == 0:
data[i] = 0.5
else:
data[i] = 0.0625
return data
def generate_fixed_input(x_m, in_feature, dtype):
x = generate_input_data(x_m * in_feature, dtype)
x = x.reshape(x_m, in_feature)
residual = generate_input_data(x_m * in_feature, dtype)
residual = residual.reshape(x_m, in_feature)
ln_scale = generate_input_data(in_feature, dtype)
ln_bias = generate_input_data(in_feature, dtype)
grad_out = generate_input_data(x_m * in_feature, dtype)
grad_out = grad_out.reshape(x_m, in_feature)
return x, residual, ln_scale, ln_bias, grad_out
def generate_ones_input(x_m, in_feature, dtype):
x = np.ones(x_m * in_feature).astype(dtype)
x = x.reshape(x_m, in_feature)
residual = np.ones(x_m * in_feature).astype(dtype)
residual = residual.reshape(x_m, in_feature)
ln_scale = np.ones(in_feature).astype(dtype)
ln_bias = np.ones(in_feature).astype(dtype)
grad_out = np.ones(x_m * in_feature).astype(dtype)
grad_out = grad_out.reshape(x_m, in_feature)
return x, residual, ln_scale, ln_bias, grad_out
def test_driver(i=0,
x_m=56,
in_feature=4,
epsilon=1e-5,
dropout_rate=0,
dtype=np.float16,
atol=1e-2):
print("nrows, ncols, dtype = ", x_m, in_feature, dtype)
if i == 0:
x, residual, ln_scale, ln_bias, grad_out = generate_ones_input(
x_m, in_feature, dtype)
elif i == 1:
x, residual, ln_scale, ln_bias, grad_out = generate_fixed_input(
x_m, in_feature, dtype)
else:
x = np.random.random((x_m, in_feature)).astype(dtype)
residual = np.random.random((x_m, in_feature)).astype(dtype)
ln_scale = np.random.random((in_feature)).astype(dtype)
ln_bias = np.random.random((in_feature)).astype(dtype)
grad_out = np.random.random((x_m, in_feature)).astype(dtype)
test_fused_dropout_op(x, residual, ln_scale, ln_bias, epsilon, dropout_rate,
grad_out, atol)
## Note: mlperf config: x_m ranges from xx to 28672, in_feature is 1024.
## Note: dropout_rate=0.1 is used for the timing runs below; this file
## measures performance only and does not check numerical results.
i = 0
atol = 1e-5
fp16_atol = 1e-5
# fp16
print("***************fp16 performance: ")
x_m_values = [
56, 200, 500, 1000, 2000, 4000, 8000, 16000, 18000, 20000, 25000, 28000
]
for x_m in x_m_values:
test_driver(
i,
x_m=x_m,
in_feature=1024,
epsilon=1e-5,
dropout_rate=0.1,
dtype=np.float16,
atol=fp16_atol)
# fp32
print("**************fp32 performance: ")
for x_m in x_m_values:
test_driver(
i,
x_m=x_m,
in_feature=1024,
epsilon=1e-5,
dropout_rate=0.1,
dtype=np.float32,
atol=atol)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "paddle/extension.h"
std::vector<std::vector<int64_t>> LrInferShape(std::vector<int64_t> x_shape) {
return {x_shape};
}
// input: [1], int64
// output: [1], float
std::vector<paddle::DataType> LrInferDtype(paddle::DataType x_dtype) {
return {paddle::DataType::FLOAT32};
}
PD_BUILD_OP(custom_lr)
.Inputs({"X"})
.Outputs({"Out"})
.Attrs({"base_lr: float",
"end_lr: float",
"degree: float",
"warmup_step: int64_t",
"start_warmup_step: int64_t",
"max_step: int64_t"})
.SetInferShapeFn(PD_INFER_SHAPE(LrInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(LrInferDtype));
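// Hypothetical Python-side call once the extension is built; attribute
// values are passed positionally in the order registered above:
//   from custom_setup_ops import custom_lr
//   out = custom_lr(step, base_lr, end_lr, degree, warmup_step,
//                   start_warmup_step, max_step)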
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/extension.h"
#include "paddle/fluid/framework/custom_raw_op_kernel_func.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
// x_data[0]: current step which is numbered from 0.
// Note: when computing, we should use x_data[0] + 1.
// y_data[0]: the lr var of this step
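// Schedule implemented below (polynomial decay with linear warmup):
//   warmup (mod_step < warmup_step):  lr = base_lr * mod_step / warmup_step
//   decay:  p = (step - offset_step) / max_step
//           lr = (base_lr - end_lr) * (1 - p)^degree + end_lr   (p < 1)
//           lr = end_lr                                         (p >= 1)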
__global__ void compute_lr_fwd_kernel(const int64_t* x_data,
float* y_data,
float base_lr,
float end_lr,
float degree,
int64_t start_warmup_step,
int64_t warmup_step,
int64_t max_step) {
int64_t step = x_data[0] + 1;
int64_t offset_step = (start_warmup_step == 0 ? 1 : 0);
int64_t mod_step = step - offset_step - start_warmup_step;
double y;
if (mod_step < warmup_step) {
auto p = mod_step / (warmup_step + 1e-6);
y = base_lr * p;
} else {
auto p = (step - offset_step) / static_cast<double>(max_step);
p = (p >= 1 ? 0 : (::pow(1 - p, degree)));
y = (base_lr - end_lr) * p + end_lr;
}
y_data[0] = static_cast<float>(y);
// y_data[0] = base_lr * (static_cast<float>(max_step - x_data[0]) /
// max_step);
}
__PD_DEFINE_RAW_OP_KERNEL_FUNC(custom_lr, ctx) {
namespace f = paddle::framework;
const auto* x = ctx.Input<f::Tensor>("X");
auto* out = ctx.Output<f::Tensor>("Out");
auto& dev_ctx = ctx.cuda_device_context();
auto place = dev_ctx.GetPlace();
auto stream = dev_ctx.stream();
float base_lr = ctx.Attr<float>("base_lr");
float end_lr = ctx.Attr<float>("end_lr");
float degree = ctx.Attr<float>("degree");
int64_t start_warmup_step = ctx.Attr<int64_t>("start_warmup_step");
int64_t warmup_step = ctx.Attr<int64_t>("warmup_step");
int64_t max_step = ctx.Attr<int64_t>("max_step");
const auto& x_dims = x->dims();
if (x_dims.size() != 1 || x_dims[0] != 1) {
PD_THROW("The shape of input x must be [1].");
}
auto out_dims = x_dims;
out->Resize(out_dims);
const int64_t* x_data = x->data<int64_t>();
float* out_data = out->mutable_data<float>(x->place());
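  // The schedule produces a single scalar, so a 1-block, 1-thread launch
  // is sufficient.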
compute_lr_fwd_kernel<<<1, 1, 0, stream>>>(x_data,
out_data,
base_lr,
end_lr,
degree,
start_warmup_step,
warmup_step,
max_step);
}
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import paddle.fluid.core as core
from paddle.utils.cpp_extension import CUDAExtension, setup
# The first argument to os.environ.get() must be an environment-variable
# name, not a path; 'PADDLE_BUILD_DIR' is a hypothetical variable name,
# with the original build path kept as the fallback default.
compile_dir = os.environ.get('PADDLE_BUILD_DIR',
                             '/public/home/zhangqha/for_baidu/Paddle-develop/build')
define_macros = []
if core.is_compiled_with_mkldnn():
define_macros.append(('PADDLE_WITH_MKLDNN', None))
if core.is_compiled_with_nccl():
define_macros.append(('PADDLE_WITH_NCCL', None))
define_macros.append(('PADDLE_WITH_HIP', None))
define_macros.append(('PADDLE_WITH_RCCL', None))
setup(
name='custom_setup_ops',
ext_modules=CUDAExtension(
sources=[
#'./fused_dense_op/fused_dense_cuda.cc',
#'./fused_dense_op/fused_dense_cuda.cu',
###'./fused_dropout_residual_ln/fused_dropout_residual_ln_cuda.cc',
###'./fused_dropout_residual_ln/fused_dropout_residual_ln_cuda.cu',
#'./fmhalib/fmha_cuda.cc',
#'./fmhalib/fmha_cuda.cu',
'./sort_bert_inputs_across_devices/sort_bert_inputs_across_devices.cc',
'./sort_bert_inputs_across_devices/sort_bert_inputs_across_devices.cu',
'./lr_op/lr_op_cuda.cc',
'./lr_op/lr_op_cuda.cu',
'./acc_merge/acc_merge.cc',
'./acc_merge/acc_merge.cu',
],
# extra_objects=[os.path.join(apex_lib_dir, 'libfmha.so')],
# include_dirs=[apex_dir],
# library_dirs=[apex_lib_dir],
# extra_link_args=['-lfmha', '-ldl', '-lcublas'],
_compile_dir=compile_dir,
define_macros=define_macros))
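# A typical build/usage sketch (the importable module name comes from
# `name` above):
#   python setup.py install
#   python -c "from custom_setup_ops import custom_lr"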