// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <vector>

#include "paddle/extension.h"

#define CHECK_INPUT(x)                                        \
  PD_CHECK(x.place().GetType() == phi::AllocationType::GPU,   \
           #x " must be a GPU Tensor.")

std::vector<paddle::Tensor> fmha_cuda_forward(const paddle::Tensor& qkv,
                                              const paddle::Tensor& cu_seqlen,
                                              const paddle::Tensor& host_seqlen,
                                              bool is_test,
                                              float dropout_rate,
                                              bool zero_tensors,
                                              bool use_fmha_mke_opt);

std::vector<paddle::Tensor> fmha_cuda_backward(
    const paddle::Tensor& qkv,
    const paddle::Tensor& cu_seqlen,
    const paddle::Tensor& host_seqlen,
    const paddle::Tensor& softmax_input,
    const paddle::Tensor& d_ctx_out,
    bool is_test,
    float dropout_rate,
    bool zero_tensors,
    bool use_fmha_mke_opt);

/*
 * x_shape(fp16) = [total_tokens, 3, num_heads, head_size]
 * y_shape(int32) = [batch_size + 1]
 */
std::vector<std::vector<int64_t>> FmhaInferShape(
    const std::vector<int64_t>& x_shape,
    const std::vector<int64_t>& y_shape,
    const std::vector<int64_t>& host_y_shape,
    const bool& is_test,
    const float& dropout_rate,
    const bool& zero_tensors,
    const bool& use_fmha_mke_opt) {
  int total = x_shape[0];
  int num_heads = x_shape[2];
  int head_size = x_shape[3];
  int batch_size = y_shape[0] - 1;

  if (x_shape[1] != 3) {
    PD_THROW(
        "The shape for input QKV should be [total_tokens, 3, num_heads, "
        "head_size].");
  }

  int max_seq_len = 512;
  std::vector<int64_t> ctx_out_shape = {total, num_heads, head_size};
  std::vector<int64_t> s_out_shape = {
      batch_size, num_heads, max_seq_len, max_seq_len};
  return {ctx_out_shape, s_out_shape};
}

std::vector<paddle::DataType> FmhaInferDtype(paddle::DataType x_dtype,
                                             paddle::DataType y_dtype,
                                             paddle::DataType host_y_dtype) {
  return {x_dtype, x_dtype};
}

std::vector<paddle::Tensor> FmhaCUDAForward(const paddle::Tensor& qkv,
                                            const paddle::Tensor& cu_seqlen,
                                            const paddle::Tensor& host_seqlen,
                                            bool is_test,
                                            float dropout_rate,
                                            bool zero_tensors,
                                            bool use_fmha_mke_opt) {
  CHECK_INPUT(qkv);
  CHECK_INPUT(cu_seqlen);
  // Note: do not use CHECK_INPUT(host_seqlen) here,
  // because that would force this input to be a GPU tensor.
  return fmha_cuda_forward(qkv, cu_seqlen, host_seqlen, is_test, dropout_rate,
                           zero_tensors, use_fmha_mke_opt);
}

std::vector<paddle::Tensor> FmhaCUDABackward(
    const paddle::Tensor& qkv,
    const paddle::Tensor& cu_seqlen,
    const paddle::Tensor& host_seqlen,
    const paddle::Tensor& softmax_input,
    const paddle::Tensor& d_ctx_out,
    bool is_test,
    float dropout_rate,
    bool zero_tensors,
    bool use_fmha_mke_opt) {
  CHECK_INPUT(qkv);
  CHECK_INPUT(cu_seqlen);
  CHECK_INPUT(softmax_input);
  CHECK_INPUT(d_ctx_out);

  return fmha_cuda_backward(qkv, cu_seqlen, host_seqlen, softmax_input,
                            d_ctx_out, is_test, dropout_rate, zero_tensors,
                            use_fmha_mke_opt);
}

PD_BUILD_OP(custom_fmha)
    .Inputs({"QKV", "CuSeqLen", "HostSeqLen"})
    .Outputs({"CtxOut", "SOut"})
    .Attrs({"is_test: bool",
            "dropout_rate: float",
            "zero_tensors: bool",
            "use_fmha_mke_opt: bool"})
    .SetKernelFn(PD_KERNEL(FmhaCUDAForward))
    .SetInferShapeFn(PD_INFER_SHAPE(FmhaInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(FmhaInferDtype));

PD_BUILD_GRAD_OP(custom_fmha)
    .Inputs({"QKV", "CuSeqLen", "HostSeqLen", "SOut", paddle::Grad("CtxOut")})
    .Outputs({paddle::Grad("QKV")})
    .Attrs({"is_test: bool",
            "dropout_rate: float",
            "zero_tensors: bool",
            "use_fmha_mke_opt: bool"})
    .SetKernelFn(PD_KERNEL(FmhaCUDABackward));
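
// Example: one way to JIT-compile and call the registered op from Python is
// paddle.utils.cpp_extension.load. This is an illustrative sketch only; the
// source file names and attribute values below are placeholders, not taken
// from this repo.
//
//   from paddle.utils.cpp_extension import load
//
//   custom_ops = load(name="custom_fmha_jit",
//                     sources=["fmha_op.cc", "fmha_cuda_kernel.cu"])
//
//   # qkv: float16 GPU tensor, shape [total_tokens, 3, num_heads, head_size]
//   # cu_seqlen: int32 GPU tensor, shape [batch_size + 1]
//   # host_seqlen: int32 CPU tensor (kept on host; see CHECK_INPUT note above)
//   ctx_out, s_out = custom_ops.custom_fmha(
//       qkv, cu_seqlen, host_seqlen,
//       False,   # is_test
//       0.1,     # dropout_rate
//       False,   # zero_tensors
//       False)   # use_fmha_mke_opt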