Unverified Commit 85bc98ac authored by qinyiqun's avatar qinyiqun Committed by GitHub
Browse files

ISSUE/628 适配QY C610 GPU,增加编译选项,适配已有算子。添加bge类模型所需的算子, (#629)



* ISSUE/628 适配QY C610 GPU,增加编译选项,适配已有算子。添加bge类模型所需的算子,包括gelu,layer_norm,lp_norm(支持l1,l2 norm),relu,softmax,tanh。

---------
Co-authored-by: default avatarxgqdut2016 <kenan_gewei@163.com>
Co-authored-by: default avatarxgqdut2016 <140036308+xgqdut2016@users.noreply.github.com>
parent 7c397dd2
......@@ -8,7 +8,7 @@
#ifdef ENABLE_ASCEND_API
#include "ascend/rearrange_ascend.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
#include "nvidia/rearrange_nvidia.cuh"
#endif
#ifdef ENABLE_CAMBRICON_API
......@@ -52,6 +52,9 @@ __C infiniStatus_t infiniopCreateRearrangeDescriptor(
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
CREATE(INFINI_DEVICE_HYGON, nvidia);
#endif
......@@ -99,6 +102,9 @@ __C infiniStatus_t infiniopRearrange(
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
CALCULATE(INFINI_DEVICE_HYGON, nvidia);
#endif
......@@ -144,6 +150,9 @@ __C infiniStatus_t infiniopDestroyRearrangeDescriptor(
#ifdef ENABLE_ILUVATAR_API
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
DELETE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
DELETE(INFINI_DEVICE_HYGON, nvidia);
#endif
......
#ifndef __RELU_CUDA_H__
#define __RELU_CUDA_H__
#include <cmath>
namespace op::relu::cuda {
/**
 * Element-wise ReLU functor: returns x when x > 0 and zero otherwise.
 *
 * num_inputs is 1 (a unary operator). bf16 and fp16 inputs are widened to
 * float for the comparison and narrowed back afterwards; every other type is
 * compared against a zero of its own type, so no value is routed through
 * double (the previous fallback used `0.0` literals, which promoted integral
 * inputs to double and back).
 */
typedef struct ReluOp {
public:
    static constexpr size_t num_inputs = 1;

    template <typename T>
    __device__ __forceinline__ T operator()(const T &x) const {
        if constexpr (std::is_same_v<T, cuda_bfloat16>) {
            const float x_f = __bfloat162float(x);
            return __float2bfloat16(x_f > 0.0f ? x_f : 0.0f);
        } else if constexpr (std::is_same_v<T, half>) {
            const float x_f = __half2float(x);
            return __float2half(x_f > 0.0f ? x_f : 0.0f);
        } else {
            // float, double and any other arithmetic T: stay in T throughout.
            // A NaN input fails the comparison and therefore maps to zero,
            // exactly as before.
            const T zero = T(0);
            return x > zero ? x : zero;
        }
    }
} ReluOp;
} // namespace op::relu::cuda
#endif // __RELU_CUDA_H__
#ifdef ENABLE_NINETOOTHED
#include "../../../../../build/ninetoothed/relu.h"
#endif
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
#include "../cuda/kernel.cuh"
#include "relu_nvidia.cuh"
namespace op::relu::nvidia {
......@@ -40,7 +43,7 @@ infiniStatus_t Descriptor::calculate(
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
#ifdef ENABLE_NINETOOTHED
const auto &ndim{_info.getNdim()};
const auto &x_shape_{_info.getInputShape(0)};
const auto &x_strides_{_info.getInputStrides(0)};
......@@ -72,9 +75,20 @@ infiniStatus_t Descriptor::calculate(
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
#else
switch (_dtype) {
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::ReluOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F16:
return _device_info->calculate<256, cuda::ReluOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, cuda::ReluOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::ReluOp, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
#endif
return INFINI_STATUS_SUCCESS;
}
} // namespace op::relu::nvidia
#endif
#ifndef __RELU_NVIDIA_API_H__
#define __RELU_NVIDIA_API_H__
#ifdef ENABLE_NINETOOTHED
// #ifdef ENABLE_NINETOOTHED
#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
ELEMENTWISE_DESCRIPTOR(relu, nvidia)
#endif
// #endif
#endif // __RELU_NVIDIA_API_H__
......@@ -5,11 +5,9 @@
#ifdef ENABLE_CPU_API
#include "cpu/relu_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#ifdef ENABLE_NINETOOTHED
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
#include "nvidia/relu_nvidia.cuh"
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_NINETOOTHED
#include "metax/relu_metax.h"
......@@ -36,14 +34,13 @@ __C infiniStatus_t infiniopCreateReluDescriptor(
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#endif
#ifdef ENABLE_ILUVATAR_API
#ifdef ENABLE_NINETOOTHED
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_NINETOOTHED
......@@ -70,14 +67,13 @@ __C infiniStatus_t infiniopGetReluWorkspaceSize(infiniopReluDescriptor_t desc, s
GET(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
GET(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#endif
#ifdef ENABLE_ILUVATAR_API
#ifdef ENABLE_NINETOOTHED
GET(INFINI_DEVICE_ILUVATAR, nvidia)
#endif
#ifdef ENABLE_QY_API
GET(INFINI_DEVICE_QY, nvidia)
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_NINETOOTHED
......@@ -111,14 +107,13 @@ __C infiniStatus_t infiniopRelu(
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#endif
#ifdef ENABLE_ILUVATAR_API
#ifdef ENABLE_NINETOOTHED
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_NINETOOTHED
......@@ -147,14 +142,13 @@ infiniopDestroyReluDescriptor(infiniopReluDescriptor_t desc) {
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#endif
#ifdef ENABLE_ILUVATAR_API
#ifdef ENABLE_NINETOOTHED
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
DELETE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_NINETOOTHED
......
......@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/rms_norm_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
#include "nvidia/rms_norm_nvidia.cuh"
#endif
#ifdef ENABLE_ASCEND_API
......@@ -52,6 +52,9 @@ __C infiniStatus_t infiniopCreateRMSNormDescriptor(
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
CREATE(INFINI_DEVICE_HYGON, nvidia);
#endif
......@@ -94,6 +97,9 @@ __C infiniStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t d
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
GET(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
GET(INFINI_DEVICE_HYGON, nvidia);
#endif
......@@ -137,6 +143,9 @@ __C infiniStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *works
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
CALCULATE(INFINI_DEVICE_HYGON, nvidia);
#endif
......@@ -179,6 +188,9 @@ __C infiniStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t
#ifdef ENABLE_ILUVATAR_API
DESTROY(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
DESTROY(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
DESTROY(INFINI_DEVICE_HYGON, nvidia);
#endif
......
......@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/rope_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
#include "nvidia/rope_nvidia.cuh"
#endif
#ifdef ENABLE_ASCEND_API
......@@ -56,6 +56,9 @@ __C infiniStatus_t infiniopCreateRoPEDescriptor(
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
CREATE(INFINI_DEVICE_HYGON, nvidia);
#endif
......@@ -98,6 +101,9 @@ __C infiniStatus_t infiniopGetRoPEWorkspaceSize(infiniopRoPEDescriptor_t desc,
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
GET(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
GET(INFINI_DEVICE_HYGON, nvidia);
#endif
......@@ -149,6 +155,9 @@ __C infiniStatus_t infiniopRoPE(
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
CALCULATE(INFINI_DEVICE_HYGON, nvidia);
#endif
......@@ -192,6 +201,9 @@ infiniopDestroyRoPEDescriptor(infiniopRoPEDescriptor_t desc) {
#ifdef ENABLE_ILUVATAR_API
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
DELETE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
DELETE(INFINI_DEVICE_HYGON, nvidia);
#endif
......
......@@ -22,8 +22,13 @@ public:
__nv_bfloat16 denominator = __float2bfloat16(__fadd_rn(1.0f, __expf(__bfloat162float(-x))));
return __float2bfloat16(1.0f) / denominator;
} else if constexpr (std::is_same_v<T, float>) {
float denominator = __fadd_rn(1.0f, __expf(-x));
return __frcp_rn(denominator);
if (x >= 0.0f) {
float z = expf(-x);
return 1.0f / (1.0f + z);
} else {
float z = expf(x);
return z / (1.0f + z);
}
} else { // double
return 1.0 / (1.0 + exp(-x));
}
......
......@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/sigmoid_cpu.h"
#endif
#ifdef ENABLE_NVIDIA_API
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
#include "nvidia/sigmoid_nvidia.cuh"
#endif
......@@ -31,6 +31,9 @@ __C infiniStatus_t infiniopCreateSigmoidDescriptor(
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -52,6 +55,9 @@ __C infiniStatus_t infiniopGetSigmoidWorkspaceSize(infiniopSigmoidDescriptor_t d
#endif
#ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#ifdef ENABLE_QY_API
GET(INFINI_DEVICE_QY, nvidia)
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -82,6 +88,9 @@ __C infiniStatus_t infiniopSigmoid(
#ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_QY_API
CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -106,6 +115,9 @@ infiniopDestroySigmoidDescriptor(infiniopSigmoidDescriptor_t desc) {
#ifdef ENABLE_NVIDIA_API
DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_QY_API
DELETE(INFINI_DEVICE_QY, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......
#ifndef __SOFTMAX_KERNEL_CUH__
#define __SOFTMAX_KERNEL_CUH__
#include <cub/block/block_reduce.cuh>
// Running (max, sum) pair carried through the softmax reductions in this file.
// reduce_dms_op merges two such pairs; every input element enters the
// reduction as (value, 1.0f).
struct __align__(8) DataMaxSum {
float max_tmp; // largest value covered by this partial result
float sum_tmp; // sum of exp(value - max_tmp) over the values covered so far
};
// Merges two partial (max, sum) results into one.
// The side with the strictly larger max_tmp is kept (ties keep `b`) and the
// other side's sum is rescaled by exp(other.max_tmp - kept.max_tmp) before it
// is added, so the returned sum is expressed relative to the returned max.
__device__ __forceinline__ DataMaxSum reduce_dms_op(DataMaxSum a,
                                                    DataMaxSum b) {
    if (a.max_tmp > b.max_tmp) {
        a.sum_tmp = a.sum_tmp + b.sum_tmp * __expf(b.max_tmp - a.max_tmp);
        return a;
    }
    b.sum_tmp = b.sum_tmp + a.sum_tmp * __expf(a.max_tmp - b.max_tmp);
    return b;
}
/**
 * Softmax of one row per thread block.
 *
 * blockIdx.x selects the row. With blockIdx.x = outer * stride + inner, the
 * row starts at element outer * stride * dimsize + inner and its elements are
 * `stride` apart. Threads stride over the row in steps of BLOCK_SIZE, so the
 * block must be launched with BLOCK_SIZE threads in x.
 *
 * @param input   source buffer
 * @param output  destination buffer, indexed exactly like `input`
 * @param dimsize number of elements along the softmax axis
 * @param stride  element distance between consecutive entries of a row
 *
 * All offsets are computed in ptrdiff_t; the previous `int` base offset
 * overflowed once a tensor held more than 2^31 elements.
 */
template <typename T, unsigned int BLOCK_SIZE>
__device__ void blockSoftmaxKernel(
    T const *input, T *output, size_t dimsize,
    ptrdiff_t stride) {
    const ptrdiff_t row = static_cast<ptrdiff_t>(blockIdx.x);
    const ptrdiff_t inner = row % stride;
    const ptrdiff_t base = inner + (row - inner) * static_cast<ptrdiff_t>(dimsize);

    // Per-thread online reduction of (max, sum of exp(x - max)).
    DataMaxSum dms_partial;
    dms_partial.max_tmp = -__FLT_MAX__;
    dms_partial.sum_tmp = 0.0f;
    for (size_t ind = threadIdx.x; ind < dimsize; ind += BLOCK_SIZE) {
        DataMaxSum dms_input;
        dms_input.max_tmp = static_cast<float>(input[base + static_cast<ptrdiff_t>(ind) * stride]);
        dms_input.sum_tmp = 1.0f;
        dms_partial = reduce_dms_op(dms_partial, dms_input);
    }

    typedef cub::BlockReduce<DataMaxSum, BLOCK_SIZE> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    __shared__ DataMaxSum dms_total;
    DataMaxSum dms_block = BlockReduce(temp_storage).Reduce(dms_partial, reduce_dms_op);
    // BlockReduce only defines the result in thread 0, so publish it through
    // shared memory for the rest of the block.
    if (threadIdx.x == 0) {
        dms_total = dms_block;
    }
    __syncthreads();

    const float row_max = dms_total.max_tmp;
    const float inv = __fdividef(1.0F, dms_total.sum_tmp);
    for (size_t ind = threadIdx.x; ind < dimsize; ind += BLOCK_SIZE) {
        const ptrdiff_t offset = base + static_cast<ptrdiff_t>(ind) * stride;
        output[offset] = static_cast<T>(
            __expf(static_cast<float>(input[offset]) - row_max) * inv);
    }
}
// Binary functor returning a + b; used as the ReductionOp of WarpAllReduce.
template <typename T>
struct SumOp {
    __device__ __forceinline__ T operator()(const T &a, const T &b) const {
        const T total = a + b;
        return total;
    }
};
// Binary functor returning max(a, b); used as the ReductionOp of WarpAllReduce.
template <typename T>
struct MaxOp {
    __device__ __forceinline__ T operator()(const T &a, const T &b) const {
        const T larger = max(a, b);
        return larger;
    }
};
// Butterfly all-reduce across lanes using XOR shuffles.
// The exchange distance starts at thread_group_width / 2 and halves each
// round; after the last round every participating lane holds the combined
// value. The shuffle is issued with the full 0xffffffff lane mask.
template <template <typename> class ReductionOp, typename T,
          int thread_group_width>
__inline__ __device__ T WarpAllReduce(T val) {
    const ReductionOp<T> combine{};
    int lane_mask = thread_group_width / 2;
    while (lane_mask > 0) {
        const T partner = __shfl_xor_sync(0xffffffff, val, lane_mask);
        val = combine(val, partner);
        lane_mask /= 2;
    }
    return val;
}
/**
 * Softmax of one row per group of BLOCK_SIZE_x consecutive lanes.
 *
 * Row index = blockIdx.x * blockDim.y + threadIdx.y; the threadIdx.x lanes of
 * that row cooperate through WarpAllReduce. Each lane keeps its share of the
 * row in registers, so the caller must guarantee
 * dimsize <= BLOCK_SIZE_x * numPerThreadx.
 *
 * @param input     source buffer
 * @param output    destination buffer, indexed exactly like `input`
 * @param othersize number of rows; rows at or beyond this index are skipped
 * @param dimsize   number of elements along the softmax axis
 * @param stride    element distance between consecutive entries of a row
 *
 * Changes relative to the previous version:
 *  - The reduced max/sum are taken straight from WarpAllReduce (every lane
 *    already holds them). The old code had lane 0 write them to shared memory
 *    and the other lanes read them back without any barrier, which is a race.
 *  - The shuffles are no longer nested inside `if (otherIdx < othersize)`.
 *    They use the full lane mask, so every lane of the warp has to execute
 *    them; with BLOCK_SIZE_x == 16 a valid row can share a warp with an
 *    out-of-range row. Out-of-range rows now run with an element count of 0.
 *  - Offsets are computed in size_t / ptrdiff_t instead of int.
 *
 * BLOCK_SIZE_y is kept for interface compatibility; it is no longer needed
 * inside the kernel.
 */
template <typename T, unsigned int BLOCK_SIZE_x, unsigned int BLOCK_SIZE_y, int numPerThreadx>
__device__ void warpSoftmaxKernel(T const *input, T *output,
                                  size_t othersize, size_t dimsize, ptrdiff_t stride) {
    const size_t otherIdx = static_cast<size_t>(blockIdx.x) * blockDim.y + threadIdx.y;
    const ptrdiff_t row = static_cast<ptrdiff_t>(otherIdx);
    const ptrdiff_t inner = row % stride;
    const ptrdiff_t base = inner + (row - inner) * static_cast<ptrdiff_t>(dimsize);

    // Lanes of an out-of-range row touch no memory but still take part in the
    // warp shuffles below. XOR distances are < BLOCK_SIZE_x, so they never
    // exchange data with a neighbouring (valid) row.
    const size_t count = (otherIdx < othersize) ? dimsize : 0;

    float dataPerThreadx[numPerThreadx];

    // Pass 1: load this lane's elements and find the row maximum.
    float max_data = -__FLT_MAX__;
    for (int ph = 0; threadIdx.x + ph * BLOCK_SIZE_x < count; ph++) {
        const ptrdiff_t col = static_cast<ptrdiff_t>(threadIdx.x + ph * BLOCK_SIZE_x);
        dataPerThreadx[ph] = static_cast<float>(input[base + col * stride]);
        max_data = max(max_data, dataPerThreadx[ph]);
    }
    max_data = WarpAllReduce<MaxOp, float, BLOCK_SIZE_x>(max_data);

    // Pass 2: exponentiate relative to the maximum and accumulate the sum.
    float sum_data = 0.0f;
    for (int ph = 0; threadIdx.x + ph * BLOCK_SIZE_x < count; ph++) {
        dataPerThreadx[ph] = __expf(dataPerThreadx[ph] - max_data);
        sum_data += dataPerThreadx[ph];
    }
    sum_data = WarpAllReduce<SumOp, float, BLOCK_SIZE_x>(sum_data);

    // Pass 3: normalise and store.
    const float inv = __fdividef(1.0F, sum_data);
    for (int ph = 0; threadIdx.x + ph * BLOCK_SIZE_x < count; ph++) {
        const ptrdiff_t col = static_cast<ptrdiff_t>(threadIdx.x + ph * BLOCK_SIZE_x);
        output[base + col * stride] = static_cast<T>(dataPerThreadx[ph] * inv);
    }
}
#endif // __SOFTMAX_KERNEL_CUH__
#ifndef __SOFTMAX_INFO_H__
#define __SOFTMAX_INFO_H__
#include "../../../utils.h"
#include "../../tensor.h"
#include <vector>
namespace op::softmax {
/**
 * Validated, backend-independent description of a softmax call.
 *
 * Fields:
 *  - dtype     element type shared by input and output
 *  - othersize product of all extents except the softmax axis (number of rows)
 *  - dimsize   extent of the softmax axis
 *  - stride    product of the extents after the softmax axis
 *
 * NOTE(review): only dtype and shape are inspected; tensor strides are never
 * read, so this appears to assume densely packed tensors — confirm with the
 * callers.
 */
class SoftmaxInfo {
    SoftmaxInfo() = default;

public:
    infiniDtype_t dtype;
    size_t othersize;
    size_t dimsize;
    ptrdiff_t stride;

    /**
     * Builds a SoftmaxInfo from the output/input descriptors.
     *
     * @param y_desc output tensor descriptor
     * @param x_desc input tensor descriptor (must match y_desc in dtype and shape)
     * @param axis   softmax axis; negative values count from the last dimension
     * @return the populated info, or an error status:
     *         INFINI_STATUS_BAD_TENSOR_DTYPE when the dtypes differ (CHECK_DTYPE
     *         additionally restricts them to F16/BF16/F32),
     *         INFINI_STATUS_BAD_PARAM when axis is outside [-ndim, ndim),
     *         or whatever CHECK_SAME_SHAPE reports for mismatching shapes.
     */
    static utils::Result<SoftmaxInfo> create(infiniopTensorDescriptor_t y_desc, infiniopTensorDescriptor_t x_desc, int axis) {
        auto dtype = y_desc->dtype();
        if (dtype != x_desc->dtype()) {
            return INFINI_STATUS_BAD_TENSOR_DTYPE;
        }
        CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32);

        auto shape = y_desc->shape();
        CHECK_SAME_SHAPE(shape, x_desc->shape());

        const int ndim = static_cast<int>(y_desc->ndim());
        if (axis < 0) {
            axis += ndim;
        }
        // Without this check an out-of-range axis indexed past the end of
        // `shape` below.
        if (axis < 0 || axis >= ndim) {
            return INFINI_STATUS_BAD_PARAM;
        }

        size_t othersize = 1;
        for (int i = 0; i < ndim; i++) {
            if (i != axis) {
                othersize *= shape[i];
            }
        }

        size_t dimsize = shape[axis];

        ptrdiff_t stride = 1;
        for (int i = ndim - 1; i > axis; i--) {
            stride *= (ptrdiff_t)shape[i];
        }

        return utils::Result<SoftmaxInfo>(SoftmaxInfo{
            dtype,
            othersize,
            dimsize,
            stride});
    }
};
} // namespace op::softmax
#endif // __SOFTMAX_INFO_H__
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "softmax_nvidia.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include <cub/block/block_reduce.cuh>
#include "../../../reduce/cuda/reduce.cuh"
#include "../cuda/kernel.cuh"
// Kernel entry point for the one-block-per-row softmax: forwards to
// blockSoftmaxKernel (note the argument order: the device function takes the
// input pointer first, this wrapper takes the output pointer first).
// launchKernel in this file starts it with othersize blocks of BLOCK_SIZE
// threads.
template <typename Tdata, unsigned int BLOCK_SIZE>
INFINIOP_CUDA_KERNEL blockSoftmax(
Tdata *y, const Tdata *x,
size_t dimsize,
ptrdiff_t stride) {
blockSoftmaxKernel<Tdata, BLOCK_SIZE>(x, y, dimsize, stride);
}
// Kernel entry point for the lane-group-per-row softmax: forwards to
// warpSoftmaxKernel (input/output pointers are swapped into the order the
// device function expects). launchKernel in this file starts it with a
// (BLOCK_SIZE_x, BLOCK_SIZE_y) thread block and ceil(othersize / BLOCK_SIZE_y)
// blocks.
template <typename Tdata, unsigned int BLOCK_SIZE_x, unsigned int BLOCK_SIZE_y, int numPerThreadx>
INFINIOP_CUDA_KERNEL warpSoftmax(
Tdata *y, const Tdata *x,
size_t othersize,
size_t dimsize,
ptrdiff_t stride) {
warpSoftmaxKernel<Tdata, BLOCK_SIZE_x, BLOCK_SIZE_y, numPerThreadx>(x, y, othersize, dimsize, stride);
}
namespace op::softmax::nvidia {
// Backend-private state of the softmax descriptor: a shared reference to the
// device handle's internal object, which calculate() queries for
// maxThreadsPerBlock().
struct Descriptor::Opaque {
std::shared_ptr<device::nvidia::Handle::Internal> internal;
};
// Releases the Opaque object that create() allocated with `new`.
Descriptor::~Descriptor() {
delete _opaque;
}
// Validates the tensor descriptors through SoftmaxInfo::create and, on
// success, stores a newly allocated Descriptor in *desc_ptr. The descriptor
// is created with a workspace size of 0 and takes its device type and id from
// `handle`. Any error reported by SoftmaxInfo::create is propagated by
// CHECK_RESULT.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    int axis) {
    auto softmax_info = SoftmaxInfo::create(y_desc, x_desc, axis);
    CHECK_RESULT(softmax_info);

    auto *nvidia_handle = reinterpret_cast<device::nvidia::Handle *>(handle);
    auto *opaque = new Opaque{nvidia_handle->internal()};

    *desc_ptr = new Descriptor(
        opaque,
        softmax_info.take(),
        0,
        handle->device,
        handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
// Launches warpSoftmax with a (BLOCK_SIZE_x, BLOCK_SIZE_y) thread block and
// enough blocks to give every one of the `othersize` rows its own
// threadIdx.y slot.
template <typename Tdata, unsigned int BLOCK_SIZE_x, unsigned int BLOCK_SIZE_y, int numPerThreadx>
static void launchWarpSoftmax(Tdata *y, const Tdata *x,
                              size_t othersize, size_t dimsize, ptrdiff_t stride,
                              cudaStream_t stream) {
    const unsigned int rows = static_cast<unsigned int>(othersize);
    const unsigned int num_block_x = (rows + BLOCK_SIZE_y - 1) / BLOCK_SIZE_y;
    const dim3 block_dim(BLOCK_SIZE_x, BLOCK_SIZE_y, 1);
    const dim3 grid_dim(num_block_x, 1, 1);
    warpSoftmax<Tdata, BLOCK_SIZE_x, BLOCK_SIZE_y, numPerThreadx>
        <<<grid_dim, block_dim, 0, stream>>>(y, x, othersize, dimsize, stride);
}

// Picks the kernel variant for one element type based on the row length.
template <typename Tdata, unsigned int BLOCK_SIZE>
static void launchSoftmaxTyped(void *y, const void *x,
                               size_t othersize, size_t dimsize, ptrdiff_t stride,
                               cudaStream_t stream) {
    // Nothing to compute; a zero-sized grid would be an invalid launch.
    if (othersize == 0 || dimsize == 0) {
        return;
    }

    Tdata *out = static_cast<Tdata *>(y);
    const Tdata *in = static_cast<const Tdata *>(x);

    if (dimsize > 1024) {
        // Long rows: one thread block per row.
        blockSoftmax<Tdata, BLOCK_SIZE>
            <<<static_cast<unsigned int>(othersize), BLOCK_SIZE, 0, stream>>>(out, in, dimsize, stride);
    } else if (dimsize > 31) {
        // 32 lanes per row, up to 32 elements per lane (covers dimsize <= 1024).
        // A 32x32 block needs 1024 threads, so shrink the row count per block
        // when the device limit (BLOCK_SIZE) is smaller than that.
        constexpr unsigned int ROWS_PER_BLOCK = (BLOCK_SIZE >= 1024u) ? 32u : BLOCK_SIZE / 32u;
        launchWarpSoftmax<Tdata, 32, ROWS_PER_BLOCK, 32>(out, in, othersize, dimsize, stride, stream);
    } else {
        // Short rows: 16 lanes per row, up to 2 elements per lane.
        launchWarpSoftmax<Tdata, 16, 32, 2>(out, in, othersize, dimsize, stride, stream);
    }
}

/**
 * Dispatches the softmax kernels on the element type.
 *
 * @tparam BLOCK_SIZE threads per block used by the one-block-per-row kernel
 *                    (the device's maximum block size, chosen by calculate()).
 * @param y, x       output / input device buffers
 * @param dtype      element type; F16, BF16 and F32 are supported. BF16 is
 *                   accepted by SoftmaxInfo::create and was previously
 *                   rejected here.
 * @param othersize  number of rows
 * @param dimsize    row length (extent of the softmax axis)
 * @param stride     element distance between consecutive entries of a row
 * @param stream     stream the kernel is enqueued on
 * @return INFINI_STATUS_SUCCESS after enqueuing the kernel (or when there is
 *         nothing to compute), INFINI_STATUS_BAD_TENSOR_DTYPE otherwise.
 */
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype,
                            size_t othersize, size_t dimsize, ptrdiff_t stride,
                            cudaStream_t stream) {
    switch (dtype) {
    case INFINI_DTYPE_F16:
        launchSoftmaxTyped<half, BLOCK_SIZE>(y, x, othersize, dimsize, stride, stream);
        return INFINI_STATUS_SUCCESS;
    case INFINI_DTYPE_BF16:
        launchSoftmaxTyped<cuda_bfloat16, BLOCK_SIZE>(y, x, othersize, dimsize, stride, stream);
        return INFINI_STATUS_SUCCESS;
    case INFINI_DTYPE_F32:
        launchSoftmaxTyped<float, BLOCK_SIZE>(y, x, othersize, dimsize, stride, stream);
        return INFINI_STATUS_SUCCESS;
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
// Runs softmax on `x` into `y` on the given stream.
// The launchKernel instantiation is selected from the device's maximum block
// size (1024, 512 or 4096); any other value yields
// INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED. `workspace` and
// `workspace_size` are not used.
infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
                                     void *y,
                                     const void *x,
                                     void *stream_) const {
    cudaStream_t stream = (cudaStream_t)stream_;
    const auto block_limit = _opaque->internal->maxThreadsPerBlock();

    switch (block_limit) {
    case CUDA_BLOCK_SIZE_1024:
        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_1024>(
            y, x, _info.dtype, _info.othersize, _info.dimsize, _info.stride, stream));
        break;
    case CUDA_BLOCK_SIZE_512:
        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_512>(
            y, x, _info.dtype, _info.othersize, _info.dimsize, _info.stride, stream));
        break;
    case CUDA_BLOCK_SIZE_4096:
        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(
            y, x, _info.dtype, _info.othersize, _info.dimsize, _info.stride, stream));
        break;
    default:
        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
    }
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::softmax::nvidia
#ifndef __SOFTMAX_NVIDIA_H__
#define __SOFTMAX_NVIDIA_H__
#include "../softmax.h"
DESCRIPTOR(nvidia)
#endif
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/softmax.h"
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
#include "nvidia/softmax_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/softmax_metax.h"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/softmax_ascend.h"
#endif
#ifdef ENABLE_CAMBRICON_API
#include "bang/softmax_bang.h"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/softmax_kunlun.h"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/softmax_moore.h"
#endif
/**
 * Creates a softmax descriptor on the backend selected by handle->device.
 *
 * @param handle   library handle; its device field picks the backend
 * @param desc_ptr receives the new descriptor
 * @param y_desc   output tensor descriptor
 * @param x_desc   input tensor descriptor
 * @param axis     softmax axis
 * @return the backend's create() status, or
 *         INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED when no compiled-in backend
 *         matches the device.
 */
__C infiniStatus_t infiniopCreateSoftmaxDescriptor(
    infiniopHandle_t handle,
    infiniopSoftmaxDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    int axis) {

// One switch case per backend; the caller supplies the trailing semicolon.
#define CREATE(CASE, NAMESPACE)                                                \
    case CASE:                                                                 \
        return op::softmax::NAMESPACE::Descriptor::create(                     \
            handle,                                                            \
            reinterpret_cast<op::softmax::NAMESPACE::Descriptor **>(desc_ptr), \
            y_desc,                                                            \
            x_desc, axis)

    switch (handle->device) {
#ifdef ENABLE_NVIDIA_API
        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
        CREATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_CAMBRICON_API
        CREATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_METAX_API
        CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_ASCEND_API
        CREATE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_KUNLUN_API
        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_MOORE_API
        CREATE(INFINI_DEVICE_MOORE, moore);
#endif
    }

// Keep the helper macro local to this function.
#undef CREATE

    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
/**
 * Reports the workspace size required by a softmax descriptor.
 *
 * @param desc descriptor returned by infiniopCreateSoftmaxDescriptor
 * @param size receives the descriptor's workspaceSize()
 * @return INFINI_STATUS_SUCCESS, or INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
 *         when no compiled-in backend matches desc->device_type.
 */
__C infiniStatus_t infiniopGetSoftmaxWorkspaceSize(infiniopSoftmaxDescriptor_t desc, size_t *size) {

// One switch case per backend; the caller supplies the trailing semicolon.
#define GET(CASE, NAMESPACE)                                                                   \
    case CASE:                                                                                 \
        *size = reinterpret_cast<op::softmax::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {
#ifdef ENABLE_NVIDIA_API
        GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        GET(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
        GET(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_METAX_API
        GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_ASCEND_API
        GET(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_CAMBRICON_API
        GET(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_KUNLUN_API
        GET(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_MOORE_API
        GET(INFINI_DEVICE_MOORE, moore);
#endif
    }

// Keep the helper macro local to this function.
#undef GET

    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
/**
 * Executes softmax through the backend recorded in the descriptor.
 *
 * @param desc           descriptor returned by infiniopCreateSoftmaxDescriptor
 * @param workspace      scratch buffer forwarded to the backend
 * @param workspace_size size of `workspace` in bytes
 * @param y              output buffer
 * @param x              input buffer
 * @param stream         backend stream/queue handle forwarded unchanged
 * @return the backend's calculate() status, or
 *         INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED when no compiled-in backend
 *         matches desc->device_type.
 */
__C infiniStatus_t infiniopSoftmax(
    infiniopSoftmaxDescriptor_t desc,
    void *workspace, size_t workspace_size,
    void *y,
    const void *x,
    void *stream) {

// One switch case per backend; the caller supplies the trailing semicolon.
#define CALCULATE(CASE, NAMESPACE)                                                      \
    case CASE:                                                                          \
        return reinterpret_cast<op::softmax::NAMESPACE::Descriptor *>(desc)->calculate( \
            workspace, workspace_size, y, x, stream)

    switch (desc->device_type) {
#ifdef ENABLE_NVIDIA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
        CALCULATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_CAMBRICON_API
        CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_METAX_API
        CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_ASCEND_API
        CALCULATE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_KUNLUN_API
        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_MOORE_API
        CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
    }

// Keep the helper macro local to this function.
#undef CALCULATE

    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
/**
 * Destroys a softmax descriptor using the backend type recorded in it.
 *
 * @param desc descriptor returned by infiniopCreateSoftmaxDescriptor
 * @return INFINI_STATUS_SUCCESS after deleting the descriptor, or
 *         INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED when no compiled-in backend
 *         matches desc->device_type (the descriptor is not freed in that case).
 */
__C infiniStatus_t infiniopDestroySoftmaxDescriptor(infiniopSoftmaxDescriptor_t desc) {

// One switch case per backend; the caller supplies the trailing semicolon.
#define DESTROY(CASE, NAMESPACE)                                             \
    case CASE:                                                               \
        delete reinterpret_cast<op::softmax::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {
#ifdef ENABLE_NVIDIA_API
        DESTROY(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        DESTROY(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        DESTROY(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
        DESTROY(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_CAMBRICON_API
        DESTROY(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_METAX_API
        DESTROY(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_ASCEND_API
        DESTROY(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_KUNLUN_API
        DESTROY(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_MOORE_API
        DESTROY(INFINI_DEVICE_MOORE, moore);
#endif
    }

// Keep the helper macro local to this function.
#undef DESTROY

    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#ifndef SOFTMAX_H
#define SOFTMAX_H
#include "../../operator.h"
#include "info.h"
// DESCRIPTOR(NAMESPACE) declares op::softmax::NAMESPACE::Descriptor, the
// per-backend softmax descriptor class. Each backend supplies the definitions
// of Opaque, the destructor, create() and calculate().
//   - _opaque:         backend-private state (type defined by the backend)
//   - _info:           SoftmaxInfo stored at construction
//   - _workspace_size: value returned by workspaceSize()
// The constructor is private; instances are obtained through the static
// create() function, which writes the new descriptor to *desc_ptr.
#define DESCRIPTOR(NAMESPACE) \
\
namespace op::softmax::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
SoftmaxInfo _info; \
size_t _workspace_size; \
\
Descriptor( \
Opaque *opaque, \
SoftmaxInfo info, \
size_t workspace_size, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_info(info), \
_workspace_size(workspace_size) {} \
\
public: \
~Descriptor(); \
\
size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t y_desc, \
infiniopTensorDescriptor_t x_desc, \
int axis); \
\
infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \
void *y, \
const void *x, \
void *stream) const; \
}; \
}
#endif // SOFTMAX_H
......@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/softplus_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
#include "nvidia/softplus_nvidia.cuh"
#endif
......@@ -34,6 +34,9 @@ __C infiniStatus_t infiniopCreateSoftplusDescriptor(
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -59,6 +62,9 @@ __C infiniStatus_t infiniopGetSoftplusWorkspaceSize(infiniopSoftplusDescriptor_t
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
GET(INFINI_DEVICE_QY, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -92,6 +98,9 @@ __C infiniStatus_t infiniopSoftplus(
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -119,6 +128,9 @@ infiniopDestroySoftplusDescriptor(infiniopSoftplusDescriptor_t desc) {
#ifdef ENABLE_ILUVATAR_API
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
DELETE(INFINI_DEVICE_QY, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......
......@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/sub_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
#include "nvidia/sub_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
......@@ -42,6 +42,9 @@ __C infiniStatus_t infiniopCreateSubDescriptor(
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, metax);
#endif
......@@ -73,6 +76,9 @@ __C infiniStatus_t infiniopGetSubWorkspaceSize(infiniopSubDescriptor_t desc, siz
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
GET(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, metax);
#endif
......@@ -113,6 +119,9 @@ __C infiniStatus_t infiniopSub(
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
......@@ -146,6 +155,9 @@ infiniopDestroySubDescriptor(infiniopSubDescriptor_t desc) {
#ifdef ENABLE_ILUVATAR_API
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
DELETE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, metax);
#endif
......
......@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/swiglu_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
#include "nvidia/swiglu_nvidia.cuh"
#endif
#ifdef ENABLE_KUNLUN_API
......@@ -51,6 +51,9 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
CREATE(INFINI_DEVICE_HYGON, nvidia);
#endif
......@@ -94,6 +97,9 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
GET(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
GET(INFINI_DEVICE_HYGON, nvidia);
#endif
......@@ -144,6 +150,9 @@ __C infiniStatus_t infiniopSwiGLU(
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
CALCULATE(INFINI_DEVICE_HYGON, nvidia);
#endif
......@@ -189,6 +198,9 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
#ifdef ENABLE_ILUVATAR_API
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
DELETE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
DELETE(INFINI_DEVICE_HYGON, nvidia);
#endif
......
#include "tanh_cpu.h"
namespace op::tanh::cpu {
// The CPU tanh descriptor needs no custom teardown; the compiler-generated
// destructor is used.
Descriptor::~Descriptor() = default;
// Builds the CPU tanh descriptor.
// The output dtype must be one of F16/F32/F64/BF16 and the output shape must
// equal the shape of the first (only) input; the actual construction is done
// by CREATE_ELEMENTWISE_CPU_DESCRIPTOR.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {

    auto cpu_handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    auto out_dtype = out_desc->dtype();

    // tanh is unary: only the first input descriptor is examined here.
    const auto &x_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &x_shape = x_desc->shape();

    CHECK_DTYPE(out_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
    CHECK_SAME_SHAPE(y_shape, x_shape);

    // Expands to the creation of the CPU elementwise descriptor.
    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(cpu_handle, out_dtype, out_desc, input_desc_vec);

    return INFINI_STATUS_SUCCESS;
}
// Applies TanhOp element-wise, instantiating the elementwise implementation
// for the descriptor's dtype. Unsupported dtypes yield
// INFINI_STATUS_BAD_TENSOR_DTYPE. `workspace` and `workspace_size` are not
// used by the CPU path.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {

    if (_dtype == INFINI_DTYPE_F16) {
        return _device_info->calculate<TanhOp, fp16_t>(_info, output, inputs, stream);
    }
    if (_dtype == INFINI_DTYPE_F32) {
        return _device_info->calculate<TanhOp, float>(_info, output, inputs, stream);
    }
    if (_dtype == INFINI_DTYPE_F64) {
        return _device_info->calculate<TanhOp, double>(_info, output, inputs, stream);
    }
    if (_dtype == INFINI_DTYPE_BF16) {
        return _device_info->calculate<TanhOp, bf16_t>(_info, output, inputs, stream);
    }
    return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} // namespace op::tanh::cpu
#ifndef __TANH_CPU_H__
#define __TANH_CPU_H__
#include "../../../elementwise/cpu/elementwise_cpu.h"
#include <cmath>
ELEMENTWISE_DESCRIPTOR(tanh, cpu)
namespace op::tanh::cpu {
// Unary element-wise functor: returns std::tanh(input) converted to T.
// num_inputs tells the elementwise framework that one operand is consumed.
struct TanhOp {
    static constexpr size_t num_inputs = 1;

    template <typename T>
    T operator()(const T &input) const {
        const auto activated = std::tanh(input);
        return activated;
    }
};
} // namespace op::tanh::cpu
#endif // __TANH_CPU_H__
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment