Commit 362f0187 authored by wooway777

issue/1032 - provide an alternate cuda swiglu

parent 718b18cf
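The kernels in this commit compute SwiGLU elementwise: c = a * (b * sigmoid(b)), i.e. the up projection a scaled by the SiLU of the gate b. A minimal host-side sketch of the same math on contiguous buffers (illustrative only; the helper name swiglu_reference is not part of the commit):

#include <cmath>
#include <cstddef>

// c[i] = a[i] * b[i] * sigmoid(b[i]); the CUDA kernels below apply the same
// formula per element, but with strided indexing over (batch, seq_len, hidden_dim).
inline void swiglu_reference(float *c, const float *a, const float *b, std::size_t n) {
    for (std::size_t i = 0; i < n; ++i) {
        float gate = b[i];
        float sig = 1.0f / (1.0f + std::exp(-gate));
        c[i] = a[i] * gate * sig;
    }
}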
#ifndef __SWIGLU_CUDA_KERNEL_CUH__
#define __SWIGLU_CUDA_KERNEL_CUH__
// Elementwise logistic sigmoid, specialized with CUDA intrinsics for half, half2,
// bfloat16, and bfloat162; the trailing branch handles remaining types (e.g. double).
template <typename T>
__device__ __forceinline__ T sigmoid(const T &x) {
if constexpr (std::is_same_v<T, half2>) {
return h2rcp(__hadd2(make_half2(1, 1), h2exp(__hneg2(x))));
} else if constexpr (std::is_same_v<T, half>) {
return hrcp(__hadd(half(1.f), __float2half(__expf(__half2float(__hneg(x))))));
} else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
float x0 = __bfloat162float(__low2bfloat16(x));
float x1 = __bfloat162float(__high2bfloat16(x));
float sig0 = __frcp_rn(__fadd_rn(1.0f, __expf(-x0)));
float sig1 = __frcp_rn(__fadd_rn(1.0f, __expf(-x1)));
return __floats2bfloat162_rn(sig0, sig1);
} else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
float xf = __bfloat162float(x);
return __float2bfloat16_rn(__frcp_rn(__fadd_rn(1.0f, __expf(-xf))));
} else if constexpr (std::is_same_v<T, float>) {
return __frcp_rn(__fadd_rn(1, __expf(-x)));
} else {
return 1 / (1 + std::exp(-x));
}
}
template <typename T, unsigned int BLOCK_SIZE>
__device__ void SwiGLUCudaKernel(
T *c,
const T *a,
const T *b,
int length,
size_t batch, size_t seq_len, size_t hidden_dim,
ptrdiff_t c_strides_0, ptrdiff_t c_strides_1,
ptrdiff_t a_strides_0, ptrdiff_t a_strides_1,
ptrdiff_t b_strides_0, ptrdiff_t b_strides_1) {
int ind_c = 0;
int ind_a = 0;
int ind_b = 0;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < length) {
// Decompose the flat element index into (hidden, seq, batch) coordinates and
// accumulate the per-tensor strided offsets; the innermost dimension is assumed contiguous.
ind_c += tid % (int)hidden_dim;
ind_a += tid % (int)hidden_dim;
ind_b += tid % (int)hidden_dim;
tid = tid / (int)hidden_dim;
ind_c += (tid % (int)seq_len) * (int)c_strides_1;
ind_a += (tid % (int)seq_len) * (int)a_strides_1;
ind_b += (tid % (int)seq_len) * (int)b_strides_1;
tid = tid / (int)seq_len;
ind_c += (tid % (int)batch) * (int)c_strides_0;
ind_a += (tid % (int)batch) * (int)a_strides_0;
ind_b += (tid % (int)batch) * (int)b_strides_0;
// SwiGLU: c = up * gate * sigmoid(gate), with b as the gate and a as the up projection.
T gate = b[ind_b];
T up = a[ind_a];
if constexpr (std::is_same_v<T, half2>) {
c[ind_c] = __hmul2(__hmul2(gate, sigmoid(gate)), up);
} else if constexpr (std::is_same_v<T, half>) {
c[ind_c] = __hmul(__hmul(gate, sigmoid(gate)), up);
} else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
cuda_bfloat162 sig = sigmoid(gate);
float gate0 = __bfloat162float(__low2bfloat16(gate));
float gate1 = __bfloat162float(__high2bfloat16(gate));
float sig0 = __bfloat162float(__low2bfloat16(sig));
float sig1 = __bfloat162float(__high2bfloat16(sig));
float up0 = __bfloat162float(__low2bfloat16(up));
float up1 = __bfloat162float(__high2bfloat16(up));
float res0 = __fmul_rn(__fmul_rn(gate0, sig0), up0);
float res1 = __fmul_rn(__fmul_rn(gate1, sig1), up1);
c[ind_c] = __floats2bfloat162_rn(res0, res1);
} else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
cuda_bfloat16 sig = sigmoid(gate);
float gatef = __bfloat162float(gate);
float sigf = __bfloat162float(sig);
float upf = __bfloat162float(up);
c[ind_c] = __float2bfloat16_rn(__fmul_rn(__fmul_rn(gatef, sigf), upf));
} else if constexpr (std::is_same_v<T, float>) {
c[ind_c] = __fmul_rn(__fmul_rn(gate, sigmoid(gate)), up);
} else {
c[ind_c] = gate * sigmoid(gate) * up;
}
}
}
#endif // __SWIGLU_CUDA_KERNEL_CUH__
#ifndef __SWIGLU_CUDA_INFO_H__
#define __SWIGLU_CUDA_INFO_H__
#include "../../../utils.h"
#include "../../operator.h"
#include "../../tensor.h"
namespace op::swiglu_cuda {
class SwiGLUCudaInfo {
SwiGLUCudaInfo() = default;
public:
infiniDtype_t dtype;
size_t length;
size_t batch, seq_len, hidden_dim;
ptrdiff_t c_strides_0, c_strides_1;
ptrdiff_t a_strides_0, a_strides_1;
ptrdiff_t b_strides_0, b_strides_1;
static utils::Result<SwiGLUCudaInfo> createSwiGLUCudaInfo(infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t b_desc) {
auto dtype = c_desc->dtype();
if (dtype != a_desc->dtype() || dtype != b_desc->dtype()) {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
auto shape = c_desc->shape();
CHECK_SAME_SHAPE(shape, a_desc->shape(), b_desc->shape());
auto ndim = c_desc->ndim();
size_t hidden_dim = shape[ndim - 1];
size_t seq_len = shape[ndim - 2];
size_t batch = (ndim == 3 ? shape[0] : 1);
size_t length = batch * seq_len * hidden_dim;
ptrdiff_t c_strides_0 = (ndim == 3 ? c_desc->strides()[0] : 0);
ptrdiff_t c_strides_1 = (ndim == 3 ? c_desc->strides()[1] : c_desc->strides()[0]);
ptrdiff_t a_strides_0 = (ndim == 3 ? a_desc->strides()[0] : 0);
ptrdiff_t a_strides_1 = (ndim == 3 ? a_desc->strides()[1] : a_desc->strides()[0]);
ptrdiff_t b_strides_0 = (ndim == 3 ? b_desc->strides()[0] : 0);
ptrdiff_t b_strides_1 = (ndim == 3 ? b_desc->strides()[1] : b_desc->strides()[0]);
return utils::Result<SwiGLUCudaInfo>(SwiGLUCudaInfo{
dtype,
length,
batch,
seq_len,
hidden_dim,
c_strides_0,
c_strides_1,
a_strides_0,
a_strides_1,
b_strides_0,
b_strides_1});
}
};
} // namespace op::swiglu_cuda
#endif // __SWIGLU_CUDA_INFO_H__
#ifndef __SWIGLU_CUDA_METAX_H__
#define __SWIGLU_CUDA_METAX_H__
#include "../swiglu_cuda.h"
DESCRIPTOR(metax)
#endif // __SWIGLU_CUDA_METAX_H__
#include "../../../devices/metax/metax_common.h"
#include "../../../devices/metax/metax_kernel_common.h"
#include "../cuda/kernel_cuda.cuh"
#include "swiglu_metax_cuda.h"
template <typename T, unsigned int BLOCK_SIZE>
INFINIOP_METAX_KERNEL SwiGLUCuda(
T *c,
const T *a,
const T *b,
int length,
size_t batch, size_t seq_len, size_t hidden_dim,
ptrdiff_t c_strides_0, ptrdiff_t c_strides_1,
ptrdiff_t a_strides_0, ptrdiff_t a_strides_1,
ptrdiff_t b_strides_0, ptrdiff_t b_strides_1) {
SwiGLUCudaKernel<T, BLOCK_SIZE>(c, a, b, length, batch, seq_len, hidden_dim,
c_strides_0, c_strides_1,
a_strides_0, a_strides_1,
b_strides_0, b_strides_1);
}
namespace op::swiglu_cuda::metax {
struct Descriptor::Opaque {
std::shared_ptr<device::metax::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) {
auto info = SwiGLUCudaInfo::createSwiGLUCudaInfo(c_desc, a_desc, b_desc);
CHECK_RESULT(info);
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
info.take(), 0, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <unsigned int BLOCK_SIZE, typename T>
infiniStatus_t calculate_swiglu_cuda(
const SwiGLUCudaInfo &info,
T *c,
const T *a,
const T *b,
hcStream_t stream,
void *workspace) {
int length = (int)info.length;
int batch = (int)info.batch;
int seq_len = (int)info.seq_len;
int hidden_dim = (int)info.hidden_dim;
int c_strides_0 = (int)info.c_strides_0;
int c_strides_1 = (int)info.c_strides_1;
int a_strides_0 = (int)info.a_strides_0;
int a_strides_1 = (int)info.a_strides_1;
int b_strides_0 = (int)info.b_strides_0;
int b_strides_1 = (int)info.b_strides_1;
int num_blocks = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;
SwiGLUCuda<T, BLOCK_SIZE>
<<<num_blocks, BLOCK_SIZE, 0, stream>>>(c, a, b, length, batch, seq_len, hidden_dim,
c_strides_0, c_strides_1,
a_strides_0, a_strides_1,
b_strides_0, b_strides_1);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *c,
const void *a,
const void *b,
void *stream_) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
hcStream_t stream = (hcStream_t)stream_;
#define CALCULATE_SWIGLU_CUDA(BLOCK_SIZE, TDATA) \
calculate_swiglu_cuda<BLOCK_SIZE, TDATA>(_info, (TDATA *)c, (const TDATA *)a, (const TDATA *)b, stream, workspace)
#define CALCULATE_SWIGLU_CUDA_WITH_BLOCK_SIZE(BLOCK_SIZE) \
{ \
if (_info.dtype == INFINI_DTYPE_F16) \
return CALCULATE_SWIGLU_CUDA(BLOCK_SIZE, half); \
else if (_info.dtype == INFINI_DTYPE_F32) \
return CALCULATE_SWIGLU_CUDA(BLOCK_SIZE, float); \
else if (_info.dtype == INFINI_DTYPE_BF16) \
return CALCULATE_SWIGLU_CUDA(BLOCK_SIZE, __hpcc_bfloat16); \
else \
return INFINI_STATUS_BAD_TENSOR_DTYPE; \
}
if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) {
CALCULATE_SWIGLU_CUDA_WITH_BLOCK_SIZE(METAX_BLOCK_SIZE_1024)
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::swiglu_cuda::metax
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "../cuda/kernel_cuda.cuh"
#include "swiglu_nvidia_cuda.cuh"
template <typename T, unsigned int BLOCK_SIZE>
INFINIOP_CUDA_KERNEL SwiGLUCuda(
T *c,
const T *a,
const T *b,
int length,
size_t batch, size_t seq_len, size_t hidden_dim,
ptrdiff_t c_strides_0, ptrdiff_t c_strides_1,
ptrdiff_t a_strides_0, ptrdiff_t a_strides_1,
ptrdiff_t b_strides_0, ptrdiff_t b_strides_1) {
SwiGLUCudaKernel<T, BLOCK_SIZE>(c, a, b, length, batch, seq_len, hidden_dim,
c_strides_0, c_strides_1,
a_strides_0, a_strides_1,
b_strides_0, b_strides_1);
}
namespace op::swiglu_cuda::nvidia {
struct Descriptor::Opaque {
std::shared_ptr<device::nvidia::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) {
auto info = SwiGLUCudaInfo::createSwiGLUCudaInfo(c_desc, a_desc, b_desc);
CHECK_RESULT(info);
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
info.take(), 0, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <unsigned int BLOCK_SIZE, typename T>
infiniStatus_t calculate_swiglu_cuda(
const SwiGLUCudaInfo &info,
T *c,
const T *a,
const T *b,
cudaStream_t stream,
void *workspace) {
int length = (int)info.length;
int batch = (int)info.batch;
int seq_len = (int)info.seq_len;
int hidden_dim = (int)info.hidden_dim;
int c_strides_0 = (int)info.c_strides_0;
int c_strides_1 = (int)info.c_strides_1;
int a_strides_0 = (int)info.a_strides_0;
int a_strides_1 = (int)info.a_strides_1;
int b_strides_0 = (int)info.b_strides_0;
int b_strides_1 = (int)info.b_strides_1;
int num_blocks = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;
SwiGLUCuda<T, BLOCK_SIZE>
<<<num_blocks, BLOCK_SIZE, 0, stream>>>(c, a, b, length, batch, seq_len, hidden_dim,
c_strides_0, c_strides_1,
a_strides_0, a_strides_1,
b_strides_0, b_strides_1);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *c,
const void *a,
const void *b,
void *stream_) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
cudaStream_t stream = (cudaStream_t)stream_;
#define CALCULATE_SWIGLU_CUDA(BLOCK_SIZE, TDATA) \
calculate_swiglu_cuda<BLOCK_SIZE, TDATA>(_info, (TDATA *)c, (const TDATA *)a, (const TDATA *)b, stream, workspace)
#define CALCULATE_SWIGLU_CUDA_WITH_BLOCK_SIZE(BLOCK_SIZE) \
{ \
if (_info.dtype == INFINI_DTYPE_F16) \
return CALCULATE_SWIGLU_CUDA(BLOCK_SIZE, half); \
else if (_info.dtype == INFINI_DTYPE_F32) \
return CALCULATE_SWIGLU_CUDA(BLOCK_SIZE, float); \
else if (_info.dtype == INFINI_DTYPE_BF16) \
return CALCULATE_SWIGLU_CUDA(BLOCK_SIZE, __nv_bfloat16); \
else \
return INFINI_STATUS_BAD_TENSOR_DTYPE; \
}
if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
CALCULATE_SWIGLU_CUDA_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024)
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) {
CALCULATE_SWIGLU_CUDA_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512)
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
CALCULATE_SWIGLU_CUDA_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096)
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::swiglu_cuda::nvidia
#ifndef __SWIGLU_CUDA_NVIDIA_H__
#define __SWIGLU_CUDA_NVIDIA_H__
#include "../swiglu_cuda.h"
DESCRIPTOR(nvidia)
#endif // __SWIGLU_CUDA_NVIDIA_H__
@@ -6,11 +6,7 @@
#include "cpu/swiglu_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_ALI_API)
#if defined(ENABLE_NINETOOTHED)
#include "ninetoothed/swiglu.h"
#else
#include "nvidia/swiglu_nvidia.cuh"
#endif
#include "nvidia/swiglu_nvidia_cuda.cuh"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/swiglu_kunlun.h"
@@ -19,7 +15,7 @@
#if defined(ENABLE_NINETOOTHED)
#include "ninetoothed/swiglu.h"
#else
#include "metax/swiglu_metax.h"
#include "metax/swiglu_metax_cuda.h"
#endif
#endif
#ifdef ENABLE_CAMBRICON_API
@@ -48,33 +44,38 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
{a_desc, \
b_desc})
#define CREATE_CUDA(CASE, NAMESPACE) \
case CASE: \
return op::swiglu_cuda::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::swiglu_cuda::NAMESPACE::Descriptor **>(desc_ptr), \
c_desc, \
a_desc, \
b_desc)
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
CREATE(INFINI_DEVICE_NVIDIA, ninetoothed);
#else
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
CREATE_CUDA(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
#ifdef ENABLE_NINETOOTHED
CREATE(INFINI_DEVICE_ILUVATAR, ninetoothed);
#else
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
CREATE_CUDA(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#endif
#ifdef ENABLE_ALI_API
CREATE(INFINI_DEVICE_ALI, nvidia);
CREATE_CUDA(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
CREATE(INFINI_DEVICE_HYGON, nvidia);
CREATE_CUDA(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
CREATE(INFINI_DEVICE_KUNLUN, kunlun);
@@ -83,7 +84,7 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
#ifdef ENABLE_NINETOOTHED
CREATE(INFINI_DEVICE_METAX, ninetoothed);
#else
CREATE(INFINI_DEVICE_METAX, metax);
CREATE_CUDA(INFINI_DEVICE_METAX, metax);
#endif
#endif
#ifdef ENABLE_CAMBRICON_API
@@ -110,32 +111,34 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
*size = reinterpret_cast<op::swiglu::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS
#define GET_CUDA(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::swiglu_cuda::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
GET(INFINI_DEVICE_NVIDIA, ninetoothed);
#else
GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
GET_CUDA(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
#ifdef ENABLE_NINETOOTHED
GET(INFINI_DEVICE_ILUVATAR, ninetoothed);
#else
GET(INFINI_DEVICE_ILUVATAR, nvidia);
GET_CUDA(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#endif
#ifdef ENABLE_ALI_API
GET(INFINI_DEVICE_ALI, nvidia);
GET_CUDA(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_QY_API
GET(INFINI_DEVICE_QY, nvidia);
GET_CUDA(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
GET(INFINI_DEVICE_HYGON, nvidia);
GET_CUDA(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
GET(INFINI_DEVICE_KUNLUN, kunlun);
@@ -144,7 +147,7 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
#ifdef ENABLE_NINETOOTHED
GET(INFINI_DEVICE_METAX, ninetoothed);
#else
GET(INFINI_DEVICE_METAX, metax);
GET_CUDA(INFINI_DEVICE_METAX, metax);
#endif
#endif
#ifdef ENABLE_CAMBRICON_API
@@ -177,33 +180,34 @@ __C infiniStatus_t infiniopSwiGLU(
return reinterpret_cast<const op::swiglu::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, c, {a, b}, stream)
#define CALCULATE_CUDA(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::swiglu_cuda::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, c, a, b, stream)
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
CALCULATE(INFINI_DEVICE_NVIDIA, ninetoothed);
#else
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
CALCULATE_CUDA(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
#ifdef ENABLE_NINETOOTHED
CALCULATE(INFINI_DEVICE_ILUVATAR, ninetoothed);
#else
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
CALCULATE_CUDA(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#endif
#ifdef ENABLE_ALI_API
CALCULATE(INFINI_DEVICE_ALI, nvidia);
CALCULATE_CUDA(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_QY_API
CALCULATE(INFINI_DEVICE_QY, nvidia);
CALCULATE_CUDA(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
CALCULATE(INFINI_DEVICE_HYGON, nvidia);
CALCULATE_CUDA(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
@@ -212,7 +216,7 @@ __C infiniStatus_t infiniopSwiGLU(
#ifdef ENABLE_NINETOOTHED
CALCULATE(INFINI_DEVICE_METAX, ninetoothed);
#else
CALCULATE(INFINI_DEVICE_METAX, metax);
CALCULATE_CUDA(INFINI_DEVICE_METAX, metax);
#endif
#endif
#ifdef ENABLE_CAMBRICON_API
@@ -240,33 +244,34 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
delete reinterpret_cast<const op::swiglu::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
#define DELETE_CUDA(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<const op::swiglu_cuda::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
DELETE(INFINI_DEVICE_NVIDIA, ninetoothed);
#else
DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
DELETE_CUDA(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
#ifdef ENABLE_NINETOOTHED
DELETE(INFINI_DEVICE_ILUVATAR, ninetoothed);
#else
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
DELETE_CUDA(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#endif
#ifdef ENABLE_ALI_API
DELETE(INFINI_DEVICE_ALI, nvidia);
DELETE_CUDA(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_QY_API
DELETE(INFINI_DEVICE_QY, nvidia);
DELETE_CUDA(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
DELETE(INFINI_DEVICE_HYGON, nvidia);
DELETE_CUDA(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
DELETE(INFINI_DEVICE_KUNLUN, kunlun);
@@ -275,7 +280,7 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
#ifdef ENABLE_NINETOOTHED
DELETE(INFINI_DEVICE_METAX, ninetoothed);
#else
DELETE(INFINI_DEVICE_METAX, metax);
DELETE_CUDA(INFINI_DEVICE_METAX, metax);
#endif
#endif
#ifdef ENABLE_CAMBRICON_API
......
#ifndef SWIGLU_CUDA_H
#define SWIGLU_CUDA_H
#include "../../operator.h"
#include "info.h"
#define DESCRIPTOR(NAMESPACE) \
\
namespace op::swiglu_cuda::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
SwiGLUCudaInfo _info; \
size_t _workspace_size; \
\
Descriptor( \
Opaque *opaque, \
SwiGLUCudaInfo info, \
size_t workspace_size, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_info(info), \
_workspace_size(workspace_size) {} \
\
public: \
~Descriptor(); \
\
size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t c_desc, \
infiniopTensorDescriptor_t a_desc, \
infiniopTensorDescriptor_t b_desc); \
\
infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \
void *c, \
const void *a, \
const void *b, \
void *stream) const; \
}; \
}
#endif // SWIGLU_CUDA_H
@@ -34,6 +34,8 @@ _TEST_CASES_DATA = [
# Large tensors
((16, 5632), None, None, None),
((4, 4, 5632), None, None, None),
((1, 512, 16384), (16777216, 32768, 1), (16777216, 32768, 1), None),
((1, 16384, 16384), (536870912, 32768, 1), (536870912, 32768, 1), None),
]
# Tolerance configuration
......