Commit 362f0187 authored by wooway777

issue/1032 - provide an alternate cuda swiglu

parent 718b18cf
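The kernels in this commit compute SwiGLU elementwise: c = a * (b * sigmoid(b)), i.e. the up projection a scaled by the SiLU of the gate b. A minimal host-side sketch of the same math on contiguous buffers (illustrative only; the helper name swiglu_reference is not part of the commit):

#include <cmath>
#include <cstddef>

// c[i] = a[i] * b[i] * sigmoid(b[i]); the CUDA kernels below apply the same
// formula per element, but with strided indexing over (batch, seq_len, hidden_dim).
inline void swiglu_reference(float *c, const float *a, const float *b, std::size_t n) {
    for (std::size_t i = 0; i < n; ++i) {
        float gate = b[i];
        float sig = 1.0f / (1.0f + std::exp(-gate));
        c[i] = a[i] * gate * sig;
    }
}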
#ifndef __SWIGLU_CUDA_KERNEL_CUH__
#define __SWIGLU_CUDA_KERNEL_CUH__
// Elementwise logistic sigmoid, specialized with CUDA intrinsics for half, half2,
// bfloat16, and bfloat162; the trailing branch handles remaining types (e.g. double).
template <typename T>
__device__ __forceinline__ T sigmoid(const T &x) {
if constexpr (std::is_same_v<T, half2>) {
return h2rcp(__hadd2(make_half2(1, 1), h2exp(__hneg2(x))));
} else if constexpr (std::is_same_v<T, half>) {
return hrcp(__hadd(half(1.f), __float2half(__expf(__half2float(__hneg(x))))));
} else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
float x0 = __bfloat162float(__low2bfloat16(x));
float x1 = __bfloat162float(__high2bfloat16(x));
float sig0 = __frcp_rn(__fadd_rn(1.0f, __expf(-x0)));
float sig1 = __frcp_rn(__fadd_rn(1.0f, __expf(-x1)));
return __floats2bfloat162_rn(sig0, sig1);
} else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
float xf = __bfloat162float(x);
return __float2bfloat16_rn(__frcp_rn(__fadd_rn(1.0f, __expf(-xf))));
} else if constexpr (std::is_same_v<T, float>) {
return __frcp_rn(__fadd_rn(1, __expf(-x)));
} else {
return 1 / (1 + std::exp(-x));
}
}
template <typename T, unsigned int BLOCK_SIZE>
__device__ void SwiGLUCudaKernel(
T *c,
const T *a,
const T *b,
int length,
size_t batch, size_t seq_len, size_t hidden_dim,
ptrdiff_t c_strides_0, ptrdiff_t c_strides_1,
ptrdiff_t a_strides_0, ptrdiff_t a_strides_1,
ptrdiff_t b_strides_0, ptrdiff_t b_strides_1) {
int ind_c = 0;
int ind_a = 0;
int ind_b = 0;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < length) {
// Decompose the flat element index into (hidden, seq, batch) coordinates and
// accumulate the per-tensor strided offsets; the innermost dimension is assumed contiguous.
ind_c += tid % (int)hidden_dim;
ind_a += tid % (int)hidden_dim;
ind_b += tid % (int)hidden_dim;
tid = tid / (int)hidden_dim;
ind_c += (tid % (int)seq_len) * (int)c_strides_1;
ind_a += (tid % (int)seq_len) * (int)a_strides_1;
ind_b += (tid % (int)seq_len) * (int)b_strides_1;
tid = tid / (int)seq_len;
ind_c += (tid % (int)batch) * (int)c_strides_0;
ind_a += (tid % (int)batch) * (int)a_strides_0;
ind_b += (tid % (int)batch) * (int)b_strides_0;
// SwiGLU: c = up * gate * sigmoid(gate), with b as the gate and a as the up projection.
T gate = b[ind_b];
T up = a[ind_a];
if constexpr (std::is_same_v<T, half2>) {
c[ind_c] = __hmul2(__hmul2(gate, sigmoid(gate)), up);
} else if constexpr (std::is_same_v<T, half>) {
c[ind_c] = __hmul(__hmul(gate, sigmoid(gate)), up);
} else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
cuda_bfloat162 sig = sigmoid(gate);
float gate0 = __bfloat162float(__low2bfloat16(gate));
float gate1 = __bfloat162float(__high2bfloat16(gate));
float sig0 = __bfloat162float(__low2bfloat16(sig));
float sig1 = __bfloat162float(__high2bfloat16(sig));
float up0 = __bfloat162float(__low2bfloat16(up));
float up1 = __bfloat162float(__high2bfloat16(up));
float res0 = __fmul_rn(__fmul_rn(gate0, sig0), up0);
float res1 = __fmul_rn(__fmul_rn(gate1, sig1), up1);
c[ind_c] = __floats2bfloat162_rn(res0, res1);
} else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
cuda_bfloat16 sig = sigmoid(gate);
float gatef = __bfloat162float(gate);
float sigf = __bfloat162float(sig);
float upf = __bfloat162float(up);
c[ind_c] = __float2bfloat16_rn(__fmul_rn(__fmul_rn(gatef, sigf), upf));
} else if constexpr (std::is_same_v<T, float>) {
c[ind_c] = __fmul_rn(__fmul_rn(gate, sigmoid(gate)), up);
} else {
c[ind_c] = gate * sigmoid(gate) * up;
}
}
}
#endif // __SWIGLU_CUDA_KERNEL_CUH__
#ifndef __SWIGLU_CUDA_INFO_H__
#define __SWIGLU_CUDA_INFO_H__
#include "../../../utils.h"
#include "../../operator.h"
#include "../../tensor.h"
namespace op::swiglu_cuda {
class SwiGLUCudaInfo {
SwiGLUCudaInfo() = default;
public:
infiniDtype_t dtype;
size_t length;
size_t batch, seq_len, hidden_dim;
ptrdiff_t c_strides_0, c_strides_1;
ptrdiff_t a_strides_0, a_strides_1;
ptrdiff_t b_strides_0, b_strides_1;
static utils::Result<SwiGLUCudaInfo> createSwiGLUCudaInfo(infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t b_desc) {
auto dtype = c_desc->dtype();
if (dtype != a_desc->dtype() || dtype != b_desc->dtype()) {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
auto shape = c_desc->shape();
CHECK_SAME_SHAPE(shape, a_desc->shape(), b_desc->shape());
auto ndim = c_desc->ndim();
size_t hidden_dim = shape[ndim - 1];
size_t seq_len = shape[ndim - 2];
size_t batch = (ndim == 3 ? shape[0] : 1);
size_t length = batch * seq_len * hidden_dim;
ptrdiff_t c_strides_0 = (ndim == 3 ? c_desc->strides()[0] : 0);
ptrdiff_t c_strides_1 = (ndim == 3 ? c_desc->strides()[1] : c_desc->strides()[0]);
ptrdiff_t a_strides_0 = (ndim == 3 ? a_desc->strides()[0] : 0);
ptrdiff_t a_strides_1 = (ndim == 3 ? a_desc->strides()[1] : a_desc->strides()[0]);
ptrdiff_t b_strides_0 = (ndim == 3 ? b_desc->strides()[0] : 0);
ptrdiff_t b_strides_1 = (ndim == 3 ? b_desc->strides()[1] : b_desc->strides()[0]);
return utils::Result<SwiGLUCudaInfo>(SwiGLUCudaInfo{
dtype,
length,
batch,
seq_len,
hidden_dim,
c_strides_0,
c_strides_1,
a_strides_0,
a_strides_1,
b_strides_0,
b_strides_1});
}
};
} // namespace op::swiglu_cuda
#endif // __SWIGLU_CUDA_INFO_H__
#ifndef __SWIGLU_CUDA_METAX_H__
#define __SWIGLU_CUDA_METAX_H__
#include "../swiglu_cuda.h"
DESCRIPTOR(metax)
#endif // __SWIGLU_CUDA_METAX_H__
#include "../../../devices/metax/metax_common.h"
#include "../../../devices/metax/metax_kernel_common.h"
#include "../cuda/kernel_cuda.cuh"
#include "swiglu_metax_cuda.h"
template <typename T, unsigned int BLOCK_SIZE>
INFINIOP_METAX_KERNEL SwiGLUCuda(
T *c,
const T *a,
const T *b,
int length,
size_t batch, size_t seq_len, size_t hidden_dim,
ptrdiff_t c_strides_0, ptrdiff_t c_strides_1,
ptrdiff_t a_strides_0, ptrdiff_t a_strides_1,
ptrdiff_t b_strides_0, ptrdiff_t b_strides_1) {
SwiGLUCudaKernel<T, BLOCK_SIZE>(c, a, b, length, batch, seq_len, hidden_dim,
c_strides_0, c_strides_1,
a_strides_0, a_strides_1,
b_strides_0, b_strides_1);
}
namespace op::swiglu_cuda::metax {
struct Descriptor::Opaque {
std::shared_ptr<device::metax::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) {
auto info = SwiGLUCudaInfo::createSwiGLUCudaInfo(c_desc, a_desc, b_desc);
CHECK_RESULT(info);
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
info.take(), 0, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <unsigned int BLOCK_SIZE, typename T>
infiniStatus_t calculate_swiglu_cuda(
const SwiGLUCudaInfo &info,
T *c,
const T *a,
const T *b,
hcStream_t stream,
void *workspace) {
int length = (int)info.length;
int batch = (int)info.batch;
int seq_len = (int)info.seq_len;
int hidden_dim = (int)info.hidden_dim;
int c_strides_0 = (int)info.c_strides_0;
int c_strides_1 = (int)info.c_strides_1;
int a_strides_0 = (int)info.a_strides_0;
int a_strides_1 = (int)info.a_strides_1;
int b_strides_0 = (int)info.b_strides_0;
int b_strides_1 = (int)info.b_strides_1;
int num_blocks = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;
SwiGLUCuda<T, BLOCK_SIZE>
<<<num_blocks, BLOCK_SIZE, 0, stream>>>(c, a, b, length, batch, seq_len, hidden_dim,
c_strides_0, c_strides_1,
a_strides_0, a_strides_1,
b_strides_0, b_strides_1);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *c,
const void *a,
const void *b,
void *stream_) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
hcStream_t stream = (hcStream_t)stream_;
#define CALCULATE_SWIGLU_CUDA(BLOCK_SIZE, TDATA) \
calculate_swiglu_cuda<BLOCK_SIZE, TDATA>(_info, (TDATA *)c, (const TDATA *)a, (const TDATA *)b, stream, workspace)
#define CALCULATE_SWIGLU_CUDA_WITH_BLOCK_SIZE(BLOCK_SIZE) \
{ \
if (_info.dtype == INFINI_DTYPE_F16) \
return CALCULATE_SWIGLU_CUDA(BLOCK_SIZE, half); \
else if (_info.dtype == INFINI_DTYPE_F32) \
return CALCULATE_SWIGLU_CUDA(BLOCK_SIZE, float); \
else if (_info.dtype == INFINI_DTYPE_BF16) \
return CALCULATE_SWIGLU_CUDA(BLOCK_SIZE, __hpcc_bfloat16); \
else \
return INFINI_STATUS_BAD_TENSOR_DTYPE; \
}
if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) {
CALCULATE_SWIGLU_CUDA_WITH_BLOCK_SIZE(METAX_BLOCK_SIZE_1024)
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::swiglu_cuda::metax
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "../cuda/kernel_cuda.cuh"
#include "swiglu_nvidia_cuda.cuh"
template <typename T, unsigned int BLOCK_SIZE>
INFINIOP_CUDA_KERNEL SwiGLUCuda(
T *c,
const T *a,
const T *b,
int length,
size_t batch, size_t seq_len, size_t hidden_dim,
ptrdiff_t c_strides_0, ptrdiff_t c_strides_1,
ptrdiff_t a_strides_0, ptrdiff_t a_strides_1,
ptrdiff_t b_strides_0, ptrdiff_t b_strides_1) {
SwiGLUCudaKernel<T, BLOCK_SIZE>(c, a, b, length, batch, seq_len, hidden_dim,
c_strides_0, c_strides_1,
a_strides_0, a_strides_1,
b_strides_0, b_strides_1);
}
namespace op::swiglu_cuda::nvidia {
struct Descriptor::Opaque {
std::shared_ptr<device::nvidia::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) {
auto info = SwiGLUCudaInfo::createSwiGLUCudaInfo(c_desc, a_desc, b_desc);
CHECK_RESULT(info);
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
info.take(), 0, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <unsigned int BLOCK_SIZE, typename T>
infiniStatus_t calculate_swiglu_cuda(
const SwiGLUCudaInfo &info,
T *c,
const T *a,
const T *b,
cudaStream_t stream,
void *workspace) {
int length = (int)info.length;
int batch = (int)info.batch;
int seq_len = (int)info.seq_len;
int hidden_dim = (int)info.hidden_dim;
int c_strides_0 = (int)info.c_strides_0;
int c_strides_1 = (int)info.c_strides_1;
int a_strides_0 = (int)info.a_strides_0;
int a_strides_1 = (int)info.a_strides_1;
int b_strides_0 = (int)info.b_strides_0;
int b_strides_1 = (int)info.b_strides_1;
int num_blocks = (length + BLOCK_SIZE - 1) / BLOCK_SIZE;
SwiGLUCuda<T, BLOCK_SIZE>
<<<num_blocks, BLOCK_SIZE, 0, stream>>>(c, a, b, length, batch, seq_len, hidden_dim,
c_strides_0, c_strides_1,
a_strides_0, a_strides_1,
b_strides_0, b_strides_1);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *c,
const void *a,
const void *b,
void *stream_) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
cudaStream_t stream = (cudaStream_t)stream_;
#define CALCULATE_SWIGLU_CUDA(BLOCK_SIZE, TDATA) \
calculate_swiglu_cuda<BLOCK_SIZE, TDATA>(_info, (TDATA *)c, (const TDATA *)a, (const TDATA *)b, stream, workspace)
#define CALCULATE_SWIGLU_CUDA_WITH_BLOCK_SIZE(BLOCK_SIZE) \
{ \
if (_info.dtype == INFINI_DTYPE_F16) \
return CALCULATE_SWIGLU_CUDA(BLOCK_SIZE, half); \
else if (_info.dtype == INFINI_DTYPE_F32) \
return CALCULATE_SWIGLU_CUDA(BLOCK_SIZE, float); \
else if (_info.dtype == INFINI_DTYPE_BF16) \
return CALCULATE_SWIGLU_CUDA(BLOCK_SIZE, __nv_bfloat16); \
else \
return INFINI_STATUS_BAD_TENSOR_DTYPE; \
}
if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
CALCULATE_SWIGLU_CUDA_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024)
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) {
CALCULATE_SWIGLU_CUDA_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512)
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
CALCULATE_SWIGLU_CUDA_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096)
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::swiglu_cuda::nvidia
#ifndef __SWIGLU_CUDA_NVIDIA_H__
#define __SWIGLU_CUDA_NVIDIA_H__
#include "../swiglu_cuda.h"
DESCRIPTOR(nvidia)
#endif // __SWIGLU_CUDA_NVIDIA_H__
@@ -6,11 +6,7 @@
#include "cpu/swiglu_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_ALI_API)
#if defined(ENABLE_NINETOOTHED)
#include "ninetoothed/swiglu.h"
#else
#include "nvidia/swiglu_nvidia.cuh"
#endif
#include "nvidia/swiglu_nvidia_cuda.cuh"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/swiglu_kunlun.h"
@@ -19,7 +15,7 @@
#if defined(ENABLE_NINETOOTHED)
#include "ninetoothed/swiglu.h"
#else
#include "metax/swiglu_metax.h"
#include "metax/swiglu_metax_cuda.h"
#endif
#endif
#ifdef ENABLE_CAMBRICON_API
@@ -48,33 +44,38 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
{a_desc, \
b_desc})
#define CREATE_CUDA(CASE, NAMESPACE) \
case CASE: \
return op::swiglu_cuda::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::swiglu_cuda::NAMESPACE::Descriptor **>(desc_ptr), \
c_desc, \
a_desc, \
b_desc)
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
CREATE(INFINI_DEVICE_NVIDIA, ninetoothed);
#else
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
CREATE_CUDA(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
#ifdef ENABLE_NINETOOTHED
CREATE(INFINI_DEVICE_ILUVATAR, ninetoothed);
#else
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
CREATE_CUDA(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#endif
#ifdef ENABLE_ALI_API
CREATE(INFINI_DEVICE_ALI, nvidia);
CREATE_CUDA(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
CREATE(INFINI_DEVICE_HYGON, nvidia);
CREATE_CUDA(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
CREATE(INFINI_DEVICE_KUNLUN, kunlun);
@@ -83,7 +84,7 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
#ifdef ENABLE_NINETOOTHED
CREATE(INFINI_DEVICE_METAX, ninetoothed);
#else
CREATE(INFINI_DEVICE_METAX, metax);
CREATE_CUDA(INFINI_DEVICE_METAX, metax);
#endif
#endif
#ifdef ENABLE_CAMBRICON_API
@@ -110,32 +111,34 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
*size = reinterpret_cast<op::swiglu::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS
#define GET_CUDA(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::swiglu_cuda::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
GET(INFINI_DEVICE_NVIDIA, ninetoothed);
#else
GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
GET_CUDA(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
#ifdef ENABLE_NINETOOTHED
GET(INFINI_DEVICE_ILUVATAR, ninetoothed);
#else
GET(INFINI_DEVICE_ILUVATAR, nvidia);
GET_CUDA(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#endif
#ifdef ENABLE_ALI_API
GET(INFINI_DEVICE_ALI, nvidia);
GET_CUDA(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_QY_API
GET(INFINI_DEVICE_QY, nvidia);
GET_CUDA(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
GET(INFINI_DEVICE_HYGON, nvidia);
GET_CUDA(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
GET(INFINI_DEVICE_KUNLUN, kunlun);
@@ -144,7 +147,7 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
#ifdef ENABLE_NINETOOTHED
GET(INFINI_DEVICE_METAX, ninetoothed);
#else
GET(INFINI_DEVICE_METAX, metax);
GET_CUDA(INFINI_DEVICE_METAX, metax);
#endif
#endif
#ifdef ENABLE_CAMBRICON_API
@@ -177,33 +180,34 @@ __C infiniStatus_t infiniopSwiGLU(
return reinterpret_cast<const op::swiglu::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, c, {a, b}, stream)
#define CALCULATE_CUDA(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::swiglu_cuda::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, c, a, b, stream)
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
CALCULATE(INFINI_DEVICE_NVIDIA, ninetoothed);
#else
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
CALCULATE_CUDA(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
#ifdef ENABLE_NINETOOTHED
CALCULATE(INFINI_DEVICE_ILUVATAR, ninetoothed);
#else
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
CALCULATE_CUDA(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#endif
#ifdef ENABLE_ALI_API
CALCULATE(INFINI_DEVICE_ALI, nvidia);
CALCULATE_CUDA(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_QY_API
CALCULATE(INFINI_DEVICE_QY, nvidia);
CALCULATE_CUDA(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
CALCULATE(INFINI_DEVICE_HYGON, nvidia);
CALCULATE_CUDA(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
@@ -212,7 +216,7 @@ __C infiniStatus_t infiniopSwiGLU(
#ifdef ENABLE_NINETOOTHED
CALCULATE(INFINI_DEVICE_METAX, ninetoothed);
#else
CALCULATE(INFINI_DEVICE_METAX, metax);
CALCULATE_CUDA(INFINI_DEVICE_METAX, metax);
#endif
#endif
#ifdef ENABLE_CAMBRICON_API
@@ -240,33 +244,34 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
delete reinterpret_cast<const op::swiglu::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
#define DELETE_CUDA(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<const op::swiglu_cuda::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
DELETE(INFINI_DEVICE_NVIDIA, ninetoothed);
#else
DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
DELETE_CUDA(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
#ifdef ENABLE_NINETOOTHED
DELETE(INFINI_DEVICE_ILUVATAR, ninetoothed);
#else
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
DELETE_CUDA(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#endif
#ifdef ENABLE_ALI_API
DELETE(INFINI_DEVICE_ALI, nvidia);
DELETE_CUDA(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_QY_API
DELETE(INFINI_DEVICE_QY, nvidia);
DELETE_CUDA(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
DELETE(INFINI_DEVICE_HYGON, nvidia);
DELETE_CUDA(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
DELETE(INFINI_DEVICE_KUNLUN, kunlun);
@@ -275,7 +280,7 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
#ifdef ENABLE_NINETOOTHED
DELETE(INFINI_DEVICE_METAX, ninetoothed);
#else
DELETE(INFINI_DEVICE_METAX, metax);
DELETE_CUDA(INFINI_DEVICE_METAX, metax);
#endif
#endif
#ifdef ENABLE_CAMBRICON_API
......
#ifndef SWIGLU_CUDA_H
#define SWIGLU_CUDA_H
#include "../../operator.h"
#include "info.h"
#define DESCRIPTOR(NAMESPACE) \
\
namespace op::swiglu_cuda::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
SwiGLUCudaInfo _info; \
size_t _workspace_size; \
\
Descriptor( \
Opaque *opaque, \
SwiGLUCudaInfo info, \
size_t workspace_size, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_info(info), \
_workspace_size(workspace_size) {} \
\
public: \
~Descriptor(); \
\
size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t c_desc, \
infiniopTensorDescriptor_t a_desc, \
infiniopTensorDescriptor_t b_desc); \
\
infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \
void *c, \
const void *a, \
const void *b, \
void *stream) const; \
}; \
}
#endif // SWIGLU_CUDA_H
@@ -34,6 +34,8 @@ _TEST_CASES_DATA = [
# Large tensors
((16, 5632), None, None, None),
((4, 4, 5632), None, None, None),
((1, 512, 16384), (16777216, 32768, 1), (16777216, 32768, 1), None),
((1, 16384, 16384), (536870912, 32768, 1), (536870912, 32768, 1), None),
]
# Tolerance configuration
......