Commit 10c9525f authored by Zimin Li's avatar Zimin Li
Browse files

issue/46: Add generic binary-operator infrastructure and refactor the SwiGLU CPU implementation to use it

parent 150dde0c
...@@ -19,7 +19,7 @@ size_t indexToOffset( ...@@ -19,7 +19,7 @@ size_t indexToOffset(
const size_t *shape, const size_t *shape,
const ptrdiff_t *strides) { const ptrdiff_t *strides) {
size_t res = 0; size_t res = 0;
for (size_t i = ndim; i-- >= 0;) { for (size_t i = ndim; i-- > 0;) {
res += (flat_index % shape[i]) * strides[i]; res += (flat_index % shape[i]) * strides[i];
flat_index /= shape[i]; flat_index /= shape[i];
} }
......
#include "swiglu_cpu.h"
#include <array> // std::array for the supported-dtype table

namespace op::swiglu::cpu {

Descriptor::~Descriptor() = default;

/// Validate the (out, up, gate) descriptor triple and build a CPU SwiGLU
/// descriptor on success.
/// @param handle_    generic device handle; must wrap a device::cpu::Handle
/// @param desc_ptr   receives the newly allocated Descriptor on success
/// @param out_desc   output tensor descriptor
/// @param up_desc    "up" projection input descriptor
/// @param gate_desc  "gate" input descriptor
/// @return INFINI_STATUS_SUCCESS, or the status produced by the generic
///         binary-operator check (bad dtype / bad shape).
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    infiniopTensorDescriptor_t up_desc,
    infiniopTensorDescriptor_t gate_desc) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);

    constexpr std::array<infiniDtype_t, 3> SUPPORTED_DTYPES = {
        INFINI_DTYPE_F16,
        INFINI_DTYPE_F32,
        INFINI_DTYPE_F64,
    };

    // Perform the generic binary operator check (dtype membership, shape and
    // stride compatibility) shared by all binary elementwise ops.
    CHECK_STATUS(op::common_cpu::binary_op::check(
        out_desc, up_desc, gate_desc, SUPPORTED_DTYPES, true, true));

    // Create the descriptor; shape/stride bookkeeping lives in the shared
    // binary info built from the three tensor descriptors.
    *desc_ptr = new Descriptor(
        out_desc->dtype(),
        {out_desc, up_desc, gate_desc},
        nullptr,
        handle->device,
        handle->device_id);

    return INFINI_STATUS_SUCCESS;
}

/// Compute c = SwiGLU(a, b) element-wise on the CPU.
/// @param c       output buffer
/// @param a       "up" input buffer
/// @param b       "gate" input buffer
/// @param stream  unused on CPU; kept for interface uniformity
infiniStatus_t Descriptor::calculate(
    void *c,
    const void *a,
    const void *b,
    void *stream) const {

    switch (_dtype) {
    case INFINI_DTYPE_F16:
        op::common_cpu::binary_op::calculate<fp16_t, SwiGLUOp>(_info, c, a, b);
        break;
    case INFINI_DTYPE_F32:
        op::common_cpu::binary_op::calculate<float, SwiGLUOp>(_info, c, a, b);
        break;
    case INFINI_DTYPE_F64:
        op::common_cpu::binary_op::calculate<double, SwiGLUOp>(_info, c, a, b);
        break;
    default:
        // create() only accepts the three dtypes above, but guard anyway.
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }

    return INFINI_STATUS_SUCCESS;
}

} // namespace op::swiglu::cpu
#ifndef __INFINIOP_SWIGLU_CPU_H__ #ifndef __SWIGLU_CPU_H__
#define __INFINIOP_SWIGLU_CPU_H__ #define __SWIGLU_CPU_H__
#include "./swiglu_cpu_api.h" #include "../../../binary/cpu/binary.h"
typedef struct SwiGLUCpuDescriptor { BINARY_DESCRIPTOR(swiglu, cpu)
infiniDevice_t device;
infiniDtype_t dtype;
size_t n, d;
ptrdiff_t
s_no, // n stride of out
s_do, // d stride of out
s_ng, // n stride of gate
s_dg, // d stride of gate
s_nu, // n stride of up
s_du; // d stride of up
} SwiGLUCpuDescriptor;
#include <cmath> // std::exp used by sigmoid below

// Element-wise SwiGLU functor plugged into the generic binary calculate():
// out = gate * sigmoid(gate) * up  (i.e. SiLU(gate) * up).
struct SwiGLUOp {
private:
    // Logistic sigmoid: 1 / (1 + e^-x).
    template <typename T>
    T sigmoid(const T &x) const {
        return 1 / (1 + std::exp(-x));
    }

public:
    // Note the argument order: the first operand (a) is `up`, the second (b)
    // is `gate`.
    template <typename T>
    T operator()(const T &up, const T &gate) const {
        return gate * sigmoid(gate) * up;
    }
};
#endif // __SWIGLU_CPU_H__
#ifndef __INFINIOP_SWIGLU_CPU_API_H__
#define __INFINIOP_SWIGLU_CPU_API_H__
// Legacy C-style API for the CPU SwiGLU operator. This commit removes it in
// favour of the op::swiglu::cpu::Descriptor class declared via
// BINARY_DESCRIPTOR in swiglu_cpu.h.
#include "../../../devices/cpu/cpu_handle.h"
#include "infiniop/operator.h"
// Opaque descriptor carrying dtype, shape and strides for a SwiGLU launch.
struct SwiGLUCpuDescriptor;
typedef struct SwiGLUCpuDescriptor *infiniopSwiGLUCpuDescriptor_t;
// Validate the (c, a, b) tensor descriptors and allocate a descriptor.
infiniopStatus_t cpuCreateSwiGLUDescriptor(
    infiniopCpuHandle_t handle,
    infiniopSwiGLUCpuDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t c_desc,
    infiniopTensorDescriptor_t a_desc,
    infiniopTensorDescriptor_t b_desc);
// Compute c = SwiGLU(a, b) where a is `up` and b is `gate`
// (c = b * sigmoid(b) * a), per the implementation in swiglu_cpu.cc.
infiniopStatus_t cpuSwiGLU(
    infiniopSwiGLUCpuDescriptor_t desc,
    void *c, void const *a, void const *b);
// Release a descriptor previously returned by cpuCreateSwiGLUDescriptor.
infiniopStatus_t cpuDestroySwiGLUDescriptor(
    infiniopSwiGLUCpuDescriptor_t desc);
#endif // __INFINIOP_SWIGLU_CPU_API_H__
#include "../../handle.h"
#include "infiniop/ops/swiglu.h"

#ifdef ENABLE_CPU_API
#include "cpu/swiglu_cpu.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/swiglu_cuda.cuh"
#endif
#ifdef ENABLE_CAMBRICON_API
#include "bang/swiglu_bang.h"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/swiglu_ascend.h"
#endif
#ifdef ENABLE_METAX_API
#include "maca/swiglu_maca.h"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/swiglu_kunlun.h"
#endif

/// Dispatch descriptor creation to the backend selected by handle->device.
__C infiniStatus_t infiniopCreateSwiGLUDescriptor(
    infiniopHandle_t handle,
    infiniopSwiGLUDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t c_desc,
    infiniopTensorDescriptor_t a_desc,
    infiniopTensorDescriptor_t b_desc) {

// Expands to one switch case that forwards to the backend's Descriptor::create.
#define CREATE(CASE, NAMESPACE)                                               \
    case CASE:                                                                \
        return op::swiglu::NAMESPACE::Descriptor::create(                     \
            handle,                                                           \
            reinterpret_cast<op::swiglu::NAMESPACE::Descriptor **>(desc_ptr), \
            c_desc,                                                           \
            a_desc,                                                           \
            b_desc)

    switch (handle->device) {

#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
        CREATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
#ifdef ENABLE_CAMBRICON_API
        CREATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_ASCEND_API
        CREATE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_METAX_API
        CREATE(INFINI_DEVICE_METAX, maca);
#endif
#ifdef ENABLE_KUNLUN_API
        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif

    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CREATE
}

/// Dispatch the SwiGLU computation c = swiglu(a, b) to the descriptor's backend.
__C infiniStatus_t infiniopSwiGLU(
    infiniopSwiGLUDescriptor_t desc,
    void *c,
    const void *a,
    const void *b,
    void *stream) {

// Expands to one switch case that forwards to the backend's calculate().
#define CALCULATE(CASE, NAMESPACE)                                               \
    case CASE:                                                                   \
        return reinterpret_cast<const op::swiglu::NAMESPACE::Descriptor *>(desc) \
            ->calculate(c, a, b, stream)

    switch (desc->device_type) {

#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
#ifdef ENABLE_CAMBRICON_API
        CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_ASCEND_API
        CALCULATE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_METAX_API
        CALCULATE(INFINI_DEVICE_METAX, maca);
#endif
#ifdef ENABLE_KUNLUN_API
        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif

    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CALCULATE
}

/// Destroy a descriptor by deleting it through its backend-specific type.
__C infiniStatus_t
infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {

// Expands to one switch case that deletes the backend descriptor.
#define DELETE(CASE, NAMESPACE)                                                   \
    case CASE:                                                                    \
        delete reinterpret_cast<const op::swiglu::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {

#ifdef ENABLE_CPU_API
        DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
        DELETE(INFINI_DEVICE_NVIDIA, cuda);
#endif
#ifdef ENABLE_CAMBRICON_API
        DELETE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_ASCEND_API
        DELETE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_METAX_API
        DELETE(INFINI_DEVICE_METAX, maca);
#endif
#ifdef ENABLE_KUNLUN_API
        DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif

    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef DELETE
}
...@@ -25,19 +25,25 @@ _TEST_CASES_ = [ ...@@ -25,19 +25,25 @@ _TEST_CASES_ = [
# shape, a_stride, b_stride, c_stride # shape, a_stride, b_stride, c_stride
((13, 4), None, None, None), ((13, 4), None, None, None),
((13, 4), (10, 1), (10, 1), (10, 1)), ((13, 4), (10, 1), (10, 1), (10, 1)),
# ((13, 4, 4), None, None, None), ((13, 4, 4), None, None, None),
# ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
((16, 5632), None, None, None), ((16, 5632), None, None, None),
((16, 5632), (13312, 1), (13312, 1), (13312, 1)), ((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
# ((4, 4, 5632), None, None, None), ((4, 4, 5632), None, None, None),
# ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
] ]
class Inplace(Enum):
OUT_OF_PLACE = auto()
INPLACE_A = auto()
INPLACE_B = auto()
# Inplace options applied for each test case in _TEST_CASES_ # Inplace options applied for each test case in _TEST_CASES_
_INPLACE = [ _INPLACE = [
"Inplace.OUT_OF_PLACE", Inplace.OUT_OF_PLACE,
"Inplace.INPLACE_A", Inplace.INPLACE_A,
"Inplace.INPLACE_B", Inplace.INPLACE_B,
] ]
# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ # Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
...@@ -48,7 +54,7 @@ _TEST_CASES = [ ...@@ -48,7 +54,7 @@ _TEST_CASES = [
] ]
# Data types used for testing # Data types used for testing
_TENSOR_DTYPES = [torch.float16] _TENSOR_DTYPES = [torch.float16, torch.float32]
# Tolerance map for different data types # Tolerance map for different data types
_TOLERANCE_MAP = { _TOLERANCE_MAP = {
...@@ -61,12 +67,6 @@ NUM_PRERUN = 10 ...@@ -61,12 +67,6 @@ NUM_PRERUN = 10
NUM_ITERATIONS = 1000 NUM_ITERATIONS = 1000
class Inplace(Enum):
OUT_OF_PLACE = auto()
INPLACE_A = auto()
INPLACE_B = auto()
class SwiGLUDescriptor(Structure):
    # Mirrors the opaque C descriptor handle; only the leading device id
    # field is exposed to Python.
    _fields_ = [("device", c_int32)]
...@@ -132,7 +132,7 @@ def test( ...@@ -132,7 +132,7 @@ def test(
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [a_tensor, b_tensor, c_tensor]: for tensor in [a_tensor, b_tensor, c_tensor]:
tensor.descriptor.contents.invalidate() tensor.destroyDesc(lib)
def lib_swiglu(): def lib_swiglu():
check_error( check_error(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment