Unverified Commit 65df17f7 authored by PanZezhong1725's avatar PanZezhong1725 Committed by GitHub
Browse files

Merge pull request #47 from YdrMaster/swiglu-cpu

issue/46: swiglu 算子 - CPU
parents 9daba5b8 ee68b55d
......@@ -32,7 +32,7 @@ jobs:
xmake-version: latest
- name: configure xmake
run: xmake f -cv
run: xmake f --omp=y -cv
- name: build with xmake
run: xmake build
......
#ifndef __INFINIOP_BINARY_H__
#define __INFINIOP_BINARY_H__
#include "../operator.h"
#include "../tensor.h"

#include <cstddef>
#include <numeric>
#include <vector>
/**
 * BINARY_DESCRIPTOR(OP, NAMESPACE) declares the operator descriptor class
 * for the binary elementwise operator `OP` on backend `NAMESPACE`, e.g.
 * op::swiglu::cpu::Descriptor.
 *
 * The design of this class is based on the DESCRIPTOR macro designed by
 * YdrMaster in matmul.h.
 *
 * The generated class stores the operand dtype, the precomputed
 * op::binary::BinaryInfo layout metadata and an opaque backend-specific
 * state pointer.  `create` builds a descriptor from the c/a/b tensor
 * descriptors; `calculate` computes c = OP(a, b) on the given stream.
 */
#define BINARY_DESCRIPTOR(OP, NAMESPACE)                          \
                                                                  \
    namespace op::OP::NAMESPACE {                                 \
    class Descriptor final : public InfiniopDescriptor {          \
        struct Opaque;                                            \
        Opaque *_opaque;                                          \
        infiniDtype_t _dtype;                                     \
        op::binary::BinaryInfo _info;                             \
                                                                  \
        Descriptor(                                               \
            infiniDtype_t dtype,                                  \
            op::binary::BinaryInfo info,                          \
            Opaque *opaque,                                       \
            infiniDevice_t device_type,                           \
            int device_id)                                        \
            : InfiniopDescriptor{device_type, device_id},         \
              _opaque(opaque),                                    \
              _dtype(dtype),                                      \
              _info(info) {}                                      \
                                                                  \
    public:                                                       \
        ~Descriptor();                                            \
                                                                  \
        static infiniStatus_t create(                             \
            infiniopHandle_t handle,                              \
            Descriptor **desc_ptr,                                \
            infiniopTensorDescriptor_t c_desc,                    \
            infiniopTensorDescriptor_t a_desc,                    \
            infiniopTensorDescriptor_t b_desc);                   \
                                                                  \
        infiniStatus_t calculate(                                 \
            void *c,                                              \
            const void *a,                                        \
            const void *b,                                        \
            void *stream) const;                                  \
    };                                                            \
    }
namespace op::binary {
// Layout metadata shared by the binary elementwise kernels on CPU.
// Filled in by createBinaryInfo() below.
struct BinaryInfo {
    size_t c_data_size; // total number of output elements
    size_t ndim;        // rank of the output tensor
    bool contiguous;    // true when c, a and b are all contiguous
    bool broadcasted;   // true when a or b needs broadcast index mapping
    std::vector<size_t> c_shape; // shapes of output and the two inputs
    std::vector<size_t> a_shape;
    std::vector<size_t> b_shape;
    std::vector<ptrdiff_t> c_strides; // strides of output and the two inputs
    std::vector<ptrdiff_t> a_strides;
    std::vector<ptrdiff_t> b_strides;
};
// Validates the c/a/b tensor descriptors and fills `info` with the layout
// metadata needed by the elementwise kernels.
//
// Returns INFINI_STATUS_BAD_PARAM when any descriptor is null,
// INFINI_STATUS_BAD_TENSOR_STRIDES when the destination has a broadcast
// (zero-stride) dimension, INFINI_STATUS_SUCCESS otherwise.
inline infiniStatus_t createBinaryInfo(BinaryInfo &info,
                                       infiniopTensorDescriptor_t c_desc,
                                       infiniopTensorDescriptor_t a_desc,
                                       infiniopTensorDescriptor_t b_desc) {
    if (!c_desc || !a_desc || !b_desc) {
        return INFINI_STATUS_BAD_PARAM;
    }
    // Destination cannot have broadcast setup: a zero-stride output dim
    // would make many loop iterations write the same element.  Validate
    // before touching `info` so callers never see a half-filled struct.
    if (c_desc->hasBroadcastDim()) {
        return INFINI_STATUS_BAD_TENSOR_STRIDES;
    }
    info.c_data_size = c_desc->numel();
    info.ndim = c_desc->ndim();
    info.contiguous = c_desc->isContiguous() && a_desc->isContiguous() && b_desc->isContiguous();
    // Inputs count as broadcasted when the layouts are not all contiguous
    // and either the ranks differ or an input has a zero-stride dimension.
    const bool ndim_match = (info.ndim == a_desc->ndim()) && (info.ndim == b_desc->ndim());
    info.broadcasted = !info.contiguous && (!ndim_match || a_desc->hasBroadcastDim() || b_desc->hasBroadcastDim());
    // Fix: the previous std::move() on each accessor result was redundant
    // (moving a returned temporary) at best, and would silently gut the
    // descriptor's internals if shape()/strides() ever returned references.
    info.c_shape = c_desc->shape();
    info.a_shape = a_desc->shape();
    info.b_shape = b_desc->shape();
    info.c_strides = c_desc->strides();
    info.a_strides = a_desc->strides();
    info.b_strides = b_desc->strides();
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::binary
#endif // __INFINIOP_BINARY_H__
#ifndef __INFINIOP_BINARY_CPU_H__
#define __INFINIOP_BINARY_CPU_H__
#include "../../devices/cpu/common_cpu.h"
#include "../binary.h"
#include <utility>
namespace op::common_cpu {
namespace binary_op {
// Perform binary computation when inputs and the output can have different dtypes
template <typename Tc, typename Ta, typename Tb, typename BinaryOp, typename... Args>
void calculate(op::binary::BinaryInfo info, void *c, const void *a, const void *b, Args &&...args) {
auto a_ = reinterpret_cast<const Ta *>(a);
auto b_ = reinterpret_cast<const Tb *>(b);
auto c_ = reinterpret_cast<Tc *>(c);
ptrdiff_t data_size = info.c_data_size;
#pragma omp parallel for
for (ptrdiff_t i = 0; i < data_size; ++i) {
size_t a_index = info.contiguous ? i : (info.broadcasted ? op::common_cpu::indexToReducedOffset(i, info.ndim, info.c_strides.data(), info.a_strides.data()) : op::common_cpu::indexToOffset(i, info.ndim, info.a_shape.data(), info.a_strides.data()));
size_t b_index = info.contiguous ? i : (info.broadcasted ? op::common_cpu::indexToReducedOffset(i, info.ndim, info.c_strides.data(), info.b_strides.data()) : op::common_cpu::indexToOffset(i, info.ndim, info.b_shape.data(), info.b_strides.data()));
size_t c_index = info.contiguous ? i : (op::common_cpu::indexToOffset(i, info.ndim, info.c_shape.data(), info.c_strides.data()));
c_[c_index] = BinaryOp{}(a_[a_index], b_[b_index], std::forward<Args>(args)...);
}
}
// Perform binary computation when all inputs and the output share the same dtype
template <typename Tdata, typename BinaryOp, typename... Args>
void calculate(op::binary::BinaryInfo info, void *c, const void *a, const void *b, Args &&...args) {
auto a_ = reinterpret_cast<const Tdata *>(a);
auto b_ = reinterpret_cast<const Tdata *>(b);
auto c_ = reinterpret_cast<Tdata *>(c);
ptrdiff_t data_size = info.c_data_size;
#pragma omp parallel for
for (ptrdiff_t i = 0; i < data_size; ++i) {
size_t a_index = info.contiguous ? i : (info.broadcasted ? op::common_cpu::indexToReducedOffset(i, info.ndim, info.c_strides.data(), info.a_strides.data()) : op::common_cpu::indexToOffset(i, info.ndim, info.a_shape.data(), info.a_strides.data()));
size_t b_index = info.contiguous ? i : (info.broadcasted ? op::common_cpu::indexToReducedOffset(i, info.ndim, info.c_strides.data(), info.b_strides.data()) : op::common_cpu::indexToOffset(i, info.ndim, info.b_shape.data(), info.b_strides.data()));
size_t c_index = info.contiguous ? i : (op::common_cpu::indexToOffset(i, info.ndim, info.c_shape.data(), info.c_strides.data()));
if constexpr (std::is_same_v<Tdata, fp16_t>) {
float a_val = utils::cast<float>(a_[a_index]);
float b_val = utils::cast<float>(b_[b_index]);
c_[c_index] = utils::cast<fp16_t>(BinaryOp{}(a_val, b_val, std::forward<Args>(args)...));
} else {
c_[c_index] = BinaryOp{}(a_[a_index], b_[b_index], std::forward<Args>(args)...);
}
}
}
} // namespace binary_op
} // namespace op::common_cpu
#endif // __INFINIOP_BINARY_CPU_H__
#include "common_cpu.h"
namespace op::common_cpu {
size_t indexToReducedOffset(
size_t flat_index,
size_t ndim,
......@@ -19,7 +21,7 @@ size_t indexToOffset(
const size_t *shape,
const ptrdiff_t *strides) {
size_t res = 0;
for (size_t i = ndim; i-- >= 0;) {
for (size_t i = ndim; i-- > 0;) {
res += (flat_index % shape[i]) * strides[i];
flat_index /= shape[i];
}
......@@ -48,3 +50,5 @@ std::vector<size_t> getPaddedShape(
}
return padded_shape;
}
} // namespace op::common_cpu
......@@ -13,6 +13,8 @@
#include <omp.h>
#endif
namespace op::common_cpu {
// Returns the memory offset into the original tensor, given the flat index into the broadcasted tensor.
size_t indexToReducedOffset(size_t flat_index, size_t ndim, const ptrdiff_t *broadcasted_strides, const ptrdiff_t *target_strides);
......@@ -28,4 +30,6 @@ size_t getPaddedSize(size_t ndim, size_t *shape, const size_t *pads);
// calculate the padded shape and store the result in padded_shape
std::vector<size_t> getPaddedShape(size_t ndim, const size_t *shape, const size_t *pads);
} // namespace op::common_cpu
#endif // __INFINIOP__COMMON_CPU_H__
#include "swiglu_cpu.h"
namespace op::swiglu::cpu {
Descriptor::~Descriptor() = default;
// Builds a CPU SwiGLU descriptor after validating dtypes and shapes.
//
// `out`, `up` and `gate` must all have the same shape and the same dtype
// (F16, F32 or F64); the kernels reinterpret every buffer as the output
// dtype, so a mismatched input would be read as garbage.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    infiniopTensorDescriptor_t up_desc,
    infiniopTensorDescriptor_t gate_desc) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &out_shape = out_desc->shape();
    const auto &up_shape = up_desc->shape();
    const auto &gate_shape = gate_desc->shape();

    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
    // Fix: previously only the output dtype was checked; an input tensor of
    // a different dtype would have been reinterpreted as the output dtype.
    if (up_desc->dtype() != dtype || gate_desc->dtype() != dtype) {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    if (!SAME_VEC(out_shape, up_shape, gate_shape)) {
        return INFINI_STATUS_BAD_TENSOR_SHAPE;
    }

    op::binary::BinaryInfo info;
    CHECK_STATUS(op::binary::createBinaryInfo(info, out_desc, up_desc, gate_desc));

    // Create descriptor; CPU needs no opaque backend state.
    *desc_ptr = new Descriptor(
        dtype,
        std::move(info),
        nullptr,
        handle->device,
        handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
// Runs the elementwise SwiGLU kernel, dispatching on the dtype captured at
// descriptor creation time.  `stream` is accepted for interface parity but
// unused on CPU.
infiniStatus_t Descriptor::calculate(
    void *c,
    const void *a,
    const void *b,
    void *stream) const {
    if (_dtype == INFINI_DTYPE_F16) {
        op::common_cpu::binary_op::calculate<fp16_t, SwiGLUOp>(_info, c, a, b);
    } else if (_dtype == INFINI_DTYPE_F32) {
        op::common_cpu::binary_op::calculate<float, SwiGLUOp>(_info, c, a, b);
    } else if (_dtype == INFINI_DTYPE_F64) {
        op::common_cpu::binary_op::calculate<double, SwiGLUOp>(_info, c, a, b);
    } else {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::swiglu::cpu
#ifndef __SWIGLU_CPU_H__
#define __SWIGLU_CPU_H__
#include "../../../binary/cpu/binary_cpu.h"

#include <cmath>
BINARY_DESCRIPTOR(swiglu, cpu)
// Functor computing SwiGLU: out = gate * sigmoid(gate) * up.
// Instantiated with float/double by the CPU dispatch (fp16 buffers are
// promoted to float by the shared binary kernel before this is called).
struct SwiGLUOp {
private:
    // Logistic sigmoid: 1 / (1 + e^{-x}).
    template <typename T>
    T logistic(const T &x) const {
        return 1 / (1 + std::exp(-x));
    }

public:
    template <typename T>
    T operator()(const T &up, const T &gate) const {
        const T silu = gate * logistic(gate);
        return silu * up;
    }
};
#endif // __SWIGLU_CPU_H__
......@@ -2,15 +2,30 @@
#include "../../handle.h"
#include "infiniop/ops/swiglu.h"
#ifdef ENABLE_CPU_API
#include "cpu/swiglu_cpu.h"
#endif
__C infiniStatus_t infiniopCreateSwiGLUDescriptor(
infiniopHandle_t handle, infiniopSwiGLUDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc,
infiniopHandle_t handle,
infiniopSwiGLUDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) {
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::swiglu::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::swiglu::NAMESPACE::Descriptor **>(desc_ptr), \
c_desc, \
a_desc, \
b_desc)
switch (handle->device) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuCreateSwiGLUDescriptor(
handle, (SwiGLUCpuDescriptor_t *)desc_ptr, c_desc, a_desc, b_desc);
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu:
......@@ -43,17 +58,30 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
return musaCreateSwiGLUDescriptor(
handle, (SwiGLUMusaDescriptor_t *)desc_ptr, c_desc, a_desc, b_desc);
#endif
}
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
};
}
#undef CREATE
}
__C infiniStatus_t infiniopSwiGLU(infiniopSwiGLUDescriptor_t desc, void *c,
const void *a, const void *b,
__C infiniStatus_t infiniopSwiGLU(
infiniopSwiGLUDescriptor_t desc,
void *c,
const void *a,
const void *b,
void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::swiglu::NAMESPACE::Descriptor *>(desc) \
->calculate(c, a, b, stream)
switch (desc->device_type) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuSwiGLU((SwiGLUCpuDescriptor_t)desc, c, a, b, stream);
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu:
......@@ -76,16 +104,26 @@ __C infiniStatus_t infiniopSwiGLU(infiniopSwiGLUDescriptor_t desc, void *c,
case DevMthreadsGpu:
return musaSwiGLU((SwiGLUMusaDescriptor_t)desc, c, a, b, stream);
#endif
}
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CALCULATE
}
__C infiniStatus_t
infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
#define DELETE(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<const op::swiglu::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuDestroySwiGLUDescriptor((SwiGLUCpuDescriptor_t)desc);
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu:
......@@ -108,6 +146,10 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
case DevMthreadsGpu:
return musaDestroySwiGLUDescriptor((SwiGLUMusaDescriptor_t)desc);
#endif
}
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef DELETE
}
......@@ -28,6 +28,10 @@ public:
bool isContiguous() const;
size_t numel() const;
// a dim is broadcasted if its corresponding stride is 0 but its size > 1
bool hasBroadcastDim() const;
std::vector<size_t> getBroadcastDim() const;
infiniopTensorDescriptor_t dimMerge(size_t dim_start, size_t dim_end) const;
infiniopTensorDescriptor_t dimSplit(size_t axis, const std::vector<size_t> &dims) const;
infiniopTensorDescriptor_t dimPermute(const std::vector<size_t> &order) const;
......
#include "../utils.h"
#include "tensor.h"
#include <algorithm>
#include <cstring>
#include <functional>
#include <numeric>
......@@ -85,6 +86,24 @@ bool InfiniopTensorDescriptor::isContiguous() const {
return isContiguous(0, ndim() - 1);
}
bool InfiniopTensorDescriptor::hasBroadcastDim() const {
return std::any_of(
_shape.begin(), _shape.end(),
[&, i = 0](const auto &) mutable {
return _shape[i] != 1 && _strides[i++] == 0;
});
}
// Collects the indices of all broadcast dimensions, i.e. every axis whose
// size exceeds 1 while its stride is 0.
std::vector<size_t> InfiniopTensorDescriptor::getBroadcastDim() const {
    std::vector<size_t> broadcast_axes;
    const size_t rank = ndim();
    size_t axis = 0;
    while (axis < rank) {
        const bool is_broadcast = (_strides[axis] == 0) && (_shape[axis] != 1);
        if (is_broadcast) {
            broadcast_axes.push_back(axis);
        }
        ++axis;
    }
    return broadcast_axes;
}
infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimMerge(size_t dim_start, size_t dim_end) const {
if (dim_start > dim_end || dim_end >= ndim()) {
return nullptr;
......
#ifndef INFINIUTILS_CHECK_H
#define INFINIUTILS_CHECK_H
#include <iostream>
#include <tuple>
#define CHECK_API_OR(API, EXPECT, ACTION) \
do { \
......@@ -30,4 +31,13 @@
return INFINI_STATUS_BAD_TENSOR_DTYPE); \
} while (0)
// SAME_VEC(v0, v1, ...): expands to true iff every argument compares equal
// (via operator==) to the first one.  A single argument yields true.
#define SAME_VEC(...)                                        \
    [&] {                                                    \
        const auto &_head =                                  \
            std::get<0>(std::forward_as_tuple(__VA_ARGS__)); \
        return [&_head](const auto &..._elems) {             \
            return ((_elems == _head) && ...);               \
        }(__VA_ARGS__);                                      \
    }()
#endif // INFINIUTILS_CHECK_H
......@@ -25,19 +25,25 @@ _TEST_CASES_ = [
# shape, a_stride, b_stride, c_stride
((13, 4), None, None, None),
((13, 4), (10, 1), (10, 1), (10, 1)),
# ((13, 4, 4), None, None, None),
# ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
((13, 4, 4), None, None, None),
((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
((16, 5632), None, None, None),
((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
# ((4, 4, 5632), None, None, None),
# ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
((4, 4, 5632), None, None, None),
((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
]
class Inplace(Enum):
    """Aliasing mode for a test case: compute out-of-place, or write the
    result in place into operand a or operand b."""
    OUT_OF_PLACE = auto()
    INPLACE_A = auto()
    INPLACE_B = auto()
# Inplace options applied for each test case in _TEST_CASES_
_INPLACE = [
"Inplace.OUT_OF_PLACE",
"Inplace.INPLACE_A",
"Inplace.INPLACE_B",
Inplace.OUT_OF_PLACE,
Inplace.INPLACE_A,
Inplace.INPLACE_B,
]
# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
......@@ -48,7 +54,7 @@ _TEST_CASES = [
]
# Data types used for testing
_TENSOR_DTYPES = [torch.float16]
_TENSOR_DTYPES = [torch.float16, torch.float32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
......@@ -61,12 +67,6 @@ NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class Inplace(Enum):
OUT_OF_PLACE = auto()
INPLACE_A = auto()
INPLACE_B = auto()
class SwiGLUDescriptor(Structure):
    """ctypes view of the C SwiGLU descriptor handle; only the leading
    ``device`` field is declared (the rest is opaque to Python)."""
    _fields_ = [("device", c_int32)]
......@@ -132,7 +132,7 @@ def test(
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [a_tensor, b_tensor, c_tensor]:
tensor.descriptor.contents.invalidate()
tensor.destroyDesc(lib)
def lib_swiglu():
check_error(
......
......@@ -37,3 +37,8 @@ target("infinirt-cpu")
set_languages("cxx17")
add_files("../src/infinirt/cpu/*.cc")
target_end()
if has_config("omp") then
add_requires("openmp")
add_packages("openmp")
end
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment