Commit c2e87202 authored by Catheriany

Merge remote-tracking branch 'origin/main' into issue/142

parents 41818f84 c203635b
#ifndef __ACLNN_SWIGLU_H__
#define __ACLNN_SWIGLU_H__
#include "../../../../utils.h"
#include "../../../../utils/check.h"
#include "../../../operator.h"
#include "../../../tensor.h"
namespace op::swiglu::ascend {
class SwigluInfo {
SwigluInfo() = default;
public:
infiniDtype_t dtype;
std::vector<size_t> shape;
int32_t ndim;
std::vector<ptrdiff_t> c_strides;
std::vector<ptrdiff_t> a_strides;
std::vector<ptrdiff_t> b_strides;
static utils::Result<SwigluInfo> create(infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t b_desc) {
CHECK_OR_RETURN(c_desc && a_desc && b_desc, INFINI_STATUS_BAD_PARAM);
CHECK_OR_RETURN(!c_desc->hasBroadcastDim(), INFINI_STATUS_BAD_TENSOR_STRIDES);
CHECK_OR_RETURN(c_desc->ndim() == a_desc->ndim()
&& c_desc->ndim() == b_desc->ndim()
&& (c_desc->ndim() == 2 || c_desc->ndim() == 3),
INFINI_STATUS_BAD_TENSOR_SHAPE);
CHECK_SAME_SHAPE(c_desc->shape(), a_desc->shape(), b_desc->shape());
int32_t ndim = c_desc->ndim();
CHECK_OR_RETURN(c_desc->stride(ndim - 1) == 1
&& a_desc->stride(ndim - 1) == 1
&& b_desc->stride(ndim - 1) == 1,
INFINI_STATUS_BAD_TENSOR_STRIDES);
CHECK_OR_RETURN(c_desc->dtype() == a_desc->dtype()
&& c_desc->dtype() == b_desc->dtype(),
INFINI_STATUS_BAD_TENSOR_DTYPE);
return utils::Result<SwigluInfo>(SwigluInfo{
c_desc->dtype(),
c_desc->shape(),
ndim,
c_desc->strides(),
a_desc->strides(),
b_desc->strides(),
});
}
};
class Descriptor final : public InfiniopDescriptor {
SwigluInfo _info;
size_t _workspace_size;
Descriptor(SwigluInfo info, size_t workspace_size, infiniDevice_t device_type, int device_id) : InfiniopDescriptor{device_type, device_id},
_info(info), _workspace_size(workspace_size) {}
public:
~Descriptor();
static infiniStatus_t create(infiniopHandle_t handle, Descriptor **desc_ptr,
infiniopTensorDescriptor_t c_desc,
std::vector<infiniopTensorDescriptor_t> input_descs);
size_t workspaceSize() const { return _workspace_size; }
infiniStatus_t calculate(
void *workspace,
size_t workspace_size,
void *c,
std::vector<const void *> inputs,
void *stream) const;
};
extern "C" infiniStatus_t swiglu_kernel_launch(
void *c, void *a, void *b,
infiniDtype_t dtype, size_t batch, size_t seq, size_t hd,
ptrdiff_t stride_batch_c, ptrdiff_t stride_batch_a, ptrdiff_t stride_batch_b,
ptrdiff_t stride_seq_c, ptrdiff_t stride_seq_a, ptrdiff_t stride_seq_b, void *stream);
} // namespace op::swiglu::ascend
#endif // __ACLNN_SWIGLU_H__
#include "../../../devices/ascend/ascend_kernel_common.h"
using namespace AscendC;
template <typename T>
class SwigluKernel {
public:
__aicore__ inline SwigluKernel() {}
__aicore__ inline void init(GM_ADDR c, GM_ADDR a, GM_ADDR b,
size_t batch_, size_t seq, size_t hd,
ptrdiff_t stride_batch_c,
ptrdiff_t stride_batch_a,
ptrdiff_t stride_batch_b,
ptrdiff_t stride_seq_c,
ptrdiff_t stride_seq_a,
ptrdiff_t stride_seq_b);
__aicore__ inline void process();
private:
__aicore__ inline void copyIn(size_t i);
__aicore__ inline void compute(size_t i);
__aicore__ inline void copyOut(size_t i);
private:
GlobalTensor<T> _c_gm, _a_gm, _b_gm;
TQue<QuePosition::VECIN, BUFFER_NUM> _in_queue_a, _in_queue_b;
TQue<QuePosition::VECOUT, BUFFER_NUM> _out_queue_c;
TPipe _pipe;
float _beta_value = 1.0f;
size_t _block_idx, _tile_len, _copy_len,
_batch, _seq_len, _hidden_size,
_stride_seq_a, _stride_seq_b, _stride_seq_c;
int64_t _stride_batch_a = 1, _stride_batch_b = 1, _stride_batch_c = 1;
};
template <typename T>
__aicore__ inline void SwigluKernel<T>::init(GM_ADDR c, GM_ADDR a, GM_ADDR b,
size_t batch_, size_t seq, size_t hd,
ptrdiff_t stride_batch_c,
ptrdiff_t stride_batch_a,
ptrdiff_t stride_batch_b,
ptrdiff_t stride_seq_c,
ptrdiff_t stride_seq_a,
ptrdiff_t stride_seq_b) {
// Init shape & stride variables
_batch = batch_;
_seq_len = seq;
_hidden_size = hd;
_stride_batch_a = stride_batch_a;
_stride_batch_b = stride_batch_b;
_stride_batch_c = stride_batch_c;
_stride_seq_a = stride_seq_a;
_stride_seq_b = stride_seq_b;
_stride_seq_c = stride_seq_c;
_block_idx = GetBlockIdx();
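// Split the hidden dimension across BLOCK_NUM cores: the first (hd % BLOCK_NUM)
// cores take one extra element so the whole row is covered.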
_tile_len = _block_idx < (_hidden_size % BLOCK_NUM) ? (_hidden_size / BLOCK_NUM) + 1 : (_hidden_size / BLOCK_NUM);
_copy_len = alignTileLen<T>(_tile_len, BYTE_ALIGN);
// Set global tensor
_a_gm.SetGlobalBuffer((__gm__ T *)a);
_b_gm.SetGlobalBuffer((__gm__ T *)b);
_c_gm.SetGlobalBuffer((__gm__ T *)c);
// _pipe allocates memory for each queue, in bytes
_pipe.InitBuffer(_in_queue_a, BUFFER_NUM, _copy_len * sizeof(T));
_pipe.InitBuffer(_in_queue_b, BUFFER_NUM, _copy_len * sizeof(T));
_pipe.InitBuffer(_out_queue_c, BUFFER_NUM, _copy_len * sizeof(T));
}
template <typename T>
__aicore__ inline void SwigluKernel<T>::copyIn(size_t i) {
// Alloc tensor from queue memory
LocalTensor<T> aLocal = _in_queue_a.AllocTensor<T>();
LocalTensor<T> bLocal = _in_queue_b.AllocTensor<T>();
// Get idx of current tile
auto batch_idx = _batch == 1 ? 0 : i / _seq_len;
auto seq_idx = _batch == 1 ? i : i % _seq_len;
ptrdiff_t idxa = batch_idx * _stride_batch_a + seq_idx * _stride_seq_a + _block_idx * _tile_len;
ptrdiff_t idxb = batch_idx * _stride_batch_b + seq_idx * _stride_seq_b + _block_idx * _tile_len;
// Copy the i-th tile from global memory into the local tensors
DataCopy(aLocal, _a_gm[idxa], _copy_len);
DataCopy(bLocal, _b_gm[idxb], _copy_len);
// Enqueue input tensors to the VECIN queues
_in_queue_a.EnQue(aLocal);
_in_queue_b.EnQue(bLocal);
}
template <typename T>
__aicore__ inline void SwigluKernel<T>::compute(size_t i) {
// Dequeue input tensors from the VECIN queues
LocalTensor<T> aLocal = _in_queue_a.DeQue<T>();
LocalTensor<T> bLocal = _in_queue_b.DeQue<T>();
LocalTensor<T> cLocal = _out_queue_c.AllocTensor<T>();
// Call the AscendC SwiGLU API
SwiGLU<T, false>(cLocal, aLocal, bLocal, _beta_value, _copy_len);
// Enqueue the result and free the inputs
_out_queue_c.EnQue<T>(cLocal);
_in_queue_a.FreeTensor(aLocal);
_in_queue_b.FreeTensor(bLocal);
}
template <typename T>
__aicore__ inline void SwigluKernel<T>::copyOut(size_t i) {
// Dequeue the output tensor from the VECOUT queue
LocalTensor<T> cLocal = _out_queue_c.DeQue<T>();
auto batch_idx = _batch == 1 ? 0 : i / _seq_len;
auto seq_idx = _batch == 1 ? i : i % _seq_len;
ptrdiff_t idxc = batch_idx * _stride_batch_c + seq_idx * _stride_seq_c + _block_idx * _tile_len;
// Copy the i-th tile from the local tensor back to global memory
if (_tile_len * sizeof(T) % BYTE_ALIGN != 0) {
DataCopyExtParams dcep = {1, static_cast<uint32_t>(_tile_len * sizeof(T)), 0, 0, 0};
DataCopyPad(_c_gm[idxc], cLocal, dcep);
} else {
DataCopy(_c_gm[idxc], cLocal, _tile_len);
}
// Free output Local tensor
_out_queue_c.FreeTensor(cLocal);
}
template <typename T>
__aicore__ inline void SwigluKernel<T>::process() {
for (size_t i = 0; i < _batch * _seq_len; ++i) {
copyIn(i);
compute(i);
copyOut(i);
}
}
#define DEFINE_SWIGLU_KERNEL(KERNEL_NAME, TYPE) \
__global__ __aicore__ void KERNEL_NAME(GM_ADDR c, GM_ADDR a, GM_ADDR b, \
size_t batch, size_t seq, size_t hd, \
ptrdiff_t stride_batch_c, \
ptrdiff_t stride_batch_a, \
ptrdiff_t stride_batch_b, \
ptrdiff_t stride_seq_c, \
ptrdiff_t stride_seq_a, \
ptrdiff_t stride_seq_b) { \
SwigluKernel<TYPE> op; \
op.init(c, a, b, \
batch, seq, hd, \
stride_batch_c, stride_batch_a, stride_batch_b, \
stride_seq_c, stride_seq_a, stride_seq_b); \
op.process(); \
}
DEFINE_SWIGLU_KERNEL(swiglu_kernel_half, half)
DEFINE_SWIGLU_KERNEL(swiglu_kernel_float, float)
#undef DEFINE_SWIGLU_KERNEL
extern "C" infiniStatus_t swiglu_kernel_launch(
void *c, void *a, void *b,
infiniDtype_t dtype, size_t batch, size_t seq, size_t hd,
ptrdiff_t stride_batch_c, ptrdiff_t stride_batch_a, ptrdiff_t stride_batch_b,
ptrdiff_t stride_seq_c, ptrdiff_t stride_seq_a, ptrdiff_t stride_seq_b, void *stream) {
#define LAUNCH_SWIGLU_KERNEL(DTYPE_ENUM, KERNEL_NAME) \
case DTYPE_ENUM: \
KERNEL_NAME<<<BLOCK_NUM, nullptr, stream>>>( \
c, a, b, \
batch, \
seq, \
hd, \
stride_batch_c, stride_batch_a, stride_batch_b, \
stride_seq_c, stride_seq_a, stride_seq_b); \
return INFINI_STATUS_SUCCESS;
switch (dtype) {
LAUNCH_SWIGLU_KERNEL(INFINI_DTYPE_F16, swiglu_kernel_half)
LAUNCH_SWIGLU_KERNEL(INFINI_DTYPE_F32, swiglu_kernel_float)
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
#undef LAUNCH_SWIGLU_KERNEL
}
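// Hedged usage sketch (not part of this patch): launching the kernel for
// contiguous [batch, seq, hd] fp32 tensors. launch_contiguous_f32 is an
// illustrative helper; the device pointers and the stream are assumed to have
// been allocated/created by the caller.
static infiniStatus_t launch_contiguous_f32(void *c_dev, void *a_dev, void *b_dev,
                                            size_t batch, size_t seq, size_t hd,
                                            void *stream) {
    // For a contiguous row-major layout the batch stride is seq * hd and the
    // sequence stride is hd, for all three tensors.
    return swiglu_kernel_launch(
        c_dev, a_dev, b_dev,
        INFINI_DTYPE_F32, batch, seq, hd,
        (ptrdiff_t)(seq * hd), (ptrdiff_t)(seq * hd), (ptrdiff_t)(seq * hd),
        (ptrdiff_t)hd, (ptrdiff_t)hd, (ptrdiff_t)hd,
        stream);
}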
......@@ -8,50 +8,41 @@ infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
infiniopTensorDescriptor_t up_desc,
infiniopTensorDescriptor_t gate_desc) {
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &up_desc = input_desc_vec.at(0);
const auto &gate_desc = input_desc_vec.at(1);
const auto &out_shape = out_desc->shape();
const auto &up_shape = up_desc->shape();
const auto &gate_shape = gate_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
if (!SAME_VEC(out_shape, up_shape, gate_shape)) {
return INFINI_STATUS_BAD_TENSOR_SHAPE;
}
op::binary::BinaryInfo info;
CHECK_STATUS(op::binary::createBinaryInfo(info, out_desc, up_desc, gate_desc));
CHECK_SAME_SHAPE(out_shape, up_shape, gate_shape);
// Create descriptor
*desc_ptr = new Descriptor(
dtype,
std::move(info),
nullptr,
handle->device,
handle->device_id);
// create CPU elementwise descriptor
CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *c,
const void *a,
const void *b,
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
switch (_dtype) {
case INFINI_DTYPE_F16:
op::common_cpu::binary_op::calculate<fp16_t, SwiGLUOp>(_info, c, a, b);
break;
return _device_info->calculate<SwiGLUOp, fp16_t>(_info, output, inputs, stream);
case INFINI_DTYPE_F32:
op::common_cpu::binary_op::calculate<float, SwiGLUOp>(_info, c, a, b);
break;
return _device_info->calculate<SwiGLUOp, float>(_info, output, inputs, stream);
case INFINI_DTYPE_F64:
op::common_cpu::binary_op::calculate<double, SwiGLUOp>(_info, c, a, b);
break;
return _device_info->calculate<SwiGLUOp, double>(_info, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
......
#ifndef __SWIGLU_CPU_H__
#define __SWIGLU_CPU_H__
#include "../../../binary/cpu/binary_cpu.h"
#include "../../../elementwise/cpu/elementwise_cpu.h"
BINARY_DESCRIPTOR(swiglu, cpu)
ELEMENTWISE_DESCRIPTOR(swiglu, cpu)
struct SwiGLUOp {
namespace op::swiglu::cpu {
typedef struct SwiGLUOp {
private:
template <typename T>
T sigmoid(const T &x) const {
return 1 / (1 + std::exp(-x));
return T(1) / (T(1) + std::exp(-x));
}
public:
static constexpr size_t num_inputs = 2;
template <typename T>
T operator()(const T &up, const T &gate) const {
return gate * sigmoid(gate) * up;
}
};
} SwiGLUOp;
} // namespace op::swiglu::cpu
#endif // __SWIGLU_CPU_H__
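// Hedged reference sketch (not part of this patch): the scalar math that
// SwiGLUOp above applies per element; swiglu_ref is an illustrative name.
#include <cmath>

inline float swiglu_ref(float up, float gate) {
    float sig = 1.0f / (1.0f + std::exp(-gate)); // sigmoid(gate)
    return gate * sig * up;                      // matches SwiGLUOp::operator()
}
// e.g. swiglu_ref(2.0f, 0.0f) == 0.0f, since gate == 0 and sigmoid(0) == 0.5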
#include "swiglu_cuda.cuh"
#include "swiglu_cuda_internal.cuh"
namespace op::swiglu::cuda {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::cuda::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &up_desc = input_desc_vec.at(0);
const auto &gate_desc = input_desc_vec.at(1);
const auto &out_shape = out_desc->shape();
const auto &up_shape = up_desc->shape();
const auto &gate_shape = gate_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
CHECK_SAME_SHAPE(out_shape, up_shape, gate_shape);
// create CUDA elementwise descriptor
CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, SwiGLUOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, SwiGLUOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, SwiGLUOp, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::swiglu::cuda
#ifndef __SWIGLU_CUDA_API_H__
#define __SWIGLU_CUDA_API_H__
#include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
ELEMENTWISE_DESCRIPTOR(swiglu, cuda)
#endif // __SWIGLU_CUDA_API_H__
#ifndef __SWIGLU_CUDA_H__
#define __SWIGLU_CUDA_H__
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include <cuda_fp16.h>
namespace op::swiglu::cuda {
typedef struct SwiGLUOp {
private:
template <typename T>
__device__ __forceinline__ T sigmoid(const T &x) const {
if constexpr (std::is_same_v<T, half2>) {
return h2rcp(__hadd2(make_half2(1, 1), h2exp(__hneg2(x))));
} else if constexpr (std::is_same_v<T, half>) {
return hrcp(__hadd(half(1.f), __float2half(__expf(__half2float(__hneg(x))))));
} else if constexpr (std::is_same_v<T, float>) {
return __frcp_rn(__fadd_rn(1, __expf(-x)));
} else {
return 1 / (1 + std::exp(-x));
}
}
public:
static constexpr size_t num_inputs = 2;
template <typename T>
__device__ __forceinline__ T operator()(const T &up, const T &gate) const {
if constexpr (std::is_same_v<T, half2>) {
return __hmul2(__hmul2(gate, sigmoid(gate)), up);
} else if constexpr (std::is_same_v<T, half>) {
return __hmul(__hmul(gate, sigmoid(gate)), up);
} else if constexpr (std::is_same_v<T, float>) {
return __fmul_rn(__fmul_rn(gate, sigmoid(gate)), up);
} else {
return gate * sigmoid(gate) * up;
}
}
} SwiGLUOp;
} // namespace op::swiglu::cuda
#endif // __SWIGLU_CUDA_H__
#include "swiglu_kunlun.h"
// Op interface declaration
LAUNCH_ELEMENTWISE_KERNEL(SwiGLU)
namespace op::swiglu::kunlun {
typedef struct SwiGLUOp {
static constexpr size_t num_inputs = 2;
template <typename Tdata, typename... Args>
static infiniStatus_t launch(Args... args) {
launchSwiGLUKernel<Tdata>(args...);
return INFINI_STATUS_SUCCESS;
}
} SwiGLUOp;
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::kunlun::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &up_desc = input_desc_vec.at(0);
const auto &gate_desc = input_desc_vec.at(1);
const auto &out_shape = out_desc->shape();
const auto &up_shape = up_desc->shape();
const auto &gate_shape = gate_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F32);
CHECK_SAME_SHAPE(out_shape, up_shape, gate_shape);
// create KUNLUN elementwise descriptor
CREATE_ELEMENTWISE_KUNLUN_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F32:
return _device_info->calculate<SwiGLUOp, float>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::swiglu::kunlun
#ifndef __SWIGLU_KUNLUN_H__
#define __SWIGLU_KUNLUN_H__
#include "../../../elementwise/kunlun/elementwise_kunlun.h"
ELEMENTWISE_DESCRIPTOR(swiglu, kunlun)
#endif // __SWIGLU_KUNLUN_H__
#ifndef __SWIGLU_KUNLUN_H__
#define __SWIGLU_KUNLUN_H__
#include "../../../devices/kunlun/kunlun_kernel_common.h"
#include "../../../elementwise/kunlun/elementwise_kunlun_kernel.h"
/// @brief SwiGLU op operating on local memory (LM)
typedef struct SwiGLUOp {
private:
template <typename T>
inline __device__ T sigmoid(T x) const {
return 1.0f / (1.0f + exp(-x));
}
public:
// Every op used with the elementwise framework must define this static member
static constexpr size_t num_inputs = 2;
template <typename T>
inline __device__ T operator()(const T *inputs) const {
T up = inputs[0];
T gate = inputs[1];
T out = gate * sigmoid(gate) * up;
return out;
}
} SwiGLUOp;
// Definition for swiglu kernel interface
LAUNCH_ELEMENTWISE_KERNEL_IMPL(SwiGLU, SwiGLUOp)
// Template instantiation
LAUNCH_ELEMENTWISE_KERNEL_INSTANTIATE(SwiGLU, float)
#endif // __SWIGLU_KUNLUN_H__
......@@ -5,6 +5,15 @@
#ifdef ENABLE_CPU_API
#include "cpu/swiglu_cpu.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/swiglu_cuda.cuh"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/swiglu_kunlun.h"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/swiglu_ascend.h"
#endif
__C infiniStatus_t infiniopCreateSwiGLUDescriptor(
infiniopHandle_t handle,
......@@ -19,19 +28,19 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
handle, \
reinterpret_cast<op::swiglu::NAMESPACE::Descriptor **>(desc_ptr), \
c_desc, \
a_desc, \
b_desc)
{a_desc, \
b_desc})
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu:
return cudaCreateSwiGLUDescriptor((CudaHandle_t)handle,
(SwiGLUCudaDescriptor_t *)desc_ptr,
c_desc, a_desc, b_desc);
#ifdef ENABLE_CUDA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
#ifdef ENABLE_KUNLUN_API
CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
......@@ -40,11 +49,8 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
c_desc, a_desc, b_desc);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu:
return ascendCreateSwiGLUDescriptor(
(AscendHandle_t)handle, (SwiGLUAscendDescriptor_t *)desc_ptr,
c_desc, a_desc, b_desc);
#ifdef ENABLE_ASCEND_API
CREATE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
......@@ -66,8 +72,52 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
#undef CREATE
}
__C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t desc, size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::swiglu::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_CUDA_API
GET(INFINI_DEVICE_NVIDIA, cuda)
#endif
#ifdef ENABLE_KUNLUN_API
GET(INFINI_DEVICE_KUNLUN, kunlun)
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangGetSwiGLUWorkspaceSize((SwiGLUBangDescriptor_t)desc, size);
}
#endif
#ifdef ENABLE_ASCEND_API
GET(INFINI_DEVICE_ASCEND, ascend)
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaGetSwiGLUWorkspaceSize((SwiGLUMacaDescriptor_t)desc, size);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
return musaGetSwiGLUWorkspaceSize((SwiGLUMusaDescriptor_t)desc, size);
}
#endif
}
#undef GET
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniStatus_t infiniopSwiGLU(
infiniopSwiGLUDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *c,
const void *a,
const void *b,
......@@ -76,25 +126,26 @@ __C infiniStatus_t infiniopSwiGLU(
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::swiglu::NAMESPACE::Descriptor *>(desc) \
->calculate(c, a, b, stream)
->calculate(workspace, workspace_size, c, {a, b}, stream)
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu:
return cudaSwiGLU((SwiGLUCudaDescriptor_t)desc, c, a, b, stream);
#ifdef ENABLE_CUDA_API
CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
#ifdef ENABLE_KUNLUN_API
CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangSwiGLU((SwiGLUBangDescriptor_t)desc, c, a, b, stream);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu:
return ascendSwiGLU((SwiGLUAscendDescriptor_t)desc, c, a, b, stream);
#ifdef ENABLE_ASCEND_API
CALCULATE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu:
......@@ -125,18 +176,19 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu:
return cudaDestroySwiGLUDescriptor((SwiGLUCudaDescriptor_t)desc);
#ifdef ENABLE_CUDA_API
DELETE(INFINI_DEVICE_NVIDIA, cuda);
#endif
#ifdef ENABLE_KUNLUN_API
DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangDestroySwiGLUDescriptor((SwiGLUBangDescriptor_t)desc);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu:
return ascendDestroySwiGLUDescriptor((SwiGLUAscendDescriptor_t)desc);
#ifdef ENABLE_ASCEND_API
DELETE(INFINI_DEVICE_ASCEND, ascend)
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu:
......
......@@ -3,15 +3,22 @@
#include <cub/block/block_reduce.cuh>
/*
* Device functions for reduction operations on CUDA.
*
* Note: only the result on thread 0 is guaranteed to be correct.
* A manual broadcast is needed for other threads.
*/
namespace op::common_cuda::reduce_op {
// Sum(x^2) on contiguous data of length count
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
__device__ __forceinline__ Tcompute sumSquared(const Tdata *data_ptr, size_t count) {
Tcompute ss = 0;
// Each thread computes its partial sum
for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) {
ss += Tcompute(data_ptr[i] * data_ptr[i]);
ss += Tcompute(data_ptr[i]) * Tcompute(data_ptr[i]);
}
// Use CUB block-level reduction
......@@ -21,6 +28,36 @@ __device__ __forceinline__ Tcompute sumSquared(const Tdata *data_ptr, size_t cou
return BlockReduce(temp_storage).Sum(ss);
}
// Sum(x) on contiguous data of length count
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
__device__ __forceinline__ Tcompute sum(const Tdata *data_ptr, size_t count) {
Tcompute s = 0;
for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) {
s += Tcompute(data_ptr[i]);
}
using BlockReduce = cub::BlockReduce<Tcompute, BLOCK_SIZE>;
__shared__ typename BlockReduce::TempStorage temp_storage;
return BlockReduce(temp_storage).Sum(s);
}
// Max(x) on contiguous data of length count
template <unsigned int BLOCK_SIZE, typename Tdata>
__device__ __forceinline__ Tdata max(const Tdata *data_ptr, size_t count) {
Tdata max_ = data_ptr[0];
for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) {
max_ = cub::Max()(max_, data_ptr[i]);
}
using BlockReduce = cub::BlockReduce<Tdata, BLOCK_SIZE>;
__shared__ typename BlockReduce::TempStorage temp_storage;
return BlockReduce(temp_storage).Reduce(max_, cub::Max(), BLOCK_SIZE);
}
} // namespace op::common_cuda::reduce_op
#endif
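// Hedged sketch (not part of this patch) of the manual broadcast mentioned in
// the note above: cub::BlockReduce leaves the correct sum only on thread 0, so
// a caller that needs it in every thread stages it through shared memory.
// blockSumSquared is an illustrative name.
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
__device__ __forceinline__ Tcompute blockSumSquared(const Tdata *data_ptr, size_t count) {
    __shared__ Tcompute broadcast;
    Tcompute ss = op::common_cuda::reduce_op::sumSquared<BLOCK_SIZE, Tdata, Tcompute>(data_ptr, count);
    if (threadIdx.x == 0) {
        broadcast = ss; // only thread 0 holds the reduced value
    }
    __syncthreads();    // make it visible to the whole block
    return broadcast;
}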
#ifndef __INFINIOP_REDUCE_KUNLUN_H__
#define __INFINIOP_REDUCE_KUNLUN_H__
#include "../../devices/kunlun/kunlun_kernel_common.h"
namespace op::common_kunlun::reduce_op {
using namespace device::kunlun::kernel;
// Use 16-float vector instructions to compute the reduction
// data_ptr points to local memory (LM)
static inline __device__ float sumSquaredF32(float *data_ptr, int count) {
__local__ float acc_buf[16];
int remain = count % 16;
int offset_last = count - remain;
int mask = lowerBitMask(remain - 1);
// Load the trailing (count % 16) elements, masked (remaining lanes zeroed)
float32x16_t v_last = vload_lm_float32x16_mz((data_ptr + offset_last), mask);
// Do v_last * v_last
v_last = vvmul_float32x16(v_last, v_last);
// Process the remaining data 16 floats at a time
for (int i = 0; i < offset_last; i += 16) {
float32x16_t v_0 = vload_lm_float32x16_mz(data_ptr + i);
// Do v_0 * v_0
v_0 = vvmul_float32x16(v_0, v_0);
// Add to v_last
v_last = vvadd_float32x16(v_last, v_0);
}
vstore_lm_float32x16_mz(acc_buf, v_last);
mfence();
float res = 0.0f;
for (int i = 0; i < 16; ++i) {
res += acc_buf[i];
}
return res;
}
} // namespace op::common_kunlun::reduce_op
#endif
......@@ -2,9 +2,19 @@
#define __INFINIOP_TENSOR_H__
#include "infiniop/tensor_descriptor.h"
#include "../utils.h"
#include <string>
#include <vector>
#define TRANSFORM_TENSOR_DESC(__TENSOR_DESC__, __OP__) \
do { \
auto __RESULT__ = __TENSOR_DESC__->__OP__; \
CHECK_RESULT(__RESULT__); \
__TENSOR_DESC__ = __RESULT__.take(); \
} while (0)
struct InfiniopTensorDescriptor {
private:
// Datatype
......@@ -32,9 +42,9 @@ public:
bool hasBroadcastDim() const;
std::vector<size_t> getBroadcastDim() const;
infiniopTensorDescriptor_t dimMerge(size_t dim_start, size_t dim_end) const;
infiniopTensorDescriptor_t dimSplit(size_t axis, const std::vector<size_t> &dims) const;
infiniopTensorDescriptor_t dimPermute(const std::vector<size_t> &order) const;
utils::Result<infiniopTensorDescriptor_t> dimMerge(size_t dim_start, size_t dim_end) const;
utils::Result<infiniopTensorDescriptor_t> dimSplit(size_t axis, const std::vector<size_t> &dims) const;
utils::Result<infiniopTensorDescriptor_t> dimPermute(const std::vector<size_t> &order) const;
std::string toString() const;
};
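// Hedged usage sketch (not part of this patch): with dimMerge/dimSplit/dimPermute
// now returning utils::Result, TRANSFORM_TENSOR_DESC above replaces a descriptor
// in place and propagates failures. `desc` is an illustrative variable inside a
// function that returns a status.
//
//   TRANSFORM_TENSOR_DESC(desc, dimMerge(0, 1));    // merge dims 0..1 into one
//   std::vector<size_t> order = {1, 0};
//   TRANSFORM_TENSOR_DESC(desc, dimPermute(order)); // swap the two remaining dims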
......
......@@ -12,7 +12,7 @@ __C __export infiniStatus_t infiniopCreateTensorDescriptor(infiniopTensorDescrip
std::vector<ptrdiff_t> strides(ndim);
ptrdiff_t dsize = 1;
if (ndim > 0) {
for (size_t i = ndim - 1; i >= 0; i--) {
for (int i = (int)ndim - 1; i >= 0; i--) {
strides[i] = dsize;
dsize *= shape_[i];
}
......@@ -104,10 +104,8 @@ std::vector<size_t> InfiniopTensorDescriptor::getBroadcastDim() const {
return res;
}
infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimMerge(size_t dim_start, size_t dim_end) const {
if (dim_start > dim_end || dim_end >= ndim()) {
return nullptr;
}
utils::Result<infiniopTensorDescriptor_t> InfiniopTensorDescriptor::dimMerge(size_t dim_start, size_t dim_end) const {
CHECK_OR_RETURN(dim_start <= dim_end && dim_end < ndim(), INFINI_STATUS_BAD_PARAM);
size_t new_ndim = ndim() - (dim_end - dim_start);
std::vector<size_t> new_shape(new_ndim);
......@@ -120,9 +118,7 @@ infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimMerge(size_t dim_start,
index++;
}
if (!isContiguous(dim_start, dim_end)) {
return nullptr;
}
CHECK_OR_RETURN(isContiguous(dim_start, dim_end), INFINI_STATUS_BAD_PARAM);
new_shape[index] = 1;
for (size_t i = dim_start; i <= dim_end; i++) {
......@@ -138,15 +134,15 @@ infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimMerge(size_t dim_start,
index++;
}
return new InfiniopTensorDescriptor(_dtype, new_ndim, new_shape.data(), new_strides.data());
return utils::Result<infiniopTensorDescriptor_t>(
new InfiniopTensorDescriptor(_dtype, new_ndim, new_shape.data(), new_strides.data()));
}
infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimSplit(size_t axis, const std::vector<size_t> &dims) const {
utils::Result<infiniopTensorDescriptor_t> InfiniopTensorDescriptor::dimSplit(size_t axis, const std::vector<size_t> &dims) const {
size_t ndim_ = ndim();
if (dim(axis) != std::accumulate(dims.begin(), dims.end(), (size_t)1, std::multiplies<size_t>())) {
return nullptr;
}
CHECK_OR_RETURN(dim(axis) == std::accumulate(dims.begin(), dims.end(), (size_t)1, std::multiplies<size_t>()),
INFINI_STATUS_BAD_PARAM);
size_t new_ndim = ndim_ + dims.size() - 1;
std::vector<size_t> new_shape(new_ndim);
......@@ -168,24 +164,22 @@ infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimSplit(size_t axis, const
index++;
}
return new InfiniopTensorDescriptor(_dtype, new_ndim, new_shape.data(), new_strides.data());
return utils::Result<infiniopTensorDescriptor_t>(
new InfiniopTensorDescriptor(_dtype, new_ndim, new_shape.data(), new_strides.data()));
}
infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimPermute(const std::vector<size_t> &order) const {
utils::Result<infiniopTensorDescriptor_t> InfiniopTensorDescriptor::dimPermute(const std::vector<size_t> &order) const {
auto ndim_ = ndim();
if (order.size() != ndim_) {
return nullptr;
}
CHECK_OR_RETURN(order.size() == ndim_, INFINI_STATUS_BAD_PARAM);
std::vector<size_t> new_shape(ndim_);
std::vector<ptrdiff_t> new_strides(ndim_);
for (size_t i = 0; i < ndim_; i++) {
if (std::find(order.begin(), order.end(), i) == order.end()) {
return nullptr;
}
CHECK_OR_RETURN(std::find(order.begin(), order.end(), i) != order.end(), INFINI_STATUS_BAD_PARAM);
new_shape[i] = dim(order[i]);
new_strides[i] = stride(order[i]);
}
return new InfiniopTensorDescriptor(_dtype, ndim_, new_shape.data(), new_strides.data());
return utils::Result<infiniopTensorDescriptor_t>(
new InfiniopTensorDescriptor(_dtype, ndim_, new_shape.data(), new_strides.data()));
}
std::string InfiniopTensorDescriptor::toString() const {
......
......@@ -6,7 +6,8 @@
namespace infinirt::bang {
infiniStatus_t getDeviceCount(int *count) {
CHECK_BANGRT(cnrtGetDeviceCount(count));
unsigned int device_count = static_cast<unsigned int>(*count);
CHECK_BANGRT(cnrtGetDeviceCount(&device_count));
return INFINI_STATUS_SUCCESS;
}
......@@ -22,7 +23,7 @@ infiniStatus_t deviceSynchronize() {
infiniStatus_t streamCreate(infinirtStream_t *stream_ptr) {
cnrtQueue_t queue;
CHECK_BANGRT(cnrtQueueCreate(&stream));
CHECK_BANGRT(cnrtQueueCreate(&queue));
*stream_ptr = queue;
return INFINI_STATUS_SUCCESS;
}
......@@ -55,7 +56,7 @@ infiniStatus_t eventRecord(infinirtEvent_t event, infinirtStream_t stream) {
}
infiniStatus_t eventQuery(infinirtEvent_t event, infinirtEventStatus_t *status_ptr) {
auto status = cnrtQueryNotifier((cnrtQueue_t)stream);
auto status = cnrtQueryNotifier((cnrtNotifier_t)event);
if (status == cnrtSuccess) {
*status_ptr = INFINIRT_EVENT_COMPLETE;
} else if (status == cnrtErrorBusy) {
......@@ -112,12 +113,12 @@ cnrtMemTransDir_t toBangMemcpyKind(infinirtMemcpyKind_t kind) {
}
infiniStatus_t memcpy(void *dst, const void *src, size_t size, infinirtMemcpyKind_t kind) {
CHECK_BANGRT(cnrtMemcpy(dst, src, size, toBangMemcpyKind(kind)));
CHECK_BANGRT(cnrtMemcpy(dst, (void *)src, size, toBangMemcpyKind(kind)));
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t memcpyAsync(void *dst, const void *src, size_t size, infinirtMemcpyKind_t kind, infinirtStream_t stream) {
CHECK_BANGRT(cnrtMemcpyAsync_V2(dst, src, size, (cnrtQueue_t)stream, toBangMemcpyKind(kind)));
CHECK_BANGRT(cnrtMemcpyAsync_V2(dst, (void *)src, size, (cnrtQueue_t)stream, toBangMemcpyKind(kind)));
return INFINI_STATUS_SUCCESS;
}
......
......@@ -3,7 +3,7 @@
#include "../infinirt_impl.h"
namespace infinirt::bang {
#ifdef ENABLE_BANG_API
#ifdef ENABLE_CAMBRICON_API
INFINIRT_DEVICE_API_IMPL
#else
INFINIRT_DEVICE_API_NOOP
......
......@@ -4,6 +4,9 @@
#include "bang/infinirt_bang.h"
#include "cpu/infinirt_cpu.h"
#include "cuda/infinirt_cuda.cuh"
#include "kunlun/infinirt_kunlun.h"
#include "maca/infinirt_maca.h"
#include "musa/infinirt_musa.h"
thread_local infiniDevice_t CURRENT_DEVICE_TYPE = INFINI_DEVICE_CPU;
thread_local int CURRENT_DEVICE_ID = 0;
......@@ -42,40 +45,49 @@ __C infiniStatus_t infinirtGetDevice(infiniDevice_t *device_ptr, int *device_id_
return INFINI_STATUS_SUCCESS;
}
#define INFINIRT_CALL_DEVICE_API_AND(API, PARAMS, ACTION) \
{ \
infiniStatus_t _status; \
switch (CURRENT_DEVICE_TYPE) { \
case INFINI_DEVICE_CPU: \
_status = infinirt::cpu::API PARAMS; \
break; \
case INFINI_DEVICE_NVIDIA: \
_status = infinirt::cuda::API PARAMS; \
break; \
case INFINI_DEVICE_CAMBRICON: \
_status = infinirt::bang::API PARAMS; \
break; \
case INFINI_DEVICE_ASCEND: \
_status = infinirt::ascend::API PARAMS; \
break; \
default: \
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \
} \
{ ACTION; } \
return _status; \
#define INFINIRT_CALL_DEVICE_API_AND(DEVICE_TYPE, API, PARAMS, ACTION) \
{ \
infiniStatus_t _status; \
switch (DEVICE_TYPE) { \
case INFINI_DEVICE_CPU: \
_status = infinirt::cpu::API PARAMS; \
break; \
case INFINI_DEVICE_NVIDIA: \
_status = infinirt::cuda::API PARAMS; \
break; \
case INFINI_DEVICE_CAMBRICON: \
_status = infinirt::bang::API PARAMS; \
break; \
case INFINI_DEVICE_ASCEND: \
_status = infinirt::ascend::API PARAMS; \
break; \
case INFINI_DEVICE_METAX: \
_status = infinirt::maca::API PARAMS; \
break; \
case INFINI_DEVICE_MOORE: \
_status = infinirt::musa::API PARAMS; \
break; \
case INFINI_DEVICE_KUNLUN: \
_status = infinirt::kunlun::API PARAMS; \
break; \
default: \
_status = INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \
} \
{ ACTION; } \
return _status; \
}
#define INFINIRT_CALL_DEVICE_API(API, PARAMS) INFINIRT_CALL_DEVICE_API_AND(API, PARAMS, )
#define INFINIRT_CALL_DEVICE_API(API, PARAMS) INFINIRT_CALL_DEVICE_API_AND(CURRENT_DEVICE_TYPE, API, PARAMS, )
__C infiniStatus_t infinirtGetDeviceCount(infiniDevice_t device, int *count) {
if (count == nullptr) {
return INFINI_STATUS_NULL_POINTER;
}
INFINIRT_CALL_DEVICE_API(getDeviceCount, (count));
INFINIRT_CALL_DEVICE_API_AND(device, getDeviceCount, (count), {});
}
__C infiniStatus_t infinirtSetDevice(infiniDevice_t device, int device_id) {
INFINIRT_CALL_DEVICE_API_AND(setDevice, (device_id),
INFINIRT_CALL_DEVICE_API_AND(device, setDevice, (device_id),
{ CURRENT_DEVICE_TYPE = device;
CURRENT_DEVICE_ID = device_id; });
}
......
......@@ -98,4 +98,14 @@ inline std::string infiniDtypeToString(infiniDtype_t dtype) {
}
}
#define CEIL_DIV(x, y) (((x) + (y)-1) / (y))
namespace utils {
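// Round size up to the nearest multiple of alignment; assumes alignment is a
// power of two, e.g. align(10, 8) == 16.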
inline size_t align(size_t size, size_t alignment) {
return (size + alignment - 1) & ~(alignment - 1);
}
} // namespace utils
#endif
......@@ -3,6 +3,16 @@
#include <iostream>
#include <tuple>
#define CHECK_OR_RETURN(CONDITION, ERROR) \
do { \
if (!(CONDITION)) { \
std::cerr << "Check Failed: `(" << #CONDITION << ")` is False" \
<< " from " << __func__ \
<< " at " << __FILE__ << ":" << __LINE__ << std::endl; \
return ERROR; \
} \
} while (0)
#define CHECK_API_OR(API, EXPECT, ACTION) \
do { \
auto api_result_ = (API); \
......@@ -31,13 +41,22 @@
return INFINI_STATUS_BAD_TENSOR_DTYPE); \
} while (0)
#define SAME_VEC(...) \
[&] { \
auto &&_vec = std::forward_as_tuple(__VA_ARGS__); \
const auto &_base = std::get<0>(_vec); \
return [&_base](auto &&...args) { \
return ((args == _base) && ...); \
}(__VA_ARGS__); \
}()
#define CHECK_DTYPE_ANY_INT(DT) \
CHECK_DTYPE(DT, \
INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, \
INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64);
#define CHECK_SAME_VEC(ERR, FIRST, ...) \
do { \
for (const auto &shape___ : {__VA_ARGS__}) { \
if (FIRST != shape___) { \
return ERR; \
} \
} \
} while (0)
#define CHECK_SAME_SHAPE(FIRST, ...) CHECK_SAME_VEC(INFINI_STATUS_BAD_TENSOR_SHAPE, FIRST, __VA_ARGS__)
#define CHECK_SAME_STRIDES(FIRST, ...) CHECK_SAME_VEC(INFINI_STATUS_BAD_TENSOR_STRIDES, FIRST, __VA_ARGS__)
#endif // INFINIUTILS_CHECK_H