Unverified commit 4e4d3415 authored by Catheriany, committed by GitHub

Merge branch 'main' into issue/150

parents d1c46889 1a4cfb99
#include "add_cuda.cuh"
#include "add_cuda_internal.cuh"
namespace op::add::cuda {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::cuda::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &a_desc = input_desc_vec.at(0);
const auto &b_desc = input_desc_vec.at(1);
const auto &c_shape = out_desc->shape();
const auto &a_shape = a_desc->shape();
const auto &b_shape = b_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
// create CUDA elementwise descriptor
CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, AddOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, AddOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, AddOp, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::add::cuda
#ifndef __ADD_CUDA_API_H__
#define __ADD_CUDA_API_H__
#include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
ELEMENTWISE_DESCRIPTOR(add, cuda)
#endif // __ADD_CUDA_API_H__
#ifndef __ADD_CUDA_H__
#define __ADD_CUDA_H__
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include <cuda_fp16.h>
namespace op::add::cuda {
typedef struct AddOp {
public:
static constexpr size_t num_inputs = 2;
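// Compile-time dispatch: packed half2 and scalar half/float use CUDA intrinsics,
// while any other element type falls back to plain operator+.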
template <typename T>
__device__ __forceinline__ T operator()(const T &a, const T &b) const {
if constexpr (std::is_same_v<T, half2>) {
return __hadd2(a, b);
} else if constexpr (std::is_same_v<T, half>) {
return __hadd(a, b);
} else if constexpr (std::is_same_v<T, float>) {
return __fadd_rn(a, b);
} else {
return a + b;
}
}
} AddOp;
} // namespace op::add::cuda
#endif // __ADD_CUDA_H__
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/add.h"
#ifdef ENABLE_CPU_API
#include "cpu/add_cpu.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/add_cuda.cuh"
#endif
__C infiniStatus_t infiniopCreateAddDescriptor(
infiniopHandle_t handle,
infiniopAddDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) {
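// Dispatch on handle->device: each backend enabled at build time contributes one
// switch case through the CREATE macro below.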
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::add::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::add::NAMESPACE::Descriptor **>(desc_ptr), \
c_desc, \
{a_desc, \
b_desc})
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CREATE
}
__C infiniStatus_t infiniopGetAddWorkspaceSize(infiniopAddDescriptor_t desc, size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::add::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_CUDA_API
GET(INFINI_DEVICE_NVIDIA, cuda)
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef GET
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniStatus_t infiniopAdd(
infiniopAddDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *c,
const void *a,
const void *b,
void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::add::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, c, {a, b}, stream)
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CALCULATE
}
__C infiniStatus_t
infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc) {
#define DELETE(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<const op::add::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
DELETE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef DELETE
}
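// Illustrative usage sketch (editorial addition, not part of this diff): the typical
// call sequence for the Add operator defined above. The handle, tensor descriptors,
// device buffers and stream are assumed to be created elsewhere; the function name
// `exampleAddUsage` and its parameter names are made up for the example.
static infiniStatus_t exampleAddUsage(infiniopHandle_t handle,
infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t b_desc,
void *workspace, size_t workspace_capacity,
void *c, const void *a, const void *b, void *stream) {
infiniopAddDescriptor_t add_desc = nullptr;
infiniStatus_t status = infiniopCreateAddDescriptor(handle, &add_desc, c_desc, a_desc, b_desc);
if (status != INFINI_STATUS_SUCCESS) {
return status;
}
size_t required = 0;
infiniopGetAddWorkspaceSize(add_desc, &required);
if (required > workspace_capacity) {
infiniopDestroyAddDescriptor(add_desc);
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
status = infiniopAdd(add_desc, workspace, required, c, a, b, stream);
infiniopDestroyAddDescriptor(add_desc);
return status;
}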
@@ -6,22 +6,21 @@
namespace op::causal_softmax::ascend {
struct Descriptor::Opaque {
mutable aclOpExecutor *executor;
mutable aclOpExecutor *mask_executor;
aclnnTensorDescriptor_t x;
aclnnTensorDescriptor_t mask;
aclnnTensorDescriptor_t y;
aclnnTensorDescriptor_t value;
void *mask_addr;
size_t workspacesize_softmax;
size_t workspacesize_mask;
void *value_addr;
~Opaque() {
delete x;
delete mask;
delete y;
delete value;
aclDestroyAclOpExecutor(executor);
aclDestroyAclOpExecutor(mask_executor);
aclrtFree(mask_addr);
aclrtFree(value_addr);
}
};
@@ -64,13 +63,13 @@ infiniStatus_t Descriptor::create(
auto size = aclDataTypeSize(aclDataType::ACL_FLOAT16);
CHECK_ACL(aclrtMalloc(&value_addr, size, ACL_MEM_MALLOC_HUGE_FIRST));
CHECK_ACL(aclrtMemcpy(value_addr, size, &mask_value, size, ACL_MEMCPY_HOST_TO_DEVICE));
value = new aclnnTensorDescriptor(aclDataType::ACL_FLOAT16, {}, {}, value_addr);
value = new aclnnTensorDescriptor(aclDataType::ACL_FLOAT16, {}, {});
} else {
uint32_t mask_value = 0xff800000;
auto size = aclDataTypeSize(aclDataType::ACL_FLOAT);
CHECK_ACL(aclrtMalloc(&value_addr, size, ACL_MEM_MALLOC_HUGE_FIRST));
CHECK_ACL(aclrtMemcpy(value_addr, size, &mask_value, size, ACL_MEMCPY_HOST_TO_DEVICE));
value = new aclnnTensorDescriptor(aclDataType::ACL_FLOAT, {}, {}, value_addr);
value = new aclnnTensorDescriptor(aclDataType::ACL_FLOAT, {}, {});
}
// Fill Mask Tensor
@@ -93,17 +92,19 @@ infiniStatus_t Descriptor::create(
aclTensor *tvalue = value->tensor;
CHECK_ACL(aclnnInplaceMaskedFillTensorGetWorkspaceSize(tx, tmask, tvalue, &workspacesize_mask, &mask_executor));
aclSetAclOpExecutorRepeatable(mask_executor);
int64_t dim = 2;
CHECK_ACL(aclnnSoftmaxGetWorkspaceSize(tx, dim, ty, &workspacesize_softmax, &executor));
aclSetAclOpExecutorRepeatable(executor);
// Create the descriptor
size_t all_workspacesize = workspacesize_softmax + workspacesize_mask;
*desc_ptr = new Descriptor(new Opaque{executor, mask_executor, x, mask, y, mask_addr, workspacesize_softmax, workspacesize_mask},
*desc_ptr = new Descriptor(new Opaque{x, mask, y, value, mask_addr, value_addr},
std::move(info), all_workspacesize, handle_ascend->device, handle_ascend->device_id);
// Delete useless executor
aclDestroyAclOpExecutor(executor);
aclDestroyAclOpExecutor(mask_executor);
return INFINI_STATUS_SUCCESS;
}
@@ -114,18 +115,24 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, voi
auto tx = _opaque->x->tensor;
auto ty = _opaque->y->tensor;
auto tmask = _opaque->mask->tensor;
auto executor = _opaque->executor;
auto mask_executor = _opaque->mask_executor;
auto mask_addr = _opaque->mask_addr;
auto tvalue = _opaque->value->tensor;
aclOpExecutor *executor = nullptr;
aclOpExecutor *mask_executor = nullptr;
size_t workspacesize_softmax = 0;
size_t workspacesize_mask = 0;
int64_t dim = 2;
AclSetTensorAddr(mask_executor, 0, tx, (void *)x);
AclSetTensorAddr(mask_executor, 1, tmask, mask_addr);
CHECK_ACL(aclnnInplaceMaskedFillTensor(workspace, _opaque->workspacesize_mask, mask_executor, stream));
AclSetTensorAddr(mask_executor, 1, tmask, _opaque->mask_addr);
AclSetTensorAddr(mask_executor, 2, tvalue, _opaque->value_addr);
CHECK_ACL(aclnnInplaceMaskedFillTensorGetWorkspaceSize(tx, tmask, tvalue, &workspacesize_mask, &mask_executor));
CHECK_ACL(aclnnInplaceMaskedFillTensor(workspace, workspacesize_mask, mask_executor, stream));
CHECK_ACL(aclrtSynchronizeStream(stream));
AclSetTensorAddr(executor, 0, tx, (void *)x);
AclSetTensorAddr(executor, 1, ty, y);
CHECK_ACL(aclnnSoftmax(workspace, _opaque->workspacesize_softmax, executor, stream));
CHECK_ACL(aclnnSoftmaxGetWorkspaceSize(tx, dim, ty, &workspacesize_softmax, &executor));
CHECK_ACL(aclnnSoftmax(workspace, workspacesize_softmax, executor, stream));
return INFINI_STATUS_SUCCESS;
}
......
@@ -6,7 +6,6 @@
namespace op::gemm::ascend {
struct Descriptor::Opaque {
mutable aclOpExecutor *executor;
aclnnTensorDescriptor_t c, a, b;
// cubeMathType
// see doc:
@@ -17,7 +16,6 @@ struct Descriptor::Opaque {
delete c;
delete a;
delete b;
aclDestroyAclOpExecutor(executor);
}
};
@@ -56,8 +54,8 @@ infiniStatus_t Descriptor::create(
ta = a->tensor,
tb = b->tensor;
aclOpExecutor *executor;
size_t workspace_size;
aclOpExecutor *executor = nullptr;
size_t workspace_size = 0;
// aclnnGemm support C = alpha * A @ B + beta * C
// see
// https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha003/apiref/aolapi/context/aclnnGemm.md
@@ -69,13 +67,15 @@ infiniStatus_t Descriptor::create(
*desc_ptr = new Descriptor(
dtype, info, workspace_size,
new Opaque{
executor,
c,
a,
b,
mt,
},
handle->device, handle->device_id);
aclDestroyAclOpExecutor(executor);
return INFINI_STATUS_SUCCESS;
}
@@ -93,22 +93,24 @@ infiniStatus_t Descriptor::calculate(
ta = _opaque->a->tensor,
tb = _opaque->b->tensor;
size_t workspace_size;
size_t workspace_size = 0;
aclOpExecutor *executor = nullptr;
CHECK_ACL(aclnnGemmGetWorkspaceSize(
ta, tb, tc, alpha, beta, 0, 0, tc, _opaque->mt,
&workspace_size, &(_opaque->executor)));
&workspace_size, &executor));
if (workspaceSize_ < workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
aclSetAclOpExecutorRepeatable(_opaque->executor);
CHECK_ACL(aclSetAclOpExecutorRepeatable(executor));
auto unit = infiniSizeOf(_dtype);
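// The repeatable executor is reused across the batch; only the tensor addresses are
// rebound for each matrix before launching aclnnGemm.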
for (size_t i = 0; i < _info.batch; ++i) {
AclSetTensorAddr(_opaque->executor, 0, ta, ((char *)a) + i * _info.a_matrix.stride * unit);
AclSetTensorAddr(_opaque->executor, 1, tb, ((char *)b) + i * _info.b_matrix.stride * unit);
AclSetTensorAddr(_opaque->executor, 2, tc, ((char *)c) + i * _info.c_matrix.stride * unit);
AclSetTensorAddr(_opaque->executor, 3, tc, ((char *)c) + i * _info.c_matrix.stride * unit);
CHECK_ACL(aclnnGemm(workspace, workspace_size, _opaque->executor, stream));
AclSetTensorAddr(executor, 0, ta, ((char *)a) + i * _info.a_matrix.stride * unit);
AclSetTensorAddr(executor, 1, tb, ((char *)b) + i * _info.b_matrix.stride * unit);
AclSetTensorAddr(executor, 2, tc, ((char *)c) + i * _info.c_matrix.stride * unit);
AclSetTensorAddr(executor, 3, tc, ((char *)c) + i * _info.c_matrix.stride * unit);
CHECK_ACL(aclnnGemm(workspace, workspace_size, executor, stream));
}
return INFINI_STATUS_SUCCESS;
......
#include "mul_cpu.h"
namespace op::mul::cpu {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &a_desc = input_desc_vec.at(0);
const auto &b_desc = input_desc_vec.at(1);
const auto &out_shape = out_desc->shape();
const auto &a_shape = a_desc->shape();
const auto &b_shape = b_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
CHECK_SAME_SHAPE(out_shape, a_shape, b_shape);
// create CPU elementwise descriptor
CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<MulOp, fp16_t>(_info, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<MulOp, float>(_info, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<MulOp, double>(_info, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::mul::cpu
#ifndef __MUL_CPU_H__
#define __MUL_CPU_H__
#include "../../../elementwise/cpu/elementwise_cpu.h"
ELEMENTWISE_DESCRIPTOR(mul, cpu)
namespace op::mul::cpu {
typedef struct MulOp {
public:
static constexpr size_t num_inputs = 2;
template <typename T>
T operator()(const T &a, const T &b) const {
return a * b;
}
} MulOp;
} // namespace op::mul::cpu
#endif // __MUL_CPU_H__
#include "mul_cuda.cuh"
#include "mul_cuda_internal.cuh"
namespace op::mul::cuda {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::cuda::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &a_desc = input_desc_vec.at(0);
const auto &b_desc = input_desc_vec.at(1);
const auto &c_shape = out_desc->shape();
const auto &a_shape = a_desc->shape();
const auto &b_shape = b_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
// create CUDA elementwise descriptor
CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, MulOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, MulOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, MulOp, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::mul::cuda
#ifndef __MUL_CUDA_API_H__
#define __MUL_CUDA_API_H__
#include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
ELEMENTWISE_DESCRIPTOR(mul, cuda)
#endif // __MUL_CUDA_API_H__
#ifndef __MUL_CUDA_H__
#define __MUL_CUDA_H__
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include <cuda_fp16.h>
namespace op::mul::cuda {
typedef struct MulOp {
static constexpr size_t num_inputs = 2;
template <typename T>
__device__ __forceinline__ T operator()(const T &a, const T &b) const {
if constexpr (std::is_same_v<T, half2>) {
return __hmul2(a, b);
} else if constexpr (std::is_same_v<T, half>) {
return __hmul(a, b);
} else if constexpr (std::is_same_v<T, float>) {
return __fmul_rn(a, b);
} else {
return a * b;
}
}
} MulOp;
} // namespace op::mul::cuda
#endif // __MUL_CUDA_H__
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/mul.h"
#ifdef ENABLE_CPU_API
#include "cpu/mul_cpu.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/mul_cuda.cuh"
#endif
__C infiniStatus_t infiniopCreateMulDescriptor(
infiniopHandle_t handle,
infiniopMulDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) {
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::mul::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::mul::NAMESPACE::Descriptor **>(desc_ptr), \
c_desc, \
{a_desc, \
b_desc})
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CREATE
}
__C infiniStatus_t infiniopGetMulWorkspaceSize(infiniopMulDescriptor_t desc, size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::mul::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_CUDA_API
GET(INFINI_DEVICE_NVIDIA, cuda)
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef GET
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniStatus_t infiniopMul(
infiniopMulDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *c,
const void *a,
const void *b,
void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::mul::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, c, {a, b}, stream)
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CALCULATE
}
__C infiniStatus_t
infiniopDestroyMulDescriptor(infiniopMulDescriptor_t desc) {
#define DELETE(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<const op::mul::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
DELETE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef DELETE
}
@@ -15,20 +15,18 @@ infiniStatus_t Descriptor::create(
auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
auto dtype = y_desc->dtype();
auto ndim = y_desc->ndim();
auto shape = y_desc->shape().data();
CHECK_API_OR(x_desc->dtype(), dtype, return INFINI_STATUS_BAD_TENSOR_DTYPE);
CHECK_API_OR(x_desc->ndim(), ndim, return INFINI_STATUS_BAD_TENSOR_SHAPE);
auto y_shape = y_desc->shape();
auto x_shape = x_desc->shape();
CHECK_OR_RETURN(x_desc->dtype() == dtype, INFINI_STATUS_BAD_TENSOR_DTYPE);
CHECK_OR_RETURN(x_desc->ndim() == ndim, INFINI_STATUS_BAD_TENSOR_SHAPE);
CHECK_SAME_SHAPE(x_shape, y_shape);
for (size_t i = 0; i < ndim; ++i) {
CHECK_API_OR(x_desc->shape()[i], shape[i], return INFINI_STATUS_BAD_TENSOR_SHAPE);
}
auto dst_strides = y_desc->strides().data();
auto src_strides = x_desc->strides().data();
auto dst_strides = y_desc->strides();
auto src_strides = x_desc->strides();
auto element_size = infiniSizeOf(dtype);
auto result = utils::RearrangeMeta::create(shape, dst_strides, src_strides, ndim, element_size);
auto result = utils::RearrangeMeta::create(y_shape.data(), dst_strides.data(), src_strides.data(), ndim, element_size);
CHECK_RESULT(result);
*desc_ptr = new Descriptor(
......
#include "../../../devices/cuda/cuda_common.cuh"
#include "../../../devices/cuda/cuda_kernel_common.cuh"
#include "../../../tensor.h"
#include "rearrange_cuda.cuh"
#include "rearrange_kernel.cuh"
#include <algorithm>
#include <cmath>
#include <memory>
#include <stdint.h>
#include <vector>
namespace op::rearrange::cuda {
struct Descriptor::Opaque {
std::shared_ptr<device::cuda::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc) {
auto dtype = y_desc->dtype();
auto ndim = y_desc->ndim();
CHECK_OR_RETURN(x_desc->dtype() == dtype, INFINI_STATUS_BAD_TENSOR_DTYPE);
CHECK_OR_RETURN(x_desc->ndim() == ndim, INFINI_STATUS_BAD_TENSOR_SHAPE);
// Keep the temporary vector objects alive
auto x_shape = x_desc->shape();
auto y_shape = y_desc->shape();
auto y_strides = y_desc->strides();
auto x_strides = x_desc->strides();
CHECK_SAME_SHAPE(x_shape, y_shape);
auto meta = utils::RearrangeMeta::create(
y_shape.data(),
y_strides.data(),
x_strides.data(),
ndim,
infiniSizeOf(dtype));
CHECK_RESULT(meta);
*desc_ptr = new Descriptor(
std::move(*meta),
new Opaque{reinterpret_cast<device::cuda::Handle *>(handle)->internal()},
handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
// Per-dimension information
struct Dim {
size_t len;
ARRAY_TYPE_STRIDE src_stride;
ARRAY_TYPE_STRIDE dst_stride;
};
// A dimension split between block and grid
struct SplitDim {
size_t choose_idx;
size_t num_per_block;
size_t num_per_grid;
int array_struct_idx_block;
int array_struct_idx_grid;
size_t dim_len;
};
/**
* Prepares the tensor-rearrange parameters from the given metadata. The function does two things:
* 1. Adjusts the unit size from the original metadata to one better suited to GPU processing.
* 2. Assigns dimensions to CUDA block dimensions and grid dimensions.
* This step is the core of the algorithm: each block should be given as much relatively contiguous data as possible.
* Dimensions that cannot fit entirely into a block are split, and the split information is recorded
* so that the kernel does not access out of bounds, maximizing memory-access locality and compute efficiency.
*/
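// Illustrative example (assuming max_threads = 256): a dimension of length 1024 that
// cannot fit into one block is split into num_per_block = 256 elements handled inside
// the block and num_per_grid = ceil(1024 / 256) = 4 block indices along the grid.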
utils::Result<RearrangeParams> prepareRearrangeParams(const utils::RearrangeMeta &original_meta, int max_threads) {
RearrangeParams params;
// Pick a unit size better suited to GPU processing; powers of two are tried here
auto meta_result = original_meta.distributeUnit({32, 16, 8, 4, 2, 1});
CHECK_RESULT(meta_result);
const utils::RearrangeMeta &meta = meta_result.take();
// Dimension information
const size_t ndim = meta.ndim();
const size_t unit = meta.unit();
// Special case: no dimensions, a plain copy is enough
if (ndim == 0) {
params.block_dim = 0;
params.block_len_total = 1;
params.block_len = {static_cast<ARRAY_TYPE_SIZE>(1)};
params.src_block_stride = {static_cast<ARRAY_TYPE_STRIDE>(0)};
params.dst_block_stride = {static_cast<ARRAY_TYPE_STRIDE>(0)};
params.grid_len = {static_cast<ARRAY_TYPE_SIZE>(1)};
params.src_grid_stride = {static_cast<ARRAY_TYPE_STRIDE>(0)};
params.dst_grid_stride = {static_cast<ARRAY_TYPE_STRIDE>(0)};
params.unit_size = unit;
return utils::Result<RearrangeParams>(params);
}
// Extract the required information from the metadata
const ptrdiff_t *idx_strides = meta.idx_strides();
const ptrdiff_t *dst_strides = meta.dst_strides();
const ptrdiff_t *src_strides = meta.src_strides();
// Build per-dimension records
std::vector<Dim> dims;
std::vector<size_t> shape;
dims.reserve(ndim);
shape.reserve(ndim);
auto prev_idx_stride = meta.count();
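// Each dimension's length is recovered by dividing the previous index stride
// (starting from the total element count) by the current one.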
for (size_t i = 0; i < ndim; ++i) {
size_t len = prev_idx_stride / idx_strides[i];
shape.push_back(len);
dims.push_back({len, src_strides[i], dst_strides[i]});
prev_idx_stride = idx_strides[i];
}
// Indices of the dimensions sorted by |src_stride| in descending order, mirroring src_strides_desc_idx in the Rust version
std::vector<size_t> src_strides_desc_idx(ndim);
for (size_t i = 0; i < ndim; ++i) {
src_strides_desc_idx[i] = i;
}
std::sort(src_strides_desc_idx.begin(), src_strides_desc_idx.end(),
[&dims](size_t a, size_t b) {
return std::abs(dims[a].src_stride) > std::abs(dims[b].src_stride);
});
// Choose block and grid dimensions according to the maximum thread count
const size_t block_size = max_threads;
std::vector<bool> block_dim_choose(ndim, false);
// Initialize counters
size_t block_elements = 1;
size_t block_src_elements = 1;
size_t block_dst_elements = 1;
size_t src_choose_idx = ndim;
size_t dst_choose_idx = ndim;
// Records for dimensions that had to be split
std::vector<SplitDim> split_dims;
// Dimension-selection loop
while (src_choose_idx > 0 && dst_choose_idx > 0) {
// Indices of the dimensions currently under consideration
size_t src_idx = src_strides_desc_idx[src_choose_idx - 1];
size_t dst_idx = dst_choose_idx - 1;
if (src_idx == dst_idx) {
// Source and destination agree on this dimension; handle it once
size_t idx = src_idx;
size_t len = shape[idx];
// Check whether the whole dimension fits into the block
if (block_elements * len <= block_size) {
// Take the whole dimension into the block
block_dim_choose[idx] = true;
block_elements *= len;
block_src_elements *= len;
block_dst_elements *= len;
src_choose_idx--;
dst_choose_idx--;
} else {
// This dimension must be split
size_t num_per_block = block_size / block_elements;
// Ensure num_per_block > 0 and len >= num_per_block
if (num_per_block > 0 && len >= num_per_block && num_per_block > 1) {
size_t num_per_grid = (len + num_per_block - 1) / num_per_block; // round up
SplitDim split_dim = {
idx, // choose_idx
num_per_block, // num_per_block
num_per_grid, // num_per_grid
0, // array_struct_idx_block (filled in later)
0, // array_struct_idx_grid (filled in later)
len // original dimension length
};
split_dims.push_back(split_dim);
}
break;
}
} else {
// Source and destination dimensions differ; handle them separately
// Compute how the block budget is split between source and destination
double src_div_dst = static_cast<double>(block_src_elements) / block_dst_elements;
double src_num_per_block = std::sqrt(block_size / (double)block_elements / src_div_dst);
double dst_num_per_block = src_num_per_block * src_div_dst;
size_t src_current_dim_len = shape[src_idx];
size_t dst_current_dim_len = shape[dst_idx];
if (static_cast<double>(src_current_dim_len) < src_num_per_block) {
// The source dimension fits entirely into the block
block_dim_choose[src_idx] = true;
block_elements *= src_current_dim_len;
block_src_elements *= src_current_dim_len;
src_choose_idx--;
} else if (static_cast<double>(dst_current_dim_len) < dst_num_per_block) {
// The destination dimension fits entirely into the block
block_dim_choose[dst_idx] = true;
block_elements *= dst_current_dim_len;
block_dst_elements *= dst_current_dim_len;
dst_choose_idx--;
} else {
// Both the source and the destination dimension must be split
size_t src_num_per_block_int = static_cast<size_t>(std::floor(src_num_per_block));
size_t dst_num_per_block_int = static_cast<size_t>(std::floor(dst_num_per_block));
// Compute the grid extents
size_t src_num_per_grid = (src_current_dim_len + src_num_per_block_int - 1) / src_num_per_block_int; // round up
size_t dst_num_per_grid = (dst_current_dim_len + dst_num_per_block_int - 1) / dst_num_per_block_int; // round up
// Handle the source dimension
if (src_num_per_block_int > 1) {
if (src_num_per_grid == 1) {
// Fits entirely into the block
block_dim_choose[src_idx] = true;
block_elements *= src_current_dim_len;
block_src_elements *= src_current_dim_len;
src_choose_idx--;
} else {
// Must be split
SplitDim split_dim = {
src_idx, // choose_idx
src_num_per_block_int, // num_per_block
src_num_per_grid, // num_per_grid
0, // array_struct_idx_block (filled in later)
0, // array_struct_idx_grid (filled in later)
src_current_dim_len // original dimension length
};
split_dims.push_back(split_dim);
}
}
// Handle the destination dimension
if (dst_num_per_block_int > 1) {
if (dst_num_per_grid == 1) {
// Fits entirely into the block
block_dim_choose[dst_idx] = true;
block_elements *= dst_current_dim_len;
block_dst_elements *= dst_current_dim_len;
dst_choose_idx--;
} else {
// Must be split
SplitDim split_dim = {
dst_idx, // choose_idx
dst_num_per_block_int, // num_per_block
dst_num_per_grid, // num_per_grid
0, // array_struct_idx_block (filled in later)
0, // array_struct_idx_grid (filled in later)
dst_current_dim_len // original dimension length
};
split_dims.push_back(split_dim);
}
}
break;
}
}
}
// Prepare the block-dimension parameters
size_t block_dim = 0;
size_t block_len_total = 1;
std::vector<ARRAY_TYPE_SIZE> block_len;
std::vector<ARRAY_TYPE_STRIDE> src_block_stride;
std::vector<ARRAY_TYPE_STRIDE> dst_block_stride;
std::vector<ARRAY_TYPE_SIZE> grid_len;
std::vector<ARRAY_TYPE_STRIDE> src_grid_stride;
std::vector<ARRAY_TYPE_STRIDE> dst_grid_stride;
// Fill block_len and the block strides for the chosen block dimensions
for (size_t i = 0; i < ndim; ++i) {
if (block_dim_choose[i]) {
block_len.push_back(shape[i]);
src_block_stride.push_back(dims[i].src_stride);
dst_block_stride.push_back(dims[i].dst_stride);
block_dim += 1;
block_len_total *= shape[i];
}
// Handle the block part of split dimensions
for (size_t j = 0; j < split_dims.size(); ++j) {
if (i == split_dims[j].choose_idx) {
block_len.push_back(split_dims[j].num_per_block);
src_block_stride.push_back(dims[i].src_stride);
dst_block_stride.push_back(dims[i].dst_stride);
split_dims[j].array_struct_idx_block = block_dim;
block_dim += 1;
block_len_total *= split_dims[j].num_per_block;
}
}
}
// Fill grid_len and the grid strides for the remaining dimensions
for (size_t i = 0; i < ndim; ++i) {
if (!block_dim_choose[i]) {
bool is_split = false;
// Check whether this dimension was split
for (size_t j = 0; j < split_dims.size(); ++j) {
if (i == split_dims[j].choose_idx) {
is_split = true;
grid_len.push_back(split_dims[j].num_per_grid);
src_grid_stride.push_back(dims[i].src_stride * split_dims[j].num_per_block);
dst_grid_stride.push_back(dims[i].dst_stride * split_dims[j].num_per_block);
split_dims[j].array_struct_idx_grid = grid_len.size() - 1;
}
}
// Not a split dimension: it becomes a full grid dimension
if (!is_split) {
grid_len.push_back(shape[i]);
src_grid_stride.push_back(dims[i].src_stride);
dst_grid_stride.push_back(dims[i].dst_stride);
}
}
}
// If grid_len is empty, add a default entry
if (grid_len.empty()) {
grid_len.push_back(1);
src_grid_stride.push_back(0);
dst_grid_stride.push_back(0);
}
// Handle constraints, using logic similar to the Rust version
std::vector<Constraint<ARRAY_TYPE_SIZE>> constraints;
// A constraint is recorded for each split dimension whose length is not a multiple of num_per_block (at most two in practice)
for (size_t i = 0; i < split_dims.size(); ++i) {
if (split_dims[i].dim_len % split_dims[i].num_per_block == 0) {
continue;
}
Constraint<ARRAY_TYPE_SIZE> constraint;
constraint.grid_idx = split_dims[i].array_struct_idx_grid;
constraint.block_idx = split_dims[i].array_struct_idx_block;
constraint.grid_div_block = split_dims[i].num_per_block;
constraint.total_len = split_dims[i].dim_len;
constraints.push_back(constraint);
}
// Populate the output parameters
params.block_dim = block_dim;
params.block_len_total = block_len_total;
params.block_len = block_len;
params.src_block_stride = src_block_stride;
params.dst_block_stride = dst_block_stride;
params.grid_len = grid_len;
params.src_grid_stride = src_grid_stride;
params.dst_grid_stride = dst_grid_stride;
params.constraints = constraints;
params.unit_size = unit;
return utils::Result<RearrangeParams>(params);
}
// Kernel-launch template that also forwards the constraint information
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(
void *y,
const void *x,
size_t grid_size,
const RearrangeParams &params,
size_t unit_size,
cudaStream_t stream) {
// Look up the kernel function
RearrangeParams params_copy = params; // make a non-const copy
auto kernel_func_result = getRearrangeKernel(params_copy);
CHECK_RESULT(kernel_func_result);
auto kernel_func = kernel_func_result.take();
// Non-const temporaries for the kernel argument array
size_t block_dim = params.block_dim;
size_t block_len_total = params.block_len_total;
// Sanity-check the vector sizes
if (params.block_len.size() < block_dim || params.src_block_stride.size() < block_dim || params.dst_block_stride.size() < block_dim) {
return INFINI_STATUS_BAD_PARAM;
}
if (params.grid_len.empty() || params.src_grid_stride.empty() || params.dst_grid_stride.empty()) {
return INFINI_STATUS_BAD_PARAM;
}
const Constraint<ARRAY_TYPE_SIZE> *constraints_data;
auto empty_constraints = Constraint<ARRAY_TYPE_SIZE>();
if (params.constraints.empty()) {
constraints_data = &empty_constraints;
} else {
constraints_data = params.constraints.data();
}
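// The kernel always receives a non-null constraints pointer; when no constraints
// exist, a single default-constructed entry is passed as a placeholder.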
void *args[]
= {
&y, &x,
&block_dim,
&block_len_total,
const_cast<void *>(static_cast<const void *>(params.block_len.data())),
const_cast<void *>(static_cast<const void *>(params.src_block_stride.data())),
const_cast<void *>(static_cast<const void *>(params.dst_block_stride.data())),
const_cast<void *>(static_cast<const void *>(params.grid_len.data())),
const_cast<void *>(static_cast<const void *>(params.src_grid_stride.data())),
const_cast<void *>(static_cast<const void *>(params.dst_grid_stride.data())),
const_cast<void *>(static_cast<const void *>(constraints_data))};
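// The order of `args` must match the parameter list of the selected rearrange kernel.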
CHECK_OR_RETURN(cudaLaunchKernel(
kernel_func,
grid_size, BLOCK_SIZE,
args, 0, stream)
== cudaSuccess,
INFINI_STATUS_INTERNAL_ERROR);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *y,
const void *x,
void *stream) const {
auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
// No dimensions: a plain device-to-device copy is enough
if (_meta.ndim() == 0) {
CHECK_OR_RETURN(cudaMemcpyAsync(y, x, _meta.unit(), cudaMemcpyDeviceToDevice, cuda_stream) == cudaSuccess,
INFINI_STATUS_INTERNAL_ERROR);
return INFINI_STATUS_SUCCESS;
}
// Query device properties
int max_threads = _opaque->internal->maxThreadsPerBlock();
// Prepare the launch parameters
auto params_result = prepareRearrangeParams(_meta, std::min(CUDA_BLOCK_SIZE_1024, max_threads));
CHECK_RESULT(params_result);
auto params = params_result.take();
// Compute the grid size
size_t grid_size = 1;
for (size_t i = 0; i < params.grid_len.size(); ++i) {
grid_size *= params.grid_len[i];
}
// Guard against a zero-sized grid
if (grid_size == 0) {
return INFINI_STATUS_BAD_PARAM;
}
// Choose a suitable kernel variant for this block size
infiniStatus_t status = INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
size_t block_size = params.block_len_total;
if (block_size <= CUDA_BLOCK_SIZE_512) {
status = launchKernel<CUDA_BLOCK_SIZE_512>(y, x, grid_size, params, _meta.unit(), cuda_stream);
} else if (block_size <= CUDA_BLOCK_SIZE_1024) {
status = launchKernel<CUDA_BLOCK_SIZE_1024>(y, x, grid_size, params, _meta.unit(), cuda_stream);
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return status;
}
} // namespace op::rearrange::cuda
#ifndef __REARRANGE_CUDA_H__
#define __REARRANGE_CUDA_H__
#include "../rearrange.h"
DESCRIPTOR(cuda)
#endif // __REARRANGE_CUDA_H__
@@ -6,6 +6,10 @@
#include "cpu/rearrange_cpu.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/rearrange_cuda.cuh"
#endif
__C infiniStatus_t infiniopCreateRearrangeDescriptor(
infiniopHandle_t handle,
infiniopRearrangeDescriptor_t *desc_ptr,
@@ -26,6 +30,10 @@ __C infiniStatus_t infiniopCreateRearrangeDescriptor(
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
@@ -50,6 +58,10 @@ __C infiniStatus_t infiniopRearrange(
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
@@ -71,6 +83,10 @@ __C infiniStatus_t infiniopDestroyRearrangeDescriptor(
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
DELETE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
......
@@ -5,7 +5,6 @@
namespace op::rms_norm::ascend {
struct Descriptor::Opaque {
mutable aclOpExecutor *executor;
aclnnTensorDescriptor_t y;
aclnnTensorDescriptor_t x;
aclnnTensorDescriptor_t w;
@@ -17,7 +16,6 @@ struct Descriptor::Opaque {
delete x;
delete w;
delete rstd;
aclDestroyAclOpExecutor(executor);
}
};
@@ -64,16 +62,17 @@ infiniStatus_t Descriptor::create(
// Get WorkspaceSize and set executor
CHECK_ACL(aclnnRmsNormGetWorkspaceSize(tx, tw, static_cast<double>(epsilon), ty, trstd, &workspace_size, &executor));
aclSetAclOpExecutorRepeatable(executor);
auto handle_ascend = reinterpret_cast<device::ascend::Handle *>(handle);
size_t all_workspace_size = workspace_size + rstd->numel() * aclDataTypeSize(rstd->dataType);
*desc_ptr = new Descriptor(
new Opaque{executor, y, x, w, rstd, workspace_size},
new Opaque{y, x, w, rstd, workspace_size},
std::move(info),
all_workspace_size,
handle_ascend->device, handle_ascend->device_id);
aclDestroyAclOpExecutor(executor);
return INFINI_STATUS_SUCCESS;
}
@@ -89,16 +88,21 @@ infiniStatus_t Descriptor::calculate(
auto tx = _opaque->x->tensor;
auto ty = _opaque->y->tensor;
auto trstd = _opaque->rstd->tensor;
size_t workspace_size_ = 0;
aclOpExecutor *executor = nullptr;
CHECK_ACL(aclnnRmsNormGetWorkspaceSize(tx, tw, static_cast<double>(_info.epsilon), ty, trstd, &workspace_size_, &executor));
CHECK_ACL(aclSetAclOpExecutorRepeatable(executor));
void *rstdPtr = (void *)((uint8_t *)workspace + _opaque->workspaceSize);
auto unit = infiniSizeOf(_info.atype);
AclSetTensorAddr(_opaque->executor, 1, tw, (void *)w);
AclSetTensorAddr(_opaque->executor, 3, trstd, rstdPtr);
AclSetTensorAddr(executor, 1, tw, (void *)w);
AclSetTensorAddr(executor, 3, trstd, rstdPtr);
for (size_t i = 0; i < (_info.shape)[0]; ++i) {
AclSetTensorAddr(_opaque->executor, 0, tx, ((char *)x) + i * (_info.x_strides)[0] * unit);
AclSetTensorAddr(_opaque->executor, 2, ty, ((char *)y) + i * (_info.y_strides)[0] * unit);
CHECK_ACL(aclnnRmsNorm(workspace, _opaque->workspaceSize, _opaque->executor, stream));
AclSetTensorAddr(executor, 0, tx, ((char *)x) + i * (_info.x_strides)[0] * unit);
AclSetTensorAddr(executor, 2, ty, ((char *)y) + i * (_info.y_strides)[0] * unit);
CHECK_ACL(aclnnRmsNorm(workspace, _opaque->workspaceSize, executor, stream));
}
return INFINI_STATUS_SUCCESS;
}
......
@@ -46,8 +46,9 @@ infiniStatus_t calculateRoPE(const RoPEInfo &info,
const Tdata *sin_table,
const Tdata *cos_table,
cudaStream_t stream) {
auto dimx = static_cast<unsigned int>(info.seqlen);
auto dimy = static_cast<unsigned int>(info.nhead);
auto dimx = uint32_t(info.seqlen),
dimy = uint32_t(info.nhead);
int nthreads = std::max(int(info.table_dim), block_size);
ropeThreadPerItem<<<dim3(dimx, dimy), nthreads, 0, stream>>>(
......
@@ -14,7 +14,7 @@ private:
} else if constexpr (std::is_same_v<T, half>) {
return hrcp(__hadd(half(1.f), __float2half(__expf(__half2float(__hneg(x))))));
} else if constexpr (std::is_same_v<T, float>) {
return __frcp_rd(__fadd_rd(1, __expf(-x)));
return __frcp_rn(__fadd_rn(1, __expf(-x)));
} else {
return 1 / (1 + std::exp(-x));
}
@@ -29,7 +29,7 @@ public:
} else if constexpr (std::is_same_v<T, half>) {
return __hmul(__hmul(gate, sigmoid(gate)), up);
} else if constexpr (std::is_same_v<T, float>) {
return __fmul_rd(__fmul_rd(gate, sigmoid(gate)), up);
return __fmul_rn(__fmul_rn(gate, sigmoid(gate)), up);
} else {
return gate * sigmoid(gate) * up;
}
......