Commit 9b32b4b1 authored by Catheriany

Merge remote-tracking branch 'origin/main' into issue/150

parents 15bcbdfc 4799ddbf
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/clip.h"
#ifdef ENABLE_CPU_API
#include "cpu/clip_cpu.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/clip_cuda.cuh"
#endif
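// Dispatch pattern used by every entry point in this file: switch on the
// device type and forward to the matching backend namespace, each of which
// provides a Descriptor with the same create/calculate interface.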
__C infiniStatus_t infiniopCreateClipDescriptor(
infiniopHandle_t handle,
infiniopClipDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y,
infiniopTensorDescriptor_t x,
infiniopTensorDescriptor_t min_val,
infiniopTensorDescriptor_t max_val) {
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::clip::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::clip::NAMESPACE::Descriptor **>(desc_ptr), \
y, \
{x, min_val, max_val})
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CREATE
}
__C infiniStatus_t infiniopGetClipWorkspaceSize(infiniopClipDescriptor_t desc, size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::clip::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_CUDA_API
GET(INFINI_DEVICE_NVIDIA, cuda)
#endif
}
#undef GET
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniStatus_t infiniopClip(
infiniopClipDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
const void *x,
const void *min_val,
const void *max_val,
void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::clip::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, y, {x, min_val, max_val}, stream)
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CALCULATE
}
__C infiniStatus_t
infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc) {
#define DELETE(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<const op::clip::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
DELETE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef DELETE
}
......@@ -3,6 +3,26 @@
#include <aclnnop/aclnn_matmul.h>
#include <aclnnop/level2/aclnn_gemm.h>
#include <cstring>
#include <unordered_map>
// Custom hash for the (alpha, beta) executor-cache key: packs the two 32-bit
// floats into one 64-bit value and hashes that.
struct FloatPairHash {
size_t operator()(const std::pair<float, float> &p) const {
static_assert(2 * sizeof(float) == sizeof(uint64_t), "two floats must fill a uint64_t");
// Normalize -0.0f to +0.0f so keys that compare equal under == hash equally.
float first = p.first + 0.0f, second = p.second + 0.0f;
uint64_t combined;
std::memcpy(reinterpret_cast<char *>(&combined), &first, sizeof(float));
std::memcpy(reinterpret_cast<char *>(&combined) + sizeof(float), &second, sizeof(float));
return std::hash<uint64_t>()(combined);
}
};
struct FloatPairEqual {
bool operator()(const std::pair<float, float> &a, const std::pair<float, float> &b) const {
return a.first == b.first && a.second == b.second;
}
};
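// The cache below keeps one repeatable aclOpExecutor per distinct
// (alpha, beta) pair, so repeated calculate() calls with the same scalars
// reuse an executor instead of rebuilding it.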
namespace op::gemm::ascend {
struct Descriptor::Opaque {
......@@ -11,11 +31,17 @@ struct Descriptor::Opaque {
// see doc:
// https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha002/apiref/appdevgapi/context/aclnnBatchMatMul.md
int8_t mt;
// executor cache keyed by (alpha, beta)
std::unordered_map<std::pair<float, float>, aclOpExecutor *, FloatPairHash, FloatPairEqual> lookup;
~Opaque() {
delete c;
delete a;
delete b;
for (auto &item : lookup) {
aclDestroyAclOpExecutor(item.second);
}
lookup.clear();
}
};
......@@ -54,15 +80,16 @@ infiniStatus_t Descriptor::create(
ta = a->tensor,
tb = b->tensor;
std::unordered_map<std::pair<float, float>, aclOpExecutor *, FloatPairHash, FloatPairEqual> lookup;
aclOpExecutor *executor = nullptr;
size_t workspace_size = 0;
// aclnnGemm computes C = alpha * A @ B + beta * C
// see
// https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha003/apiref/aolapi/context/aclnnGemm.md
// Pre-build repeatable executors for the two common (alpha, beta) pairs.
int8_t mt = 1;
CHECK_ACL(aclnnGemmGetWorkspaceSize(ta, tb, tc, 1., 0., 0, 0, tc, mt, &workspace_size, &executor));
CHECK_ACL(aclSetAclOpExecutorRepeatable(executor));
lookup[std::make_pair(1.0f, 0.0f)] = executor;
CHECK_ACL(aclnnGemmGetWorkspaceSize(ta, tb, tc, 1., 1., 0, 0, tc, mt, &workspace_size, &executor));
CHECK_ACL(aclSetAclOpExecutorRepeatable(executor));
lookup[std::make_pair(1.0f, 1.0f)] = executor;
*desc_ptr = new Descriptor(
dtype, info, workspace_size,
......@@ -71,11 +98,9 @@ infiniStatus_t Descriptor::create(
a,
b,
mt,
std::move(lookup)},
handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
......@@ -93,16 +118,22 @@ infiniStatus_t Descriptor::calculate(
ta = _opaque->a->tensor,
tb = _opaque->b->tensor;
size_t workspace_size = _workspace_size;
aclOpExecutor *executor;
auto key = std::make_pair(alpha, beta);
if (auto it = _opaque->lookup.find(key); it != _opaque->lookup.end()) {
executor = it->second;
} else {
CHECK_ACL(aclnnGemmGetWorkspaceSize(
ta, tb, tc, alpha, beta, 0, 0, tc, _opaque->mt,
&workspace_size, &executor));
CHECK_ACL(aclSetAclOpExecutorRepeatable(executor));
_opaque->lookup[key] = executor;
}
if (_workspace_size < workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
auto unit = infiniSizeOf(_dtype);
for (size_t i = 0; i < _info.batch; ++i) {
......
......@@ -62,7 +62,7 @@ infiniStatus_t calculate(
(kunlunStream_t)stream,
[&](xdnnHandle_t handle) {
for (size_t i = 0; i < info.batch; i++) {
CHECK_KUNLUN((xdnn::fc_fusion<Tdata, Tdata, Tdata, int16_t>(
handle,
(Tdata *)((char *)a + i * info.a_matrix.stride * unit),
(Tdata *)((char *)b + i * info.b_matrix.stride * unit),
......
#include "random_sample_cpu.h"
#include "../../../devices/cpu/common_cpu.h"
#include "../../../devices/cpu/cpu_handle.h"
#include "../../../tensor.h"
#include "../info.h"
#include "infinicore.h"
#include <algorithm>
namespace op::random_sample::cpu {
......@@ -15,29 +15,14 @@ infiniStatus_t Descriptor::create(
infiniopTensorDescriptor_t probs_desc) {
auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
auto result = RandomSampleInfo::create(result_desc, probs_desc);
CHECK_RESULT(result);
*desc_ptr = new Descriptor(
result.take(),
0,
nullptr,
handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
......@@ -55,36 +40,42 @@ struct ComputeType<fp16_t> {
using type = float;
};
struct Algo {
template <class Tidx, class Tval>
static auto get(void const *ptr, size_t i) {
return utils::cast<typename ComputeType<Tval>::type, Tval>(reinterpret_cast<Tval const *>(ptr)[i]);
}
template <class Tidx, class Tval>
infiniStatus_t argmax(
void *workspace, size_t workspace_size,
void *result, void const *probs, size_t n,
void *stream) {
auto idx = reinterpret_cast<Tidx *>(result);
*idx = 0;
auto max_val = get<Tidx, Tval>(probs, 0);
for (size_t i = 0; i < n; i++) {
if (auto val = get<Tidx, Tval>(probs, i); val > max_val) {
max_val = val;
*idx = static_cast<Tidx>(i);
}
}
return INFINI_STATUS_SUCCESS;
}
template <class Tidx, class Tval>
infiniStatus_t random(
void *workspace, size_t workspace_size,
void *result, void const *probs, size_t n,
float random_val, float topp, int topk, float temperature,
void *stream) {
struct KVPair {
Tidx idx;
typename ComputeType<Tval>::type val;
bool operator<(const KVPair &other) const {
return val > other.val;
......@@ -95,7 +86,7 @@ struct Scheme {
// build & sort
std::vector<KVPair> pairs(n);
for (size_t i = 0; i < n; i++) {
pairs[i] = {static_cast<Tidx>(i), get<Tidx, Tval>(probs, i)};
}
std::sort(pairs.begin(), pairs.end());
// softmax & sum
......@@ -115,68 +106,10 @@ struct Scheme {
break;
}
}
return INFINI_STATUS_SUCCESS;
}
};
infiniStatus_t Descriptor::calculate(
void *workspace,
......@@ -189,7 +122,11 @@ infiniStatus_t Descriptor::calculate(
float temperature,
void *stream) const {
Calculate::calculate<Algo>(
Algo{}, _info, workspace, workspace_size,
result, probs,
random_val, topp, topk, temperature,
stream);
return INFINI_STATUS_SUCCESS;
}
......
#include "../../../devices/cuda/cuda_handle.cuh"
#include "../info.h"
#include "random_sample_cuda.cuh"
#include "random_sample_kernel.cuh"
namespace op::random_sample::cuda {
struct Descriptor::Opaque {
std::shared_ptr<device::cuda::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t result_desc,
infiniopTensorDescriptor_t probs_desc) {
auto handle = reinterpret_cast<device::cuda::Handle *>(handle_);
auto result = RandomSampleInfo::create(result_desc, probs_desc);
CHECK_RESULT(result);
auto info = result.take();
size_t workspace_size;
#define CASE_P(CASE, Tidx, Tval) \
case CASE: { \
auto workspace_result = calculateWorkspace<Tidx, Tval>(info.n); \
CHECK_RESULT(workspace_result); \
workspace_size = workspace_result.take(); \
} break
#define CASE_I(CASE, Tidx) \
case CASE: \
switch (info.dt_p) { \
CASE_P(INFINI_DTYPE_F16, Tidx, half); \
CASE_P(INFINI_DTYPE_F32, Tidx, float); \
CASE_P(INFINI_DTYPE_F64, Tidx, double); \
default: \
abort(); \
} \
break
switch (info.dt_i) {
CASE_I(INFINI_DTYPE_I8, int8_t);
CASE_I(INFINI_DTYPE_I16, int16_t);
CASE_I(INFINI_DTYPE_I32, int32_t);
CASE_I(INFINI_DTYPE_I64, int64_t);
CASE_I(INFINI_DTYPE_U8, uint8_t);
CASE_I(INFINI_DTYPE_U16, uint16_t);
CASE_I(INFINI_DTYPE_U32, uint32_t);
CASE_I(INFINI_DTYPE_U64, uint64_t);
default:
abort();
}
#undef CASE_I
#undef CASE_P
*desc_ptr = new Descriptor(
info,
workspace_size,
new Opaque{handle->internal()},
handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
size_t Descriptor::minWorkspaceSize() const {
return _min_workspace_size;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *result,
const void *probs,
float random_val,
float topp,
int topk,
float temperature,
void *stream) const {
if (workspace_size < _min_workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
auto block_size = _opaque->internal->blockSizeX();
Calculate::calculate<Algo>(
Algo{block_size}, _info, workspace, workspace_size,
result, probs,
random_val, topp, topk, temperature,
stream);
return INFINI_STATUS_SUCCESS;
}
} // namespace op::random_sample::cuda
#ifndef __RANDOM_SAMPLE_CUDA_CUH__
#define __RANDOM_SAMPLE_CUDA_CUH__
#include "../random_sample.h"
DESCRIPTOR(cuda)
#endif // __RANDOM_SAMPLE_CUDA_CUH__
#include "../../../devices/cuda/cuda_kernel_common.cuh"
#include "infinicore.h"
#include <cub/device/device_radix_sort.cuh>
#include <cub/device/device_reduce.cuh>
#include <cub/device/device_scan.cuh>
namespace op::random_sample::cuda {
// ↓↓↓ Thin wrappers over the cub device API: fewer template parameters, easier to call
template <class T>
static cudaError argMax_(
cub::KeyValuePair<int, T> *kv_pair,
const T *logits,
int n,
void *workspace_ptr,
size_t &workspace_len,
cudaStream_t stream) {
return cub::DeviceReduce::ArgMax(
workspace_ptr, workspace_len,
logits, kv_pair, n,
stream);
}
template <class Tval, class Tidx>
static cudaError radixSort(
void *workspace_ptr, size_t &workspace_len,
const Tval *key_in, Tval *key_out,
const Tidx *val_in, Tidx *val_out,
int n,
cudaStream_t stream) {
return cub::DeviceRadixSort::SortPairsDescending(
workspace_ptr, workspace_len,
key_in, key_out,
val_in, val_out,
n,
0, sizeof(Tval) * 8,
stream);
}
template <class T>
static cudaError inclusiveSum(
void *workspace_ptr, size_t &workspace_len,
T *data, int n,
cudaStream_t stream) {
return cub::DeviceScan::InclusiveSum(
workspace_ptr, workspace_len,
data, data, n,
stream);
}
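// Note the two-phase cub convention used by these wrappers: when the
// temp-storage pointer is nullptr, the call only writes the required
// temp-storage size into workspace_len and does no work.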
// ↑↑↑ Thin wrappers over the cub device API
// ↓↓↓ Workspace size calculation
// Round sizes up to a 256-byte boundary
static constexpr size_t align256(size_t size) {
return (size + 255) & (~255);
}
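// e.g. align256(1) == 256, align256(256) == 256, align256(257) == 512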
template <class Tidx, class Tval>
utils::Result<size_t> calculateWorkspace(size_t n_) {
const auto n = static_cast<int>(n_);
size_t argmax;
CHECK_CUDA(argMax_<Tval>(
nullptr, nullptr, n,
nullptr, argmax,
nullptr));
// the first 256 bytes are reserved for the kv pair
argmax += 256;
// indices
size_t size_random = align256(sizeof(Tidx) * n);
// sorted
size_random += align256(sizeof(Tval) * n);
// indices_out
size_random += align256(sizeof(Tidx) * n);
// cub device api
size_t size_radix_sort;
CHECK_CUDA((radixSort<Tval, Tidx>(
nullptr, size_radix_sort,
nullptr, nullptr,
nullptr, nullptr,
n,
nullptr)));
size_t size_inclusive_sum;
CHECK_CUDA(inclusiveSum<Tval>(
nullptr, size_inclusive_sum,
nullptr, n,
nullptr));
size_random += cub::Max()(size_radix_sort, size_inclusive_sum);
return utils::Result<size_t>(cub::Max()(argmax, size_random));
}
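// Resulting layout (a sketch): the argmax path needs 256 bytes for the kv
// pair followed by cub temp storage; the random path needs three 256-aligned
// arrays (indices, sorted, indices_out) followed by the larger of the
// radix-sort and inclusive-sum temp storage. The returned size is the max of
// the two paths.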
// ↑↑↑ Workspace size calculation
// ↓↓↓ Map fp16_t to the device half type via specialization
template <class Tval>
struct CudaTval {
using Type = Tval;
};
template <>
struct CudaTval<fp16_t> {
using Type = half;
};
// ↑↑↑ Map fp16_t to the device half type via specialization
// ↓↓↓ Small kernels used by the sampling pass
// The cub::DeviceReduce::ArgMax shipped with CUDA Toolkit 11.x only accepts a
// cub::KeyValuePair<int, Tval> output; this kernel extracts the index from it.
template <class Tidx, class Tval>
static __global__ void castIdx(Tidx *result, const cub::KeyValuePair<int, Tval> *kv_pair) {
*result = kv_pair->key;
}
// Fill the index array that the sort expects
template <class Tidx>
static __global__ void fillIndices(Tidx *indices, int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n) {
indices[i] = i;
}
}
// The softmax used for random sampling reduces to a simple elementwise map:
// since the data is already sorted descending, the maximum is the first element.
// The first element is read by multiple blocks, so it must not be written here.
template <class T>
static __global__ void partialSoftmaxKernel(
T *__restrict__ data, int n,
float temperature) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (0 < i && i < n) {
float max = __ldg(data);
data[i] = (T)expf(((float)data[i] - max) / temperature);
}
}
// Write the first element as 1, i.e. exp(0)
template <class T>
static __global__ void setSoftmaxMaxKernel(
T *__restrict__ data) {
*data = 1;
}
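// e.g. with sorted = [4, 2, 1] and temperature = 1, the two kernels above
// leave [1, exp(-2), exp(-3)] in place.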
// Sample by scanning the sorted prefix sums with a plain for loop.
// This kernel exists only to avoid copying the data back to the CPU.
template <class Tval, class Tidx>
static __global__ void randomSampleKernel(
Tidx *__restrict__ result,
const Tval *__restrict__ sorted,
const Tidx *__restrict__ indices_out,
size_t n,
float random, float topp, size_t topk) {
topk = cub::Min()(topk, n);
auto p = (Tval)(random * cub::Min()(topp * (float)sorted[n - 1], (float)sorted[topk - 1]));
for (size_t i = 0;; ++i) {
if ((sorted[i]) >= p) {
*result = indices_out[i];
return;
}
}
}
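// After inclusiveSum, sorted[i] holds the prefix sum of the softmax weights,
// so the cutoff p = random * min(topp * sorted[n-1], sorted[topk-1]) applies
// top-p and top-k jointly; the first prefix sum >= p selects the sample.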
// ↑↑↑ Small kernels used by the sampling pass
struct Algo {
int block_size;
template <class Tidx, class Tval_>
infiniStatus_t argmax(
void *workspace, size_t workspace_size,
void *result, const void *probs, size_t n,
void *stream_) const {
using Tval = typename CudaTval<Tval_>::Type;
auto stream = (cudaStream_t)stream_;
auto logits = (Tval *)probs;
auto kv_pair = (cub::KeyValuePair<int, Tval> *)workspace;
workspace = (void *)((char *)workspace + 256);
workspace_size -= 256;
CHECK_CUDA(argMax_(
kv_pair,
logits,
n,
workspace,
workspace_size, stream));
castIdx<<<1, 1, 0, stream>>>((Tidx *)result, kv_pair);
return INFINI_STATUS_SUCCESS;
}
template <class Tidx, class Tval_>
infiniStatus_t random(
void *workspace_, size_t workspace_size,
void *result_, const void *probs, size_t n,
float random_val, float topp, int topk, float temperature,
void *stream_) const {
using Tval = typename CudaTval<Tval_>::Type;
auto stream = (cudaStream_t)stream_;
auto logits = (Tval *)probs;
auto result = (Tidx *)result_;
auto workspace = reinterpret_cast<size_t>(workspace_);
auto workspace_end = workspace + workspace_size;
auto indices = reinterpret_cast<Tidx *>(workspace);
workspace += align256(sizeof(Tidx) * n);
auto sorted = reinterpret_cast<Tval *>(workspace);
workspace += align256(sizeof(Tval) * n);
auto indices_out = reinterpret_cast<Tidx *>(workspace);
workspace += align256(sizeof(Tidx) * n);
workspace_ = reinterpret_cast<void *>(workspace);
workspace_size = workspace_end - workspace;
auto block = cub::Min()((size_t)block_size, n);
auto grid = (n + block - 1) / block;
// sort
fillIndices<<<grid, block, 0, stream>>>(indices, n);
CHECK_CUDA(radixSort(
workspace_, workspace_size,
logits, sorted,
indices, indices_out,
n,
stream));
// softmax
partialSoftmaxKernel<<<grid, block, 0, stream>>>(sorted, n, temperature);
setSoftmaxMaxKernel<<<1, 1, 0, stream>>>(sorted);
// sum
CHECK_CUDA(inclusiveSum(
workspace_, workspace_size,
sorted, n,
stream));
// sample
randomSampleKernel<<<1, 1, 0, stream>>>(
result,
sorted, indices_out, n,
random_val, topp, topk);
return INFINI_STATUS_SUCCESS;
}
};
} // namespace op::random_sample::cuda
#ifndef __RANDOM_SAMPLE_INFO_H__
#define __RANDOM_SAMPLE_INFO_H__
#include "../../../utils.h"
#include "../../tensor.h"
namespace op::random_sample {
struct RandomSampleInfo {
infiniDtype_t dt_i, dt_p;
size_t n;
static utils::Result<RandomSampleInfo> create(
infiniopTensorDescriptor_t result_desc,
infiniopTensorDescriptor_t probs_desc) {
auto dt_i = result_desc->dtype();
auto dt_p = probs_desc->dtype();
CHECK_DTYPE_ANY_INT(dt_i);
CHECK_DTYPE(dt_p, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
CHECK_OR_RETURN(result_desc->ndim() == 0, INFINI_STATUS_BAD_TENSOR_SHAPE);
CHECK_OR_RETURN(probs_desc->ndim() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE);
CHECK_OR_RETURN(probs_desc->stride(0) == 1, INFINI_STATUS_BAD_TENSOR_STRIDES);
return utils::Result<RandomSampleInfo>({dt_i, dt_p, probs_desc->dim(0)});
}
};
} // namespace op::random_sample
#endif // __RANDOM_SAMPLE_INFO_H__
......@@ -5,6 +5,9 @@
#ifdef ENABLE_CPU_API
#include "cpu/random_sample_cpu.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/random_sample_cuda.cuh"
#endif
__C infiniStatus_t infiniopCreateRandomSampleDescriptor(
infiniopHandle_t handle,
......@@ -25,6 +28,9 @@ __C infiniStatus_t infiniopCreateRandomSampleDescriptor(
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -38,9 +44,10 @@ __C infiniStatus_t infiniopGetRandomSampleWorkspaceSize(
size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: { \
using Ptr = const op::random_sample::NAMESPACE::Descriptor *; \
*size = reinterpret_cast<Ptr>(desc)->minWorkspaceSize(); \
} \
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
......@@ -48,6 +55,9 @@ __C infiniStatus_t infiniopGetRandomSampleWorkspaceSize(
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
GET(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -82,6 +92,9 @@ __C infiniStatus_t infiniopRandomSample(
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -103,6 +116,9 @@ __C infiniStatus_t infiniopDestroyRandomSampleDescriptor(
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
DELETE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......
#ifndef __RANDOM_SAMPLE_H__
#define __RANDOM_SAMPLE_H__
#include "../../../utils.h"
#include "../../operator.h"
#include "info.h"
#define DESCRIPTOR(NAMESPACE) \
\
......@@ -11,22 +11,18 @@
struct Opaque; \
Opaque *_opaque; \
\
RandomSampleInfo _info; \
size_t _min_workspace_size; \
\
Descriptor( \
RandomSampleInfo info, \
size_t min_workspace_size, \
Opaque *opaque, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_info(info), \
_min_workspace_size(min_workspace_size) {} \
\
public: \
......@@ -53,4 +49,96 @@
}; \
}
namespace op::random_sample {
struct CalculateArgs {
void *workspace;
size_t workspace_size;
void *result;
const void *probs;
float random_val, topp, temperature;
int topk;
void *stream;
};
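// Bundling the per-call arguments keeps the dtype-dispatch helpers below down
// to a single extra parameter each.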
class Calculate {
template <class Tidx, class Tval, class Algo>
static void switch_f(Algo algo, size_t n, CalculateArgs args) {
if (args.random_val == 0 || args.topp == 0 || args.topk == 1 || args.temperature == 0) {
algo.template argmax<Tidx, Tval>(
args.workspace, args.workspace_size,
args.result, args.probs, n,
args.stream);
} else {
algo.template random<Tidx, Tval>(
args.workspace, args.workspace_size,
args.result, args.probs, n,
args.random_val, args.topp, args.topk, args.temperature,
args.stream);
}
}
template <class Tidx, class Algo>
static void switch_val(
Algo algo,
infiniDtype_t dt_p, size_t n, CalculateArgs args) {
switch (dt_p) {
case INFINI_DTYPE_F16:
switch_f<Tidx, fp16_t>(algo, n, args);
break;
case INFINI_DTYPE_F32:
switch_f<Tidx, float>(algo, n, args);
break;
case INFINI_DTYPE_F64:
switch_f<Tidx, double>(algo, n, args);
break;
default:
// unreachable
std::abort();
}
}
public:
template <class Algo>
static infiniStatus_t calculate(
Algo algo,
RandomSampleInfo info,
void *workspace, size_t workspace_size,
void *result, const void *probs,
float random_val, float topp, int topk, float temperature,
void *stream) {
#define CASE(DT_VAL, DT_TYP) \
case DT_VAL: \
switch_val<DT_TYP>( \
algo, info.dt_p, info.n, \
{workspace, workspace_size, \
result, probs, \
random_val, topp, temperature, topk, \
stream}); \
break
switch (info.dt_i) {
CASE(INFINI_DTYPE_I8, int8_t);
CASE(INFINI_DTYPE_I16, int16_t);
CASE(INFINI_DTYPE_I32, int32_t);
CASE(INFINI_DTYPE_I64, int64_t);
CASE(INFINI_DTYPE_U8, uint8_t);
CASE(INFINI_DTYPE_U16, uint16_t);
CASE(INFINI_DTYPE_U32, uint32_t);
CASE(INFINI_DTYPE_U64, uint64_t);
default:
// unreachable
std::abort();
}
#undef CASE
return INFINI_STATUS_SUCCESS;
}
};
} // namespace op::random_sample
#endif // __RANDOM_SAMPLE_H__
#include "rearrange_ascend.h"
#include "../../../devices/ascend/common_ascend.h"
#include <aclnnop/aclnn_copy.h>
namespace op::rearrange::ascend {
struct Descriptor::Opaque {
aclnnTensorDescriptor_t dst;
aclnnTensorDescriptor_t src;
void *workspace; // aclnnInplaceCopy workspace
uint64_t workspace_size;
~Opaque() {
delete dst;
delete src;
aclrtFree(workspace);
}
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc) {
auto handle = reinterpret_cast<device::ascend::Handle *>(handle_);
auto dtype = y_desc->dtype();
auto ndim = y_desc->ndim();
auto shape = y_desc->shape();
CHECK_API_OR(x_desc->dtype(), dtype, return INFINI_STATUS_BAD_TENSOR_DTYPE);
CHECK_API_OR(x_desc->ndim(), ndim, return INFINI_STATUS_BAD_TENSOR_SHAPE);
for (size_t i = 0; i < ndim; ++i) {
CHECK_API_OR(x_desc->shape()[i], shape[i], return INFINI_STATUS_BAD_TENSOR_SHAPE);
}
auto dst_strides = y_desc->strides();
auto src_strides = x_desc->strides();
auto element_size = infiniSizeOf(dtype);
auto result = utils::RearrangeMeta::create(shape.data(), dst_strides.data(), src_strides.data(), ndim, element_size);
CHECK_RESULT(result);
aclnnTensorDescriptor_t dst = new aclnnTensorDescriptor(y_desc);
aclnnTensorDescriptor_t src = new aclnnTensorDescriptor(x_desc);
uint64_t workspace_size = 0;
aclOpExecutor *executor = nullptr;
void *workspace = nullptr;
CHECK_ACL(aclnnInplaceCopyGetWorkspaceSize(dst->tensor, src->tensor,
&workspace_size, &executor));
if (workspace_size != 0) {
CHECK_ACL(aclrtMalloc(&workspace, workspace_size, ACL_MEM_MALLOC_HUGE_FIRST));
}
*desc_ptr = new Descriptor(
result.take(),
new Opaque{
dst,
src,
workspace,
workspace_size},
handle->device,
handle->device_id);
// Delete useless executor
aclDestroyAclOpExecutor(executor);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *y,
const void *x,
void *stream) const {
auto tdst = _opaque->dst->tensor;
auto tsrc = _opaque->src->tensor;
uint64_t workspace_size = 0;
aclOpExecutor *executor = nullptr;
CHECK_ACL(aclnnInplaceCopyGetWorkspaceSize(tdst, tsrc, &workspace_size, &executor));
AclSetTensorAddr(executor, 0, tdst, y);
AclSetTensorAddr(executor, 1, tsrc, (void *)x);
// Execute InplaceCopy
CHECK_ACL(aclnnInplaceCopy(_opaque->workspace, _opaque->workspace_size,
executor, stream));
return INFINI_STATUS_SUCCESS;
}
} // namespace op::rearrange::ascend
#ifndef __REARRANGE_ASCEND_H__
#define __REARRANGE_ASCEND_H__
#include "../rearrange.h"
DESCRIPTOR(ascend)
#endif // __REARRANGE_ASCEND_H__
......@@ -5,6 +5,9 @@
#ifdef ENABLE_CPU_API
#include "cpu/rearrange_cpu.h"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/rearrange_ascend.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/rearrange_cuda.cuh"
......@@ -29,6 +32,9 @@ __C infiniStatus_t infiniopCreateRearrangeDescriptor(
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_ASCEND_API
CREATE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_CUDA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda);
......@@ -57,6 +63,9 @@ __C infiniStatus_t infiniopRearrange(
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_ASCEND_API
CALCULATE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_CUDA_API
CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
......@@ -82,6 +91,9 @@ __C infiniStatus_t infiniopDestroyRearrangeDescriptor(
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_ASCEND_API
DELETE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_CUDA_API
DELETE(INFINI_DEVICE_NVIDIA, cuda);
......
......@@ -10,12 +10,15 @@ struct Descriptor::Opaque {
aclnnTensorDescriptor_t w;
aclnnTensorDescriptor_t rstd;
size_t workspaceSize;
aclOpExecutor *executor;
~Opaque() {
delete y;
delete x;
delete w;
delete rstd;
aclDestroyAclOpExecutor(executor);
}
};
......@@ -62,17 +65,16 @@ infiniStatus_t Descriptor::create(
// Get WorkspaceSize and set executor
CHECK_ACL(aclnnRmsNormGetWorkspaceSize(tx, tw, static_cast<double>(epsilon), ty, trstd, &workspace_size, &executor));
CHECK_ACL(aclSetAclOpExecutorRepeatable(executor));
auto handle_ascend = reinterpret_cast<device::ascend::Handle *>(handle);
size_t all_workspace_size = workspace_size + rstd->numel() * aclDataTypeSize(rstd->dataType);
*desc_ptr = new Descriptor(
new Opaque{y, x, w, rstd, workspace_size, executor},
std::move(info),
all_workspace_size,
handle_ascend->device, handle_ascend->device_id);
return INFINI_STATUS_SUCCESS;
}
......@@ -88,21 +90,16 @@ infiniStatus_t Descriptor::calculate(
auto tx = _opaque->x->tensor;
auto ty = _opaque->y->tensor;
auto trstd = _opaque->rstd->tensor;
void *rstdPtr = (void *)((uint8_t *)workspace + _opaque->workspaceSize);
auto unit = infiniSizeOf(_info.atype);
AclSetTensorAddr(_opaque->executor, 1, tw, (void *)w);
AclSetTensorAddr(_opaque->executor, 3, trstd, rstdPtr);
for (size_t i = 0; i < (_info.shape)[0]; ++i) {
AclSetTensorAddr(_opaque->executor, 0, tx, ((char *)x) + i * (_info.x_strides)[0] * unit);
AclSetTensorAddr(_opaque->executor, 2, ty, ((char *)y) + i * (_info.y_strides)[0] * unit);
CHECK_ACL(aclnnRmsNorm(workspace, _opaque->workspaceSize, _opaque->executor, stream));
}
return INFINI_STATUS_SUCCESS;
}
......
#ifndef __RMS_NORM_KUNLUN_KERNEL_XPU__
#define __RMS_NORM_KUNLUN_KERNEL_XPU__
#include "../../../devices/kunlun/kunlun_common.h"
#include "../../../devices/kunlun/kunlun_kernel_common.h"
#include "../../../reduce/kunlun/reduce_kunlun.h"
using namespace device::kunlun::kernel;
// Element wise mul used in x * w
static inline __device__ void elementwiseMulRms(float *x, float *w, float *y, int count, float rms) {
int remain = count % 16;
......
#include "rope_ascend.h"
#include "../../../devices/ascend/common_ascend.h"
namespace op::rope::ascend {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t pos_desc,
infiniopTensorDescriptor_t sin_desc,
infiniopTensorDescriptor_t cos_desc) {
auto handle_ascend = reinterpret_cast<device::ascend::Handle *>(handle);
auto result = RoPEInfo::createRoPEInfo(y_desc, x_desc, pos_desc, sin_desc, cos_desc);
CHECK_RESULT(result);
size_t workspace_size = 0;
*desc_ptr = new Descriptor(result.take(), workspace_size, nullptr, handle_ascend->device, handle_ascend->device_id);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *y,
const void *x,
const void *pos_ids,
const void *sin_table,
const void *cos_table,
void *stream) const {
CHECK_DTYPE(_info.data_type, INFINI_DTYPE_F32, INFINI_DTYPE_F16);
auto data_type = _info.data_type;
auto pos_type = _info.pos_type;
auto seq_len = _info.seqlen;
auto nhead = _info.nhead;
auto dhead = _info.dhead;
auto y_stride_seqlen = _info.y_stride_seqlen;
auto y_stride_nhead = _info.y_stride_nhead;
auto x_stride_seqlen = _info.x_stride_seqlen;
auto x_stride_nhead = _info.x_stride_nhead;
return rope_kernel_launch(y, (void *)x, (void *)pos_ids, (void *)sin_table, (void *)cos_table, seq_len, nhead, dhead, data_type, pos_type, y_stride_seqlen, y_stride_nhead, x_stride_seqlen, x_stride_nhead, stream);
}
} // namespace op::rope::ascend
#ifndef __ACLNN_ROPE_H__
#define __ACLNN_ROPE_H__
#include "../rope.h"
extern "C" infiniStatus_t rope_kernel_launch(
void *y,
void *x,
void *pos,
void *sin,
void *cos,
size_t seq_len,
size_t nhead,
size_t dhead,
infiniDtype_t data_type,
infiniDtype_t pos_type,
ptrdiff_t y_stride_seqlen,
ptrdiff_t y_stride_nhead,
ptrdiff_t x_stride_seqlen,
ptrdiff_t x_stride_nhead,
void *stream);
DESCRIPTOR(ascend)
#endif // __ACLNN_ROPE_H__
#include "../../../devices/ascend/ascend_kernel_common.h"
using namespace AscendC;
template <typename T, typename U>
class RoPEKernel {
public:
__aicore__ inline RoPEKernel() {}
// Init op
// pos position vector
// x input tensor
// y output tensor
// tensor shape [nt, nh, dh]
// make block_num = nh, tile_len = dh
__aicore__ inline void init(GM_ADDR y,
GM_ADDR x,
GM_ADDR pos,
GM_ADDR sin,
GM_ADDR cos,
size_t dh,
ptrdiff_t st_ynt,
ptrdiff_t st_ynh,
ptrdiff_t st_xnt,
ptrdiff_t st_xnh);
__aicore__ inline void process(size_t seq_len);
private:
// Copy a tile into UB
__aicore__ inline void copyIn(size_t i);
__aicore__ inline void compute(size_t i);
__aicore__ inline void copyOut(size_t i);
private:
TPipe pipe;
TQue<QuePosition::VECIN, BUFFER_NUM> _in_que;
TQue<QuePosition::VECIN, BUFFER_NUM> _sin_que;
TQue<QuePosition::VECIN, BUFFER_NUM> _cos_que;
TQue<QuePosition::VECOUT, BUFFER_NUM> _out_que;
TBuf<TPosition::VECCALC> _tmp_odd_buf;
TBuf<TPosition::VECCALC> _tmp_even_buf;
TBuf<TPosition::VECCALC> _tmp_odd_buf1;
TBuf<TPosition::VECCALC> _tmp_odd_buf2;
TBuf<TPosition::VECCALC> _tmp_even_buf1;
TBuf<TPosition::VECCALC> _tmp_even_buf2;
GlobalTensor<T> _x_gm, _y_gm;
GlobalTensor<U> _p_gm;
GlobalTensor<T> _sin_gm;
GlobalTensor<T> _cos_gm;
size_t _block_idx;
size_t _tile_len;
size_t _copy_len;
size_t _half_copy_len;
// stridey[_st_ynt, _st_ynh, 1]
ptrdiff_t _st_ynt;
ptrdiff_t _st_ynh;
// stridex[_st_xnt, _st_xnh, 1]
ptrdiff_t _st_xnt;
ptrdiff_t _st_xnh;
};
template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::init(GM_ADDR y,
GM_ADDR x,
GM_ADDR pos,
GM_ADDR sin,
GM_ADDR cos,
size_t dh,
ptrdiff_t st_ynt,
ptrdiff_t st_ynh,
ptrdiff_t st_xnt,
ptrdiff_t st_xnh) {
this->_tile_len = dh;
this->_st_ynt = st_ynt;
this->_st_ynh = st_ynh;
this->_st_xnt = st_xnt;
this->_st_xnh = st_xnh;
_copy_len = alignTileLen<T>(dh, BYTE_ALIGN);
_half_copy_len = alignTileLen<T>(dh / 2, BYTE_ALIGN);
_block_idx = GetBlockIdx();
// Init global buffer
_x_gm.SetGlobalBuffer((__gm__ T *)x);
_p_gm.SetGlobalBuffer((__gm__ U *)pos);
_sin_gm.SetGlobalBuffer((__gm__ T *)sin);
_cos_gm.SetGlobalBuffer((__gm__ T *)cos);
_y_gm.SetGlobalBuffer((__gm__ T *)y);
// Init Queue buffer
pipe.InitBuffer(_in_que, BUFFER_NUM, _copy_len * sizeof(T));
pipe.InitBuffer(_out_que, BUFFER_NUM, _tile_len * sizeof(T));
pipe.InitBuffer(_sin_que, BUFFER_NUM, _half_copy_len * sizeof(T));
pipe.InitBuffer(_cos_que, BUFFER_NUM, _half_copy_len * sizeof(T));
pipe.InitBuffer(_tmp_odd_buf, _tile_len / 2 * sizeof(T));
pipe.InitBuffer(_tmp_even_buf, _tile_len / 2 * sizeof(T));
pipe.InitBuffer(_tmp_odd_buf1, _tile_len / 2 * sizeof(T));
pipe.InitBuffer(_tmp_odd_buf2, _tile_len / 2 * sizeof(T));
pipe.InitBuffer(_tmp_even_buf1, _tile_len / 2 * sizeof(T));
pipe.InitBuffer(_tmp_even_buf2, _tile_len / 2 * sizeof(T));
}
template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::copyIn(size_t i) {
LocalTensor<T> input_ub = _in_que.AllocTensor<T>();
LocalTensor<T> sin_ub = _sin_que.AllocTensor<T>();
LocalTensor<T> cos_ub = _cos_que.AllocTensor<T>();
// Get idx of current tile in total input
auto idx = i * _st_xnt + _block_idx * _st_xnh;
// Copy tile current tile into UB
DataCopy(input_ub, _x_gm[idx], _copy_len);
// Copy sin cos tile
auto pos_idx = _p_gm(i);
DataCopy(sin_ub, _sin_gm[pos_idx * _tile_len / 2], _half_copy_len);
DataCopy(cos_ub, _cos_gm[pos_idx * _tile_len / 2], _half_copy_len);
// Push in operands
_in_que.EnQue(input_ub);
_sin_que.EnQue(sin_ub);
_cos_que.EnQue(cos_ub);
}
template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::compute(size_t i) {
LocalTensor<T> input_ub = _in_que.DeQue<T>();
LocalTensor<T> sin_ub = _sin_que.DeQue<T>();
LocalTensor<T> cos_ub = _cos_que.DeQue<T>();
LocalTensor<T> output_ub = _out_que.AllocTensor<T>();
LocalTensor<T> tmp_odd = _tmp_odd_buf.Get<T>();
LocalTensor<T> tmp_even = _tmp_even_buf.Get<T>();
LocalTensor<T> tmp_odd1 = _tmp_odd_buf1.Get<T>();
LocalTensor<T> tmp_odd2 = _tmp_odd_buf2.Get<T>();
LocalTensor<T> tmp_even1 = _tmp_even_buf1.Get<T>();
LocalTensor<T> tmp_even2 = _tmp_even_buf2.Get<T>();
// separate odd and even bit elements
uint64_t rsvdCnt = 0;
GatherMaskParams gMaskParams = {
1,
static_cast<uint16_t>((_tile_len * sizeof(T) + 255) / 256), // repeatTimes, must not exceed 255
8,
8,
};
GatherMask<T>(tmp_odd, input_ub, 1, false, 0, gMaskParams, rsvdCnt);
GatherMask<T>(tmp_even, input_ub, 2, false, 0, gMaskParams, rsvdCnt);
PipeBarrier<PIPE_V>();
// compute odd bit elements
// y_odd = x_odd * cos - x_even * sin
Mul<T>(tmp_odd1, tmp_odd, cos_ub, _tile_len / 2);
Mul<T>(tmp_odd2, tmp_even, sin_ub, _tile_len / 2);
PipeBarrier<PIPE_V>();
Sub<T>(tmp_odd1, tmp_odd1, tmp_odd2, _tile_len / 2);
// compute even bit elements
// y_even = x_odd * sin + x_even * cos
Mul<T>(tmp_even1, tmp_odd, sin_ub, _tile_len / 2);
Mul<T>(tmp_even2, tmp_even, cos_ub, _tile_len / 2);
PipeBarrier<PIPE_V>();
Add<T>(tmp_even1, tmp_even1, tmp_even2, _tile_len / 2);
// combine odd and even bit elements
for (uint32_t j = 0; j < _tile_len / 2; j += 1) {
output_ub(j * 2) = tmp_odd1(j);
output_ub(j * 2 + 1) = tmp_even1(j);
}
_out_que.EnQue<T>(output_ub);
_in_que.FreeTensor(input_ub);
_sin_que.FreeTensor(sin_ub);
_cos_que.FreeTensor(cos_ub);
}
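// For one (odd, even) element pair at sin/cos table row p, e.g. for j = 0:
//   y[0] = x[0] * cos[p][0] - x[1] * sin[p][0]
//   y[1] = x[0] * sin[p][0] + x[1] * cos[p][0]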
template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::copyOut(size_t i) {
LocalTensor<T> output_ub = _out_que.DeQue<T>();
auto idy = i * _st_ynt + _block_idx * _st_ynh;
DataCopyExtParams params = {1, static_cast<uint32_t>(_tile_len * sizeof(T)), 0, 0, 0};
DataCopyPad(_y_gm[idy], output_ub, params);
_out_que.FreeTensor(output_ub);
}
template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::process(size_t seq_len) {
for (size_t i = 0; i < seq_len; ++i) {
copyIn(i);
compute(i);
copyOut(i);
}
}
#define ROPE_KERNEL_INIT_ARGS y, x, pos, sin, cos, dhead, \
y_stride_seqlen, y_stride_nhead, \
x_stride_seqlen, x_stride_nhead
#define CASE_POSTYPE(POS_TYPE_ENUM, TYPE, POS_T) \
case POS_TYPE_ENUM: { \
RoPEKernel<TYPE, POS_T> op; \
op.init(ROPE_KERNEL_INIT_ARGS); \
op.process(seq_len); \
break; \
}
#define ROPE_KERNEL(TYPE, POSTYPE) \
switch (POSTYPE) { \
CASE_POSTYPE(INFINI_DTYPE_I8, TYPE, int8_t) \
CASE_POSTYPE(INFINI_DTYPE_I16, TYPE, int16_t) \
CASE_POSTYPE(INFINI_DTYPE_I32, TYPE, int32_t) \
CASE_POSTYPE(INFINI_DTYPE_I64, TYPE, int64_t) \
CASE_POSTYPE(INFINI_DTYPE_U8, TYPE, uint8_t) \
CASE_POSTYPE(INFINI_DTYPE_U16, TYPE, uint16_t) \
CASE_POSTYPE(INFINI_DTYPE_U32, TYPE, uint32_t) \
CASE_POSTYPE(INFINI_DTYPE_U64, TYPE, uint64_t) \
default: \
break; \
}
#define DEFINE_ROPE_KERNEL(KERNEL_NAME, TYPE) \
__global__ __aicore__ void KERNEL_NAME(GM_ADDR y, \
GM_ADDR x, \
GM_ADDR pos, \
GM_ADDR sin, \
GM_ADDR cos, \
size_t seq_len, \
size_t dhead, \
ptrdiff_t y_stride_seqlen, \
ptrdiff_t y_stride_nhead, \
ptrdiff_t x_stride_seqlen, \
ptrdiff_t x_stride_nhead, \
int32_t pos_type) { \
ROPE_KERNEL(TYPE, pos_type) \
}
DEFINE_ROPE_KERNEL(rope_kernel_float, float)
DEFINE_ROPE_KERNEL(rope_kernel_half, half)
#undef DEFINE_ROPE_KERNEL
#undef ROPE_KERNEL
#undef CASE_POSTYPE
#undef ROPE_KERNEL_INIT_ARGS
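// Launch configuration: one AscendC block per attention head (block_num ==
// nhead); each block walks the sequence dimension inside process().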
extern "C" infiniStatus_t rope_kernel_launch(
void *y,
void *x,
void *pos,
void *sin,
void *cos,
size_t seq_len,
size_t nhead,
size_t dhead,
infiniDtype_t dtype,
infiniDtype_t pos_type,
ptrdiff_t y_stride_seqlen,
ptrdiff_t y_stride_nhead,
ptrdiff_t x_stride_seqlen,
ptrdiff_t x_stride_nhead,
void *stream) {
#define LAUNCH_ROPE_KERNEL(DTYPE_ENUM, KERNEL_NAME) \
case DTYPE_ENUM: \
KERNEL_NAME<<<nhead, nullptr, stream>>>(y, x, pos, sin, cos, \
seq_len, \
dhead, \
y_stride_seqlen, \
y_stride_nhead, \
x_stride_seqlen, \
x_stride_nhead, \
pos_type); \
return INFINI_STATUS_SUCCESS;
switch (dtype) {
LAUNCH_ROPE_KERNEL(INFINI_DTYPE_F16, rope_kernel_half)
LAUNCH_ROPE_KERNEL(INFINI_DTYPE_F32, rope_kernel_float)
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
......@@ -8,6 +8,9 @@
#ifdef ENABLE_CUDA_API
#include "cuda/rope_cuda.cuh"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/rope_ascend.h"
#endif
__C infiniStatus_t infiniopCreateRoPEDescriptor(
infiniopHandle_t handle,
......@@ -43,12 +46,8 @@ __C infiniStatus_t infiniopCreateRoPEDescriptor(
pos_ids, sin_table, cos_table);
}
#endif
#ifdef ENABLE_ASCEND_API
CREATE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
......@@ -90,10 +89,8 @@ __C infiniStatus_t infiniopGetRoPEWorkspaceSize(infiniopRoPEDescriptor_t desc,
return bangGetRoPEWorkspaceSize((RoPEBangDescriptor_t)desc, size);
}
#endif
#ifdef ENABLE_ASCEND_API
GET(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
......@@ -141,12 +138,8 @@ __C infiniStatus_t infiniopRoPE(
t, pos_ids, sin_table, cos_table, stream);
}
#endif
#ifdef ENABLE_ASCEND_API
CALCULATE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
......@@ -187,10 +180,8 @@ infiniopDestroyRoPEDescriptor(infiniopRoPEDescriptor_t desc) {
return bangDestroyRoPEDescriptor((RoPEBangDescriptor_t)desc);
}
#endif
#ifdef ENABLE_ASCEND_API
DELETE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
......
#include "swiglu_ascend.h"
#include "../../../devices/ascend/common_ascend.h"
namespace op::swiglu::ascend {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(infiniopHandle_t handle, Descriptor **desc_ptr,
infiniopTensorDescriptor_t c_desc,
std::vector<infiniopTensorDescriptor_t> input_descs) {
auto handle_ascend = reinterpret_cast<device::ascend::Handle *>(handle);
auto dtype = c_desc->dtype();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
const auto &a_desc = input_descs[0];
const auto &b_desc = input_descs[1];
auto result = SwigluInfo::create(c_desc, a_desc, b_desc);
CHECK_RESULT(result);
SwigluInfo info = result.take();
// https://www.hiascend.com/document/detail/zh/canncommercial/800/apiref/ascendcopapi/atlasascendc_api_07_0777.html
size_t workspace_size = 0;
*desc_ptr = new Descriptor(std::move(info), workspace_size, handle_ascend->device, handle_ascend->device_id);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(void *workspace,
size_t workspace_size,
void *c,
std::vector<const void *> inputs,
void *stream) const {
auto batch = _info.ndim == 2 ? 1 : _info.shape[0];
auto seq_len = _info.ndim == 2 ? _info.shape[0] : _info.shape[1];
auto hidden_size = _info.shape[_info.ndim - 1];
auto stride_batch_c = _info.ndim == 2 ? 1 : _info.c_strides[0];
auto stride_batch_a = _info.ndim == 2 ? 1 : _info.a_strides[0];
auto stride_batch_b = _info.ndim == 2 ? 1 : _info.b_strides[0];
auto stride_seq_c = _info.ndim == 2 ? _info.c_strides[0] : _info.c_strides[1];
auto stride_seq_a = _info.ndim == 2 ? _info.a_strides[0] : _info.a_strides[1];
auto stride_seq_b = _info.ndim == 2 ? _info.b_strides[0] : _info.b_strides[1];
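// For 2-D inputs ([seq_len, hidden]) the tensors are treated as a single
// batch: the batch stride degenerates to 1 and the leading stride becomes the
// sequence stride; for 3-D inputs the leading stride is the batch stride.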
auto status = swiglu_kernel_launch(c, (void *)inputs[0], (void *)inputs[1], _info.dtype, batch, seq_len, hidden_size, stride_batch_c, stride_batch_a, stride_batch_b, stride_seq_c, stride_seq_a, stride_seq_b, stream);
return status;
}
} // namespace op::swiglu::ascend