Commit ce2c4813 authored by Catheriany

issue/228: update the clip operator

parents 6bb801f6 6ca0e313
......@@ -6,6 +6,7 @@
#include "infiniop/ops/attention.h"
#include "infiniop/ops/avg_pool.h"
#include "infiniop/ops/causal_softmax.h"
#include "infiniop/ops/clip.h"
#include "infiniop/ops/conv.h"
#include "infiniop/ops/expand.h"
#include "infiniop/ops/gemm.h"
......
#ifndef __INFINIOP_CLIP_API_H__
#define __INFINIOP_CLIP_API_H__
#include "../operator_descriptor.h"
typedef struct InfiniopDescriptor *infiniopClipDescriptor_t;
__C __export infiniStatus_t infiniopCreateClipDescriptor(infiniopHandle_t handle,
infiniopClipDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y,
infiniopTensorDescriptor_t x,
infiniopTensorDescriptor_t min_val,
infiniopTensorDescriptor_t max_val);
__C __export infiniStatus_t infiniopGetClipWorkspaceSize(infiniopClipDescriptor_t desc, size_t *size);
__C __export infiniStatus_t infiniopClip(infiniopClipDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
const void *x,
const void *min_val,
const void *max_val,
void *stream);
__C __export infiniStatus_t infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc);
#endif
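For reference, a minimal host-side call sequence for this API (a sketch mirroring the test runner further below; CHECK, the descriptor variables, and the data pointers are placeholders for the caller's own handles and error handling):

    infiniopClipDescriptor_t desc;
    CHECK(infiniopCreateClipDescriptor(handle, &desc, y_desc, x_desc, min_desc, max_desc));
    size_t workspace_size;
    CHECK(infiniopGetClipWorkspaceSize(desc, &workspace_size));
    void *workspace;
    CHECK(infinirtMalloc(&workspace, workspace_size));
    // y = clamp(x, min_val, max_val), elementwise over same-shape tensors
    CHECK(infiniopClip(desc, workspace, workspace_size, y_data, x_data, min_data, max_data, stream));
    CHECK(infinirtFree(workspace));
    CHECK(infiniopDestroyClipDescriptor(desc));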
......@@ -6,10 +6,10 @@
typedef struct InfiniopDescriptor *infiniopMulDescriptor_t;
__C __export infiniStatus_t infiniopCreateMulDescriptor(infiniopHandle_t handle,
infiniopMulDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c,
infiniopTensorDescriptor_t a,
infiniopTensorDescriptor_t b);
__C __export infiniStatus_t infiniopGetMulWorkspaceSize(infiniopMulDescriptor_t desc, size_t *size);
......@@ -20,7 +20,7 @@ __C __export infiniStatus_t infiniopMul(infiniopMulDescriptor_t desc,
const void *a,
const void *b,
void *stream);
__C __export infiniStatus_t infiniopDestroyMulDescriptor(infiniopMulDescriptor_t desc);
#endif
......@@ -8,6 +8,7 @@
DECLARE_INFINIOP_TEST(gemm)
DECLARE_INFINIOP_TEST(random_sample)
DECLARE_INFINIOP_TEST(mul)
DECLARE_INFINIOP_TEST(clip)
DECLARE_INFINIOP_TEST(swiglu)
DECLARE_INFINIOP_TEST(add)
......@@ -30,6 +31,7 @@ DECLARE_INFINIOP_TEST(add)
REGISTER_INFINIOP_TEST(random_sample) \
REGISTER_INFINIOP_TEST(add) \
REGISTER_INFINIOP_TEST(mul) \
REGISTER_INFINIOP_TEST(clip) \
REGISTER_INFINIOP_TEST(swiglu) \
}
......
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace infiniop_test::clip {
struct Test::Attributes {
std::shared_ptr<Tensor> x;
std::shared_ptr<Tensor> min_val;
std::shared_ptr<Tensor> max_val;
std::shared_ptr<Tensor> y;
std::shared_ptr<Tensor> ans;
};
std::shared_ptr<Test> Test::build(
std::unordered_map<std::string, std::vector<uint8_t>> attributes,
std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
double rtol, double atol) {
auto test = std::shared_ptr<Test>(new Test(rtol, atol));
test->_attributes = new Attributes();
if (tensors.find("x") == tensors.end()
|| tensors.find("min_val") == tensors.end()
|| tensors.find("max_val") == tensors.end()
|| tensors.find("y") == tensors.end()
|| tensors.find("ans") == tensors.end()) {
throw std::runtime_error("Invalid Test");
}
test->_attributes->x = tensors["x"];
test->_attributes->min_val = tensors["min_val"];
test->_attributes->max_val = tensors["max_val"];
test->_attributes->y = tensors["y"];
test->_attributes->ans = tensors["ans"];
return test;
}
std::shared_ptr<infiniop_test::Result> Test::run(
infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
infiniopClipDescriptor_t op_desc;
auto x = _attributes->x->to(device, device_id);
auto min_val = _attributes->min_val->to(device, device_id);
auto max_val = _attributes->max_val->to(device, device_id);
auto y = _attributes->y->to(device, device_id);
CHECK_OR(infiniopCreateClipDescriptor(handle, &op_desc,
y->desc(),
x->desc(),
min_val->desc(),
max_val->desc()),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to create clip descriptor."));
size_t workspace_size;
CHECK_OR(infiniopGetClipWorkspaceSize(op_desc, &workspace_size),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
void *workspace;
CHECK_OR(infinirtMalloc(&workspace, workspace_size),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
CHECK_OR(infiniopClip(op_desc, workspace, workspace_size,
y->data(),
x->data(),
min_val->data(),
max_val->data(),
nullptr),
return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));
try {
allClose(y, _attributes->ans, _rtol, _atol);
} catch (const std::exception &e) {
return TEST_FAILED(RESULT_INCORRECT, e.what());
}
double elapsed_time = benchmark(
[=]() {
infiniopClip(
op_desc, workspace, workspace_size,
y->data(),
x->data(),
min_val->data(),
max_val->data(),
nullptr);
},
warm_ups, iterations);
infiniopDestroyClipDescriptor(op_desc);
infinirtFree(workspace);
return TEST_PASSED(elapsed_time);
}
std::vector<std::string> Test::attribute_names() {
return {};
}
std::vector<std::string> Test::tensor_names() {
return {"x", "min_val", "max_val", "y", "ans"};
}
std::vector<std::string> Test::output_names() {
return {"y"};
}
std::string Test::toString() const {
std::ostringstream oss;
oss << op_name() << std::endl;
oss << "- x: " << _attributes->x->info() << std::endl;
oss << "- min_val: " << _attributes->min_val->info() << std::endl;
oss << "- max_val: " << _attributes->max_val->info() << std::endl;
oss << "- y: " << _attributes->y->info() << std::endl;
oss << std::scientific << std::setprecision(2);
oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
return oss.str();
}
Test::~Test() {
delete _attributes;
}
} // namespace infiniop_test::clip
......@@ -23,10 +23,9 @@ include_directories(
${CMAKE_SOURCE_DIR}/../../../../include/infiniop/
)
ascendc_library(ascend_kernels STATIC
../../ops/swiglu/ascend/swiglu_ascend_kernel.cpp
# ../../ops/rotary_embedding/ascend/rotary_embedding_kernel.cpp
../../ops/rope/ascend/rope_ascend_kernel.cpp
# ../../ops/random_sample/ascend/random_sample_kernel.cpp
)
......@@ -4,8 +4,17 @@
#include "../../../../include/infinicore.h"
#include "kernel_operator.h"
constexpr int32_t BLOCK_NUM = 8;
constexpr int32_t BUFFER_NUM = 2;
constexpr int32_t BYTE_ALIGN = 32;
constexpr size_t BLOCK_NUM = 8;
constexpr size_t BUFFER_NUM = 2;
constexpr size_t BYTE_ALIGN = 32;
template <typename T>
__aicore__ inline size_t alignTileLen(size_t tile_len, size_t byte_align) {
size_t bytes = tile_len * sizeof(T);
size_t aligned_bytes = (bytes % byte_align == 0)
? bytes
: (bytes + (byte_align - bytes % byte_align));
return aligned_bytes / sizeof(T);
}
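// Example: with T = half (2 bytes) and tile_len = 10, bytes = 20 rounds up
// to the next 32-byte boundary (32), so the aligned tile length is
// 32 / 2 = 16 elements; a tile_len whose byte size is already a multiple
// of byte_align is returned unchanged.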
#endif
......@@ -20,15 +20,14 @@ struct InfiniopAttentionDescriptor {
infiniopGemmDescriptor_t matmul_desc1;
infiniopGemmDescriptor_t matmul_desc2;
infiniopCausalSoftmaxDescriptor_t softmax_desc;
uint64_t workspace_size;
uint64_t rearranged_q_size;
uint64_t matmul1_workspace_size;
uint64_t matmul1_tensor_size;
uint64_t matmul2_workspace_size;
uint64_t matmul2_tensor_size;
uint64_t softmax_workspace_size;
uint64_t k_cache_offset;
uint64_t v_cache_offset;
size_t workspace_size;
size_t op_workspace_offset;
size_t op_workspace_size;
size_t q_cont_offset;
size_t att_score_offset;
size_t att_val_offset;
size_t k_cache_offset;
size_t v_cache_offset;
float qk_alpha;
};
......@@ -40,7 +39,7 @@ __C __export infiniStatus_t infiniopCreateAttentionDescriptor(infiniopHandle_t h
infiniopTensorDescriptor_t v_desc,
infiniopTensorDescriptor_t k_cache_desc,
infiniopTensorDescriptor_t v_cache_desc,
uint64_t pos) {
size_t pos) {
if (out_desc->ndim() != 3 || q_desc->ndim() != 3 || k_desc->ndim() != 3 || v_desc->ndim() != 3 || k_cache_desc->ndim() != 3 || v_cache_desc->ndim() != 3) {
return INFINI_STATUS_BAD_TENSOR_SHAPE;
}
......@@ -53,13 +52,14 @@ __C __export infiniStatus_t infiniopCreateAttentionDescriptor(infiniopHandle_t h
return INFINI_STATUS_BAD_TENSOR_STRIDES;
}
uint64_t n_q_head = q_desc->shape()[0];
uint64_t seq_len = q_desc->shape()[1];
uint64_t head_dim = q_desc->shape()[2];
uint64_t hidden_size = n_q_head * head_dim;
uint64_t n_kv_head = k_desc->shape()[0];
uint64_t total_seq_len = seq_len + pos;
uint64_t n_group = n_q_head / n_kv_head;
size_t n_q_head = q_desc->shape()[0];
size_t seq_len = q_desc->shape()[1];
size_t head_dim = q_desc->shape()[2];
size_t hidden_size = n_q_head * head_dim;
size_t n_kv_head = k_desc->shape()[0];
size_t total_seq_len = seq_len + pos;
size_t n_group = n_q_head / n_kv_head;
size_t alignment = 256;
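// n_group is the number of query heads sharing one KV head (grouped-query
// attention); alignment keeps every sub-buffer offset 256-byte aligned.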
if (out_desc->shape()[0] != seq_len || out_desc->shape()[1] != n_q_head || out_desc->shape()[2] != head_dim) {
return INFINI_STATUS_BAD_PARAM;
......@@ -98,12 +98,12 @@ __C __export infiniStatus_t infiniopCreateAttentionDescriptor(infiniopHandle_t h
CHECK_STATUS(infiniopCreateRearrangeDescriptor(handle, &rearrange_desc_v, dst_v_desc, v_desc));
infiniopRearrangeDescriptor_t rearrange_desc_q = nullptr;
uint64_t rearranged_q_size = 0;
size_t q_cont_size = 0;
infiniopTensorDescriptor_t rearranged_q_desc;
// Rearrange q into contiguous
if (!q_desc->isContiguous(0, 1)) {
CHECK_STATUS(infiniopCreateTensorDescriptor(&rearranged_q_desc, 3, q_desc->shape().data(), nullptr, q_desc->dtype()));
rearranged_q_size = rearranged_q_desc->numel() * infiniSizeOf(rearranged_q_desc->dtype());
q_cont_size = utils::align(rearranged_q_desc->numel() * infiniSizeOf(rearranged_q_desc->dtype()), alignment);
rearrange_desc_q = new InfiniopDescriptor;
CHECK_STATUS(infiniopCreateRearrangeDescriptor(handle, &rearrange_desc_q, rearranged_q_desc, q_desc));
}
......@@ -116,12 +116,12 @@ __C __export infiniStatus_t infiniopCreateAttentionDescriptor(infiniopHandle_t h
TRANSFORM_TENSOR_DESC(reshaped_q_desc, dimMerge(1, 2));
// full_k: [n_kv_head, head_dim, total_seq_len]
infiniopTensorDescriptor_t full_k_desc;
uint64_t full_k_shape[3] = {n_kv_head, total_seq_len, head_dim};
size_t full_k_shape[3] = {n_kv_head, total_seq_len, head_dim};
CHECK_STATUS(infiniopCreateTensorDescriptor(&full_k_desc, 3, full_k_shape, k_cache_desc->strides().data(), k_cache_desc->dtype()));
TRANSFORM_TENSOR_DESC(full_k_desc, dimPermute({0, 2, 1}));
// qk: [n_kv_head, n_group * seq_len, total_seq_len]
infiniopTensorDescriptor_t qk_desc;
uint64_t qk_shape[3] = {n_kv_head, n_group * seq_len, total_seq_len};
size_t qk_shape[3] = {n_kv_head, n_group * seq_len, total_seq_len};
CHECK_STATUS(infiniopCreateTensorDescriptor(&qk_desc, 3, qk_shape, nullptr, q_desc->dtype()));
// matmul1_desc
// qk_alpha
......@@ -129,10 +129,11 @@ __C __export infiniStatus_t infiniopCreateAttentionDescriptor(infiniopHandle_t h
infiniopGemmDescriptor_t matmul1_desc;
CHECK_STATUS(infiniopCreateGemmDescriptor(handle, &matmul1_desc, qk_desc, reshaped_q_desc, full_k_desc));
// matmul1 workspace size
uint64_t matmul1_workspace_size;
size_t matmul1_workspace_size;
CHECK_STATUS(infiniopGetGemmWorkspaceSize(matmul1_desc, &matmul1_workspace_size));
// matmul1 tensor size
uint64_t matmul1_tensor_size = qk_desc->numel() * infiniSizeOf(qk_desc->dtype());
matmul1_workspace_size = utils::align(matmul1_workspace_size, alignment);
// attention score tensor size
size_t attn_score_size = utils::align(qk_desc->numel() * infiniSizeOf(qk_desc->dtype()), alignment);
// CausalSoftmax: softmax(qk)
// qk: [n_kv_head, n_group * seq_len, total_seq_len] -> [n_q_head, seq_len, total_seq_len]
......@@ -141,8 +142,9 @@ __C __export infiniStatus_t infiniopCreateAttentionDescriptor(infiniopHandle_t h
infiniopCausalSoftmaxDescriptor_t softmax_desc;
CHECK_STATUS(infiniopCreateCausalSoftmaxDescriptor(handle, &softmax_desc, qk_desc, qk_desc));
// softmax workspace size
uint64_t softmax_workspace_size;
size_t softmax_workspace_size;
CHECK_STATUS(infiniopGetCausalSoftmaxWorkspaceSize(softmax_desc, &softmax_workspace_size));
softmax_workspace_size = utils::align(softmax_workspace_size, alignment);
// Matmul2: softmax(qk) * full_v
// softmax(qk): [n_q_head, seq_len, total_seq_len] -> [n_kv_head, n_group * seq_len, total_seq_len]
......@@ -150,41 +152,44 @@ __C __export infiniStatus_t infiniopCreateAttentionDescriptor(infiniopHandle_t h
TRANSFORM_TENSOR_DESC(qk_desc, dimSplit(0, {n_kv_head, n_group}));
TRANSFORM_TENSOR_DESC(qk_desc, dimMerge(1, 2));
infiniopTensorDescriptor_t full_v_desc;
uint64_t full_v_shape[3] = {n_kv_head, total_seq_len, head_dim};
size_t full_v_shape[3] = {n_kv_head, total_seq_len, head_dim};
CHECK_STATUS(infiniopCreateTensorDescriptor(&full_v_desc, 3, full_v_shape, v_cache_desc->strides().data(), v_cache_desc->dtype()));
// temp_out: [n_kv_head, n_group * seq_len, head_dim]
infiniopTensorDescriptor_t temp_out_desc;
uint64_t temp_out_shape[3] = {n_kv_head, n_group * seq_len, head_dim};
CHECK_STATUS(infiniopCreateTensorDescriptor(&temp_out_desc, 3, temp_out_shape, nullptr, q_desc->dtype()));
infiniopTensorDescriptor_t att_val_desc;
size_t temp_out_shape[3] = {n_kv_head, n_group * seq_len, head_dim};
CHECK_STATUS(infiniopCreateTensorDescriptor(&att_val_desc, 3, temp_out_shape, nullptr, q_desc->dtype()));
// matmul2_desc
infiniopGemmDescriptor_t matmul2_desc;
CHECK_STATUS(infiniopCreateGemmDescriptor(handle, &matmul2_desc, temp_out_desc, qk_desc, full_v_desc));
CHECK_STATUS(infiniopCreateGemmDescriptor(handle, &matmul2_desc, att_val_desc, qk_desc, full_v_desc));
// matmul2 workspace size
uint64_t matmul2_workspace_size;
size_t matmul2_workspace_size;
CHECK_STATUS(infiniopGetGemmWorkspaceSize(matmul2_desc, &matmul2_workspace_size));
// matmul2 tensor size
uint64_t matmul2_tensor_size = temp_out_desc->numel() * infiniSizeOf(temp_out_desc->dtype());
matmul2_workspace_size = utils::align(matmul2_workspace_size, alignment);
// attention value tensor size
size_t att_val_size = utils::align(att_val_desc->numel() * infiniSizeOf(att_val_desc->dtype()), alignment);
// Rearrange temp_out into out
// out: [seq_len, n_q_head, head_dim]
// temp_out: [n_kv_head, n_group * seq_len, head_dim] -> [n_q_head, seq_len, head_dim] -> [seq_len, n_q_head, head_dim]
TRANSFORM_TENSOR_DESC(temp_out_desc, dimSplit(1, {n_group, seq_len}));
TRANSFORM_TENSOR_DESC(temp_out_desc, dimMerge(0, 1));
TRANSFORM_TENSOR_DESC(temp_out_desc, dimPermute({1, 0, 2}));
TRANSFORM_TENSOR_DESC(att_val_desc, dimSplit(1, {n_group, seq_len}));
TRANSFORM_TENSOR_DESC(att_val_desc, dimMerge(0, 1));
TRANSFORM_TENSOR_DESC(att_val_desc, dimPermute({1, 0, 2}));
infiniopRearrangeDescriptor_t rearrange_desc_out;
CHECK_STATUS(infiniopCreateRearrangeDescriptor(handle, &rearrange_desc_out, out_desc, temp_out_desc));
CHECK_STATUS(infiniopCreateRearrangeDescriptor(handle, &rearrange_desc_out, out_desc, att_val_desc));
// workspace size
uint64_t workspace_size = rearranged_q_size + std::max(std::max(matmul1_workspace_size + matmul1_tensor_size, matmul1_tensor_size + softmax_workspace_size), matmul1_tensor_size + matmul2_workspace_size + matmul2_tensor_size);
size_t op_workspace_size = utils::align(std::max(std::max(matmul1_workspace_size, matmul2_workspace_size), softmax_workspace_size), alignment);
size_t temp_tensors_size = attn_score_size + std::max(q_cont_size, att_val_size);
size_t workspace_size = temp_tensors_size + op_workspace_size;
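// Workspace layout implied by the offsets stored below (q_cont and att_val
// can share one region because q_cont is only read by matmul1, which
// finishes before matmul2 writes att_val):
//   [ att_score | q_cont / att_val | operator workspace ]
//   att_score_offset = 0, q_cont_offset = att_val_offset = attn_score_size,
//   op_workspace_offset = temp_tensors_size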
// k_cache_offset
uint64_t k_cache_offset = 0;
size_t k_cache_offset = 0;
if (pos > 0) {
k_cache_offset = pos * k_cache_desc->getByteStrides()[1];
}
// v_cache_offset
uint64_t v_cache_offset = 0;
size_t v_cache_offset = 0;
if (pos > 0) {
v_cache_offset = pos * v_cache_desc->getByteStrides()[1];
}
......@@ -200,12 +205,11 @@ __C __export infiniStatus_t infiniopCreateAttentionDescriptor(infiniopHandle_t h
matmul2_desc,
softmax_desc,
workspace_size,
rearranged_q_size,
matmul1_workspace_size,
matmul1_tensor_size,
matmul2_workspace_size,
matmul2_tensor_size,
softmax_workspace_size,
temp_tensors_size,
op_workspace_size,
attn_score_size,
0,
attn_score_size,
k_cache_offset,
v_cache_offset,
1.f / std::sqrt(float(head_dim)),
......@@ -214,14 +218,14 @@ __C __export infiniStatus_t infiniopCreateAttentionDescriptor(infiniopHandle_t h
return INFINI_STATUS_SUCCESS;
}
__C __export infiniStatus_t infiniopGetAttentionWorkspaceSize(infiniopAttentionDescriptor_t desc, uint64_t *size) {
__C __export infiniStatus_t infiniopGetAttentionWorkspaceSize(infiniopAttentionDescriptor_t desc, size_t *size) {
*size = ((InfiniopAttentionDescriptor *)desc)->workspace_size;
return INFINI_STATUS_SUCCESS;
}
__C __export infiniStatus_t infiniopAttention(infiniopAttentionDescriptor_t desc_,
void *workspace,
uint64_t workspace_size,
void *workspace_,
size_t workspace_size_,
void *out,
void const *q,
void const *k,
......@@ -230,11 +234,14 @@ __C __export infiniStatus_t infiniopAttention(infiniopAttentionDescriptor_t desc
void *v_cache,
void *stream) {
auto desc = (InfiniopAttentionDescriptor *)desc_;
void *workspace_ = workspace;
if (workspace_size < desc->workspace_size) {
if (workspace_size_ < desc->workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
void *workspace = (char *)workspace_ + desc->op_workspace_offset;
size_t workspace_size = desc->op_workspace_size;
void *att_score = (char *)workspace_ + desc->att_score_offset;
void *att_val = (char *)workspace_ + desc->att_val_offset;
void const *q_ = q;
// concat k and v to k_cache and v_cache
CHECK_STATUS(infiniopRearrange(desc->rearrange_desc_k,
(char *)k_cache + desc->k_cache_offset, k, stream));
......@@ -243,28 +250,26 @@ __C __export infiniStatus_t infiniopAttention(infiniopAttentionDescriptor_t desc
(char *)v_cache + desc->v_cache_offset, v, stream));
// rearrange q into contiguous
void const *_q = q;
if (desc->rearrange_desc_q) {
CHECK_STATUS(infiniopRearrange(desc->rearrange_desc_q, (char *)workspace_, q, stream));
_q = workspace_;
workspace_ = (char *)workspace_ + desc->rearranged_q_size;
void *q_cont = (char *)workspace_ + desc->q_cont_offset;
CHECK_STATUS(infiniopRearrange(desc->rearrange_desc_q, q_cont, q, stream));
q_ = q_cont;
}
// matmul1: q * full_k
CHECK_STATUS(infiniopGemm(desc->matmul_desc1,
(char *)workspace_ + desc->matmul1_tensor_size, workspace_size - desc->matmul1_tensor_size,
workspace_, _q, k_cache, desc->qk_alpha, 0.0, stream));
workspace, workspace_size,
att_score, q_, k_cache, desc->qk_alpha, 0.0, stream));
// softmax(qk)
CHECK_STATUS(infiniopCausalSoftmax(desc->softmax_desc,
(char *)workspace_ + desc->matmul1_tensor_size, workspace_size - desc->matmul1_tensor_size,
workspace_, workspace_, stream));
workspace, workspace_size,
att_score, att_score, stream));
// matmul2: softmax(qk) * full_v
CHECK_STATUS(infiniopGemm(desc->matmul_desc2,
(char *)workspace_ + desc->matmul1_tensor_size + desc->matmul2_tensor_size,
workspace_size - desc->matmul1_tensor_size - desc->matmul2_tensor_size,
(char *)workspace_ + desc->matmul1_tensor_size, workspace_, v_cache, 1.0, 0.0, stream));
workspace, workspace_size,
att_val, att_score, v_cache, 1.0, 0.0, stream));
// rearrange out
CHECK_STATUS(infiniopRearrange(desc->rearrange_desc_out, out, (char *)workspace_ + desc->matmul1_tensor_size, stream));
CHECK_STATUS(infiniopRearrange(desc->rearrange_desc_out, out, att_val, stream));
return INFINI_STATUS_SUCCESS;
}
......
......@@ -18,7 +18,7 @@ INFINIOP_CUDA_KERNEL causalSoftmax(
// [Reduce] Find max value in each row and store in shared memory
__shared__ Tdata max_;
Tdata max_0 = op::common_cuda::reduce_op::max<BLOCK_SIZE, Tdata>(x, width);
Tdata max_0 = op::common_cuda::reduce_op::max<BLOCK_SIZE, Tdata>(x, width - height + 1 + blockIdx.x);
if (threadIdx.x == 0) {
max_ = max_0;
}
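// Note: with a causal mask on a [height, width] score matrix (width >= height),
// row blockIdx.x has exactly width - height + 1 + blockIdx.x unmasked columns,
// so the max-reduction only needs to scan that causal prefix of the row.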
......
#include "clip_cpu.h"
namespace op::clip::cpu {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &in_desc = input_desc_vec.at(0);
const auto &min_desc = input_desc_vec.at(1);
const auto &max_desc = input_desc_vec.at(2);
const auto &out_shape = out_desc->shape();
const auto &in_shape = in_desc->shape();
const auto &min_shape = min_desc->shape();
const auto &max_shape = max_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
CHECK_SAME_SHAPE(out_shape, in_shape);
CHECK_SAME_SHAPE(out_shape, min_shape);
CHECK_SAME_SHAPE(out_shape, max_shape);
CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<ClipOp, fp16_t>(_info, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<ClipOp, float>(_info, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<ClipOp, double>(_info, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::clip::cpu
#ifndef __CLIP_CPU_H__
#define __CLIP_CPU_H__
#include "../../../elementwise/cpu/elementwise_cpu.h"
#include "infiniop/ops/clip.h"
ELEMENTWISE_DESCRIPTOR(clip, cpu)
namespace op::clip::cpu {
typedef struct ClipOp {
public:
static constexpr size_t num_inputs = 3;
template <typename T>
T operator()(const T &x, const T &min_val, const T &max_val) const {
return std::max(std::min(x, max_val), min_val);
}
} ClipOp;
} // namespace op::clip::cpu
#endif // __CLIP_CPU_H__
#include "clip_cuda.cuh"
#include "clip_cuda_internal.cuh"
namespace op::clip::cuda {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::cuda::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &in_desc = input_desc_vec.at(0);
const auto &min_desc = input_desc_vec.at(1);
const auto &max_desc = input_desc_vec.at(2);
const auto &out_shape = out_desc->shape();
const auto &in_shape = in_desc->shape();
const auto &min_shape = min_desc->shape();
const auto &max_shape = max_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
CHECK_SAME_SHAPE(out_shape, in_shape);
CHECK_SAME_SHAPE(out_shape, min_shape);
CHECK_SAME_SHAPE(out_shape, max_shape);
CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, ClipOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, ClipOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, ClipOp, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::clip::cuda
#ifndef __CLIP_CUDA_API_H__
#define __CLIP_CUDA_API_H__
#include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
#include "infiniop/ops/clip.h"
ELEMENTWISE_DESCRIPTOR(clip, cuda)
#endif // __CLIP_CUDA_API_H__
#ifndef __CLIP_CUDA_H__
#define __CLIP_CUDA_H__
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include <cuda_fp16.h>
namespace op::clip::cuda {
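// ClipOp clamps x into [min_val, max_val] per element; the half2 branch
// covers the packed-fp16 path (two lanes clamped per __hmin2/__hmax2),
// which the elementwise framework may take when it vectorizes fp16 tensors.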
typedef struct ClipOp {
public:
static constexpr size_t num_inputs = 3;
template <typename T>
__device__ __forceinline__ T operator()(const T &x, const T &min_val, const T &max_val) const {
if constexpr (std::is_same_v<T, half2>) {
return __hmax2(__hmin2(x, max_val), min_val);
} else if constexpr (std::is_same_v<T, half>) {
return __hmax(__hmin(x, max_val), min_val);
} else if constexpr (std::is_same_v<T, float>) {
return fmaxf(fminf(x, max_val), min_val);
} else if constexpr (std::is_same_v<T, double>) {
return fmax(fmin(x, max_val), min_val);
} else {
return std::max(std::min(x, max_val), min_val);
}
}
} ClipOp;
} // namespace op::clip::cuda
#endif // __CLIP_CUDA_H__
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/clip.h"
#ifdef ENABLE_CPU_API
#include "cpu/clip_cpu.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/clip_cuda.cuh"
#endif
__C infiniStatus_t infiniopCreateClipDescriptor(
infiniopHandle_t handle,
infiniopClipDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y,
infiniopTensorDescriptor_t x,
infiniopTensorDescriptor_t min_val,
infiniopTensorDescriptor_t max_val) {
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::clip::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::clip::NAMESPACE::Descriptor **>(desc_ptr), \
y, \
{x, min_val, max_val})
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CREATE
}
__C infiniStatus_t infiniopGetClipWorkspaceSize(infiniopClipDescriptor_t desc, size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::clip::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_CUDA_API
GET(INFINI_DEVICE_NVIDIA, cuda)
#endif
}
#undef GET
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniStatus_t infiniopClip(
infiniopClipDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
const void *x,
const void *min_val,
const void *max_val,
void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::clip::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, y, {x, min_val, max_val}, stream)
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CALCULATE
}
__C infiniStatus_t
infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc) {
#define DELETE(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<const op::clip::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
DELETE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef DELETE
}
......@@ -3,6 +3,26 @@
#include <aclnnop/aclnn_matmul.h>
#include <aclnnop/level2/aclnn_gemm.h>
#include <cstring>
#include <unordered_map>
// Custom hash function for alpha beta pair<float, float>
struct FloatPairHash {
size_t operator()(const std::pair<float, float> &p) const {
uint64_t combined;
std::memcpy(reinterpret_cast<char *>(&combined), &p.first, sizeof(float));
std::memcpy(reinterpret_cast<char *>(&combined) + sizeof(float), &p.second, sizeof(float));
return std::hash<uint64_t>()(combined);
}
};
struct FloatPairEqual {
bool operator()(const std::pair<float, float> &a, const std::pair<float, float> &b) const {
return a.first == b.first && a.second == b.second;
}
};
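// The hash packs both floats bit-for-bit into one uint64_t via memcpy
// (well-defined type punning) and hashes that; FloatPairEqual compares by
// value, so (alpha, beta) pairs can key the executor cache declared below.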
namespace op::gemm::ascend {
struct Descriptor::Opaque {
......@@ -11,11 +31,17 @@ struct Descriptor::Opaque {
// see doc:
// https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha002/apiref/appdevgapi/context/aclnnBatchMatMul.md
int8_t mt;
// alpha&beta hashmap
std::unordered_map<std::pair<float, float>, aclOpExecutor *, FloatPairHash, FloatPairEqual> lookup;
~Opaque() {
delete c;
delete a;
delete b;
for (auto &item : lookup) {
aclDestroyAclOpExecutor(item.second);
}
lookup.clear();
}
};
......@@ -54,15 +80,16 @@ infiniStatus_t Descriptor::create(
ta = a->tensor,
tb = b->tensor;
std::unordered_map<std::pair<float, float>, aclOpExecutor *, FloatPairHash, FloatPairEqual> lookup;
aclOpExecutor *executor = nullptr;
size_t workspace_size = 0;
// aclnnGemm support C = alpha * A @ B + beta * C
// see
// https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha003/apiref/aolapi/context/aclnnGemm.md
// use alpha = 0.5, beta = 0.5 temporarily
int8_t mt = 1;
CHECK_ACL(aclnnGemmGetWorkspaceSize(ta, tb, tc, .5, .5, 0, 0, tc, mt, &workspace_size, &executor));
CHECK_ACL(aclnnGemmGetWorkspaceSize(ta, tb, tc, 1., 0., 0, 0, tc, mt, &workspace_size, &executor));
CHECK_ACL(aclSetAclOpExecutorRepeatable(executor));
lookup[std::make_pair(1.0f, 0.0f)] = executor;
CHECK_ACL(aclnnGemmGetWorkspaceSize(ta, tb, tc, 1., 1., 0, 0, tc, mt, &workspace_size, &executor));
CHECK_ACL(aclSetAclOpExecutorRepeatable(executor));
lookup[std::make_pair(1.0f, 1.0f)] = executor;
*desc_ptr = new Descriptor(
dtype, info, workspace_size,
......@@ -71,11 +98,9 @@ infiniStatus_t Descriptor::create(
a,
b,
mt,
},
std::move(lookup)},
handle->device, handle->device_id);
aclDestroyAclOpExecutor(executor);
return INFINI_STATUS_SUCCESS;
}
......@@ -93,16 +118,22 @@ infiniStatus_t Descriptor::calculate(
ta = _opaque->a->tensor,
tb = _opaque->b->tensor;
size_t workspace_size = 0;
aclOpExecutor *executor = nullptr;
size_t workspace_size = _workspace_size;
aclOpExecutor *executor;
auto key = std::make_pair(alpha, beta);
if (_opaque->lookup.find(key) != _opaque->lookup.end()) {
executor = _opaque->lookup[key];
} else {
CHECK_ACL(aclnnGemmGetWorkspaceSize(
ta, tb, tc, alpha, beta, 0, 0, tc, _opaque->mt,
&workspace_size, &executor));
CHECK_ACL(aclSetAclOpExecutorRepeatable(executor));
_opaque->lookup[key] = executor;
}
if (workspaceSize_ < workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
auto unit = infiniSizeOf(_dtype);
for (size_t i = 0; i < _info.batch; ++i) {
......
......@@ -26,10 +26,12 @@ infiniStatus_t Descriptor::create(
auto info = result.take();
size_t workspace_size;
#define CASE_P(CASE, Tidx, Tval) \
case CASE: \
workspace_size = calculateWorkspace<Tidx, Tval>(info.n); \
break
#define CASE_P(CASE, Tidx, Tval) \
case CASE: { \
auto workspace_result = calculateWorkspace<Tidx, Tval>(info.n); \
CHECK_RESULT(workspace_result); \
workspace_size = workspace_result.take(); \
} break
#define CASE_I(CASE, Tidx) \
case CASE: \
......
#include "rope_ascend.h"
#include "../../../devices/ascend/common_ascend.h"
namespace op::rope::ascend {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t pos_desc,
infiniopTensorDescriptor_t sin_desc,
infiniopTensorDescriptor_t cos_desc) {
auto handle_ascend = reinterpret_cast<device::ascend::Handle *>(handle);
auto result = RoPEInfo::createRoPEInfo(y_desc, x_desc, pos_desc, sin_desc, cos_desc);
CHECK_RESULT(result);
size_t workspace_size = 0;
*desc_ptr = new Descriptor(std::move(result.take()), workspace_size, nullptr, handle_ascend->device, handle_ascend->device_id);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *y,
const void *x,
const void *pos_ids,
const void *sin_table,
const void *cos_table,
void *stream) const {
CHECK_DTYPE(_info.data_type, INFINI_DTYPE_F32, INFINI_DTYPE_F16);
auto data_type = _info.data_type;
auto pos_type = _info.pos_type;
auto seq_len = _info.seqlen;
auto nhead = _info.nhead;
auto dhead = _info.dhead;
auto y_stride_seqlen = _info.y_stride_seqlen;
auto y_stride_nhead = _info.y_stride_nhead;
auto x_stride_seqlen = _info.x_stride_seqlen;
auto x_stride_nhead = _info.x_stride_nhead;
return rope_kernel_launch(y, (void *)x, (void *)pos_ids, (void *)sin_table, (void *)cos_table, seq_len, nhead, dhead, data_type, pos_type, y_stride_seqlen, y_stride_nhead, x_stride_seqlen, x_stride_nhead, stream);
}
} // namespace op::rope::ascend
#ifndef __ACLNN_ROPE_H__
#define __ACLNN_ROPE_H__
#include "../rope.h"
extern "C" infiniStatus_t rope_kernel_launch(
void *y,
void *x,
void *pos,
void *sin,
void *cos,
size_t seq_len,
size_t nhead,
size_t dhead,
infiniDtype_t data_type,
infiniDtype_t pos_type,
ptrdiff_t y_stride_seqlen,
ptrdiff_t y_stride_nhead,
ptrdiff_t x_stride_seqlen,
ptrdiff_t x_stride_nhead,
void *stream);
DESCRIPTOR(ascend)
#endif // __ACLNN_ROPE_H__
#include "../../../devices/ascend/ascend_kernel_common.h"
using namespace AscendC;
template <typename T, typename U>
class RoPEKernel {
public:
__aicore__ inline RoPEKernel() {}
// Init op
// pos position vector
// x input tensor
// y output tensor
// tensor shape [nt, nh, dh]
// make block_num = nh, tile_len = dh
__aicore__ inline void init(GM_ADDR y,
GM_ADDR x,
GM_ADDR pos,
GM_ADDR sin,
GM_ADDR cos,
size_t dh,
ptrdiff_t st_ynt,
ptrdiff_t st_ynh,
ptrdiff_t st_xnt,
ptrdiff_t st_xnh);
__aicore__ inline void process(size_t seq_len);
private:
// Copy a tile into UB
__aicore__ inline void copyIn(size_t i);
__aicore__ inline void compute(size_t i);
__aicore__ inline void copyOut(size_t i);
private:
TPipe pipe;
TQue<QuePosition::VECIN, BUFFER_NUM> _in_que;
TQue<QuePosition::VECIN, BUFFER_NUM> _sin_que;
TQue<QuePosition::VECIN, BUFFER_NUM> _cos_que;
TQue<QuePosition::VECOUT, BUFFER_NUM> _out_que;
TBuf<TPosition::VECCALC> _tmp_odd_buf;
TBuf<TPosition::VECCALC> _tmp_even_buf;
TBuf<TPosition::VECCALC> _tmp_odd_buf1;
TBuf<TPosition::VECCALC> _tmp_odd_buf2;
TBuf<TPosition::VECCALC> _tmp_even_buf1;
TBuf<TPosition::VECCALC> _tmp_even_buf2;
GlobalTensor<T> _x_gm, _y_gm;
GlobalTensor<U> _p_gm;
GlobalTensor<T> _sin_gm;
GlobalTensor<T> _cos_gm;
size_t _block_idx;
size_t _tile_len;
size_t _copy_len;
size_t _half_copy_len;
// stridey[_st_ynt, _st_ynh, 1]
ptrdiff_t _st_ynt;
ptrdiff_t _st_ynh;
// stridex[_st_xnt, _st_xnh, 1]
ptrdiff_t _st_xnt;
ptrdiff_t _st_xnh;
};
template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::init(GM_ADDR y,
GM_ADDR x,
GM_ADDR pos,
GM_ADDR sin,
GM_ADDR cos,
size_t dh,
ptrdiff_t st_ynt,
ptrdiff_t st_ynh,
ptrdiff_t st_xnt,
ptrdiff_t st_xnh) {
this->_tile_len = dh;
this->_st_ynt = st_ynt;
this->_st_ynh = st_ynh;
this->_st_xnt = st_xnt;
this->_st_xnh = st_xnh;
_copy_len = alignTileLen<T>(dh, BYTE_ALIGN);
// sin/cos tables store dh / 2 entries per position, so their aligned copy
// length is computed from dh / 2
_half_copy_len = alignTileLen<T>(dh / 2, BYTE_ALIGN);
_block_idx = GetBlockIdx();
// Init global buffer
_x_gm.SetGlobalBuffer((__gm__ T *)x);
_p_gm.SetGlobalBuffer((__gm__ U *)pos);
_sin_gm.SetGlobalBuffer((__gm__ T *)sin);
_cos_gm.SetGlobalBuffer((__gm__ T *)cos);
_y_gm.SetGlobalBuffer((__gm__ T *)y);
// Init Queue buffer
pipe.InitBuffer(_in_que, BUFFER_NUM, _copy_len * sizeof(T));
pipe.InitBuffer(_out_que, BUFFER_NUM, _tile_len * sizeof(T));
pipe.InitBuffer(_sin_que, BUFFER_NUM, _half_copy_len * sizeof(T));
pipe.InitBuffer(_cos_que, BUFFER_NUM, _half_copy_len * sizeof(T));
pipe.InitBuffer(_tmp_odd_buf, _tile_len / 2 * sizeof(T));
pipe.InitBuffer(_tmp_even_buf, _tile_len / 2 * sizeof(T));
pipe.InitBuffer(_tmp_odd_buf1, _tile_len / 2 * sizeof(T));
pipe.InitBuffer(_tmp_odd_buf2, _tile_len / 2 * sizeof(T));
pipe.InitBuffer(_tmp_even_buf1, _tile_len / 2 * sizeof(T));
pipe.InitBuffer(_tmp_even_buf2, _tile_len / 2 * sizeof(T));
}
template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::copyIn(size_t i) {
LocalTensor<T> input_ub = _in_que.AllocTensor<T>();
LocalTensor<T> sin_ub = _sin_que.AllocTensor<T>();
LocalTensor<T> cos_ub = _cos_que.AllocTensor<T>();
// Get idx of current tile in total input
auto idx = i * _st_xnt + _block_idx * _st_xnh;
// Copy tile current tile into UB
DataCopy(input_ub, _x_gm[idx], _copy_len);
// Copy sin cos tile
auto pos_idx = _p_gm(i);
DataCopy(sin_ub, _sin_gm[pos_idx * _tile_len / 2], _half_copy_len);
DataCopy(cos_ub, _cos_gm[pos_idx * _tile_len / 2], _half_copy_len);
// Push in operands
_in_que.EnQue(input_ub);
_sin_que.EnQue(sin_ub);
_cos_que.EnQue(cos_ub);
}
template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::compute(size_t i) {
LocalTensor<T> input_ub = _in_que.DeQue<T>();
LocalTensor<T> sin_ub = _sin_que.DeQue<T>();
LocalTensor<T> cos_ub = _cos_que.DeQue<T>();
LocalTensor<T> output_ub = _out_que.AllocTensor<T>();
LocalTensor<T> tmp_odd = _tmp_odd_buf.Get<T>();
LocalTensor<T> tmp_even = _tmp_even_buf.Get<T>();
LocalTensor<T> tmp_odd1 = _tmp_odd_buf1.Get<T>();
LocalTensor<T> tmp_odd2 = _tmp_odd_buf2.Get<T>();
LocalTensor<T> tmp_even1 = _tmp_even_buf1.Get<T>();
LocalTensor<T> tmp_even2 = _tmp_even_buf2.Get<T>();
// separate odd and even bit elements
uint64_t rsvdCnt = 0;
GatherMaskParams gMaskParams = {
1,
static_cast<uint16_t>((_tile_len * sizeof(T) + 255) / 256), // repeat times = ceil(bytes / 256); must not exceed 255
8,
8,
};
GatherMask<T>(tmp_odd, input_ub, 1, false, 0, gMaskParams, rsvdCnt);
GatherMask<T>(tmp_even, input_ub, 2, false, 0, gMaskParams, rsvdCnt);
PipeBarrier<PIPE_V>();
// compute odd bit elements
// y_odd = x_odd * cos - x_even * sin
Mul<T>(tmp_odd1, tmp_odd, cos_ub, _tile_len / 2);
Mul<T>(tmp_odd2, tmp_even, sin_ub, _tile_len / 2);
PipeBarrier<PIPE_V>();
Sub<T>(tmp_odd1, tmp_odd1, tmp_odd2, _tile_len / 2);
// compute even bit elements
// y_even = x_odd * sin + x_even * cos
Mul<T>(tmp_even1, tmp_odd, sin_ub, _tile_len / 2);
Mul<T>(tmp_even2, tmp_even, cos_ub, _tile_len / 2);
PipeBarrier<PIPE_V>();
Add<T>(tmp_even1, tmp_even1, tmp_even2, _tile_len / 2);
// combine odd and even bit elements
for (uint32_t j = 0; j < _tile_len / 2; j += 1) {
output_ub(j * 2) = tmp_odd1(j);
output_ub(j * 2 + 1) = tmp_even1(j);
}
_out_que.EnQue<T>(output_ub);
_in_que.FreeTensor(input_ub);
_sin_que.FreeTensor(sin_ub);
_cos_que.FreeTensor(cos_ub);
}
template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::copyOut(size_t i) {
LocalTensor<T> output_ub = _out_que.DeQue<T>();
auto idy = i * _st_ynt + _block_idx * _st_ynh;
DataCopyExtParams params = {1, static_cast<uint32_t>(_tile_len * sizeof(T)), 0, 0, 0};
DataCopyPad(_y_gm[idy], output_ub, params);
_out_que.FreeTensor(output_ub);
}
template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::process(size_t seq_len) {
for (size_t i = 0; i < seq_len; ++i) {
copyIn(i);
compute(i);
copyOut(i);
}
}
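// Each sequence position is one tile: copyIn stages the input row plus the
// per-position sin/cos rows, compute applies the rotation on the odd/even
// lanes, and copyOut writes the re-interleaved result back with DataCopyPad.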
#define ROPE_KERNEL_INIT_ARGS y, x, pos, sin, cos, dhead, \
y_stride_seqlen, y_stride_nhead, \
x_stride_seqlen, x_stride_nhead
#define CASE_POSTYPE(POS_TYPE_ENUM, TYPE, POS_T) \
case POS_TYPE_ENUM: { \
RoPEKernel<TYPE, POS_T> op; \
op.init(ROPE_KERNEL_INIT_ARGS); \
op.process(seq_len); \
break; \
}
#define ROPE_KERNEL(TYPE, POSTYPE) \
switch (POSTYPE) { \
CASE_POSTYPE(INFINI_DTYPE_I8, TYPE, int8_t) \
CASE_POSTYPE(INFINI_DTYPE_I16, TYPE, int16_t) \
CASE_POSTYPE(INFINI_DTYPE_I32, TYPE, int32_t) \
CASE_POSTYPE(INFINI_DTYPE_I64, TYPE, int64_t) \
CASE_POSTYPE(INFINI_DTYPE_U8, TYPE, uint8_t) \
CASE_POSTYPE(INFINI_DTYPE_U16, TYPE, uint16_t) \
CASE_POSTYPE(INFINI_DTYPE_U32, TYPE, uint32_t) \
CASE_POSTYPE(INFINI_DTYPE_U64, TYPE, uint64_t) \
default: \
break; \
}
#define DEFINE_ROPE_KERNEL(KERNEL_NAME, TYPE) \
__global__ __aicore__ void KERNEL_NAME(GM_ADDR y, \
GM_ADDR x, \
GM_ADDR pos, \
GM_ADDR sin, \
GM_ADDR cos, \
size_t seq_len, \
size_t dhead, \
ptrdiff_t y_stride_seqlen, \
ptrdiff_t y_stride_nhead, \
ptrdiff_t x_stride_seqlen, \
ptrdiff_t x_stride_nhead, \
int32_t pos_type) { \
ROPE_KERNEL(TYPE, pos_type) \
}
DEFINE_ROPE_KERNEL(rope_kernel_float, float)
DEFINE_ROPE_KERNEL(rope_kernel_half, half)
#undef DEFINE_ROPE_KERNEL
#undef ROPE_KERNEL
#undef CASE_POSTYPE
#undef ROPE_KERNEL_INIT_ARGS
extern "C" infiniStatus_t rope_kernel_launch(
void *y,
void *x,
void *pos,
void *sin,
void *cos,
size_t seq_len,
size_t nhead,
size_t dhead,
infiniDtype_t dtype,
infiniDtype_t pos_type,
ptrdiff_t y_stride_seqlen,
ptrdiff_t y_stride_nhead,
ptrdiff_t x_stride_seqlen,
ptrdiff_t x_stride_nhead,
void *stream) {
#define LAUNCH_ROPE_KERNEL(DTYPE_ENUM, KERNEL_NAME) \
case DTYPE_ENUM: \
KERNEL_NAME<<<nhead, nullptr, stream>>>(y, x, pos, sin, cos, \
seq_len, \
dhead, \
y_stride_seqlen, \
y_stride_nhead, \
x_stride_seqlen, \
x_stride_nhead, \
pos_type); \
return INFINI_STATUS_SUCCESS;
switch (dtype) {
LAUNCH_ROPE_KERNEL(INFINI_DTYPE_F16, rope_kernel_half)
LAUNCH_ROPE_KERNEL(INFINI_DTYPE_F32, rope_kernel_float)
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
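// Launch note: the kernel runs with nhead blocks, one attention head per
// block, matching the "block_num = nh, tile_len = dh" tiling in RoPEKernel.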