issue/48 rope cpu

07279a25 · PanZezhong · b985bc5e · 07279a25 · 07279a25 · 07279a25
Commit 07279a25 authored Apr 16, 2025 by PanZezhong
7 changed files
--- a/src/infiniop/ops/rope/cpu/rope_cpu.cc
+++ b/src/infiniop/ops/rope/cpu/rope_cpu.cc
+#include "rope_cpu.h"
+#include "../../../devices/cpu/common_cpu.h"
+
+namespace op::rope::cpu {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc,
+    infiniopTensorDescriptor_t pos_desc,
+    infiniopTensorDescriptor_t sin_desc,
+    infiniopTensorDescriptor_t cos_desc) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+
+    auto info = RoPEInfo::createRoPEInfo(y_desc, x_desc, pos_desc, sin_desc, cos_desc);
+    CHECK_RESULT(info);
+
+    // Create descriptor
+    *desc_ptr = new Descriptor(
+        info.take(),
+        0,
+        nullptr,
+        handle->device,
+        handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+template <typename Tdata, typename Tindex>
+infiniStatus_t calculateRoPE(const RoPEInfo &info,
+                             Tdata *y,
+                             const Tdata *x,
+                             const Tindex *pos_ids,
+                             const Tdata *sin_table,
+                             const Tdata *cos_table) {
+#pragma omp parallel for
+    for (ptrdiff_t h = 0; h < ptrdiff_t(info.nhead); h++) {
+        for (size_t tok = 0; tok < info.seqlen; tok++) {
+            size_t x_offset = tok * info.x_stride_seqlen + h * info.x_stride_nhead;
+            size_t y_offset = tok * info.y_stride_seqlen + h * info.y_stride_nhead;
+            size_t pos_id = size_t(pos_ids[tok]);
+            size_t table_offset = pos_id * info.table_dim;
+
+            for (size_t i = 0; i < info.table_dim; i++) {
+                size_t pos0 = 2 * i;
+                size_t pos1 = 2 * i + 1;
+
+                if constexpr (std::is_same<Tdata, fp16_t>::value) {
+                    float x0 = utils::cast<float>(x[x_offset + pos0]),
+                          x1 = utils::cast<float>(x[x_offset + pos1]),
+                          sin__ = utils::cast<float>(sin_table[table_offset + i]),
+                          cos__ = utils::cast<float>(cos_table[table_offset + i]);
+
+                    y[y_offset + pos0] = utils::cast<fp16_t>(x0 * cos__ - x1 * sin__);
+                    y[y_offset + pos1] = utils::cast<fp16_t>(x0 * sin__ + x1 * cos__);
+                } else {
+                    Tdata x0 = x[x_offset + pos0],
+                          x1 = x[x_offset + pos1],
+                          sin__ = sin_table[table_offset + i],
+                          cos__ = cos_table[table_offset + i];
+
+                    y[y_offset + pos0] = x0 * cos__ - x1 * sin__;
+                    y[y_offset + pos1] = x0 * sin__ + x1 * cos__;
+                }
+            }
+        }
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+#define CALCULATE_ROPE(TDATA, TINDEX) \
+    calculateRoPE(_info, (TDATA *)y, (const TDATA *)x, (const TINDEX *)pos_ids, (const TDATA *)sin_table, (const TDATA *)cos_table)
+
+#define ROPE_TYPE(TDATA)                        \
+    switch (_info.pos_type) {                   \
+    case INFINI_DTYPE_U8:                       \
+        return CALCULATE_ROPE(TDATA, uint8_t);  \
+    case INFINI_DTYPE_U16:                      \
+        return CALCULATE_ROPE(TDATA, uint16_t); \
+    case INFINI_DTYPE_U32:                      \
+        return CALCULATE_ROPE(TDATA, uint32_t); \
+    case INFINI_DTYPE_U64:                      \
+        return CALCULATE_ROPE(TDATA, uint64_t); \
+    default:                                    \
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;  \
+    }
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    const void *pos_ids,
+    const void *sin_table,
+    const void *cos_table,
+    void *stream) const {
+
+    switch (_info.data_type) {
+    case INFINI_DTYPE_F16:
+        ROPE_TYPE(fp16_t);
+    case INFINI_DTYPE_F32:
+        ROPE_TYPE(float);
+    case INFINI_DTYPE_F64:
+        ROPE_TYPE(double);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+
+#undef ROPE_TYPE
+#undef CALCULATE_ROPE
+
+} // namespace op::rope::cpu
\ No newline at end of file
--- a/src/infiniop/ops/rope/cpu/rope_cpu.h
+++ b/src/infiniop/ops/rope/cpu/rope_cpu.h
+#ifndef __INFINIOP_ROPE_CPU_H__
+#define __INFINIOP_ROPE_CPU_H__
+
+#include "../rope.h"
+
+DESCRIPTOR(cpu)
+
+#endif // __INFINIOP_ROPE_CPU_H__
--- a/src/infiniop/ops/rope/operator.cc
+++ b/src/infiniop/ops/rope/operator.cc
@@ -2,6 +2,10 @@
 #include "../../handle.h"
 #include "infiniop/ops/rope.h"

+#ifdef ENABLE_CPU_API
+#include "cpu/rope_cpu.h"
+#endif
+
 __C infiniStatus_t infiniopCreateRoPEDescriptor(
    infiniopHandle_t handle,
    infiniopRoPEDescriptor_t *desc_ptr,
@@ -10,12 +14,21 @@ __C infiniStatus_t infiniopCreateRoPEDescriptor(
    infiniopTensorDescriptor_t pos_ids,
    infiniopTensorDescriptor_t sin_table,
    infiniopTensorDescriptor_t cos_table) {
+
+#define CREATE(CASE, NAMESPACE)                                             \
+    case CASE:                                                              \
+        return op::rope::NAMESPACE::Descriptor::create(                     \
+            handle,                                                         \
+            reinterpret_cast<op::rope::NAMESPACE::Descriptor **>(desc_ptr), \
+            y,                                                              \
+            x,                                                              \
+            pos_ids,                                                        \
+            sin_table,                                                      \
+            cos_table)
+
    switch (handle->device) {
-#ifdef ENABLE_CPU
-    case DevCpu:
-        return cpuCreateRoPEDescriptor((CpuHandle_t)handle,
-                                       (RoPECpuDescriptor_t *)desc_ptr, t,
-                                       pos_ids, sin_table, cos_table);
+#ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NV_GPU
    case DevNvGpu: {
@@ -54,15 +67,22 @@ __C infiniStatus_t infiniopCreateRoPEDescriptor(
    }
 #endif
    }
+
+#undef CREATE
+
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
 }

 __C infiniStatus_t infiniopGetRoPEWorkspaceSize(infiniopRoPEDescriptor_t desc,
                                                size_t *size) {
+#define GET(CASE, NAMESPACE)                                                                     \
+    case CASE:                                                                                   \
+        *size = reinterpret_cast<const op::rope::NAMESPACE::Descriptor *>(desc)->workspace_size; \
+        return INFINI_STATUS_SUCCESS
+
    switch (desc->device_type) {
-#ifdef ENABLE_CPU
-    case DevCpu:
-        return cpuGetRoPEWorkspaceSize((RoPECpuDescriptor_t)desc, size);
+#ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NV_GPU
    case DevNvGpu: {
@@ -91,6 +111,9 @@ __C infiniStatus_t infiniopGetRoPEWorkspaceSize(infiniopRoPEDescriptor_t desc,
    }
 #endif
    }
+
+#undef GET
+
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
 }

@@ -100,15 +123,19 @@ __C infiniStatus_t infiniopRoPE(
    size_t workspace_size,
    void *y,
    const void *x,
-    void const *pos_ids,
-    void const *sin_table,
-    void const *cos_table,
+    const void *pos_ids,
+    const void *sin_table,
+    const void *cos_table,
    void *stream) {
+
+#define CALCULATE(CASE, NAMESPACE)                                             \
+    case CASE:                                                                 \
+        return reinterpret_cast<const op::rope::NAMESPACE::Descriptor *>(desc) \
+            ->calculate(workspace, workspace_size, y, x, pos_ids, sin_table, cos_table, stream)
+
    switch (desc->device_type) {
-#ifdef ENABLE_CPU
-    case DevCpu:
-        return cpuRoPE((RoPECpuDescriptor_t)desc, workspace, workspace_size, t,
-                       pos_ids, sin_table, cos_table, stream);
+#ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NV_GPU
    case DevNvGpu: {
@@ -143,15 +170,23 @@ __C infiniStatus_t infiniopRoPE(
    }
 #endif
    }
+
+#undef CALCULATE
+
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
 }

 __C infiniStatus_t
 infiniopDestroyRoPEDescriptor(infiniopRoPEDescriptor_t desc) {
+
+#define DELETE(CASE, NAMESPACE)                                                 \
+    case CASE:                                                                  \
+        delete reinterpret_cast<const op::rope::NAMESPACE::Descriptor *>(desc); \
+        return INFINI_STATUS_SUCCESS;
+
    switch (desc->device_type) {
-#ifdef ENABLE_CPU
-    case DevCpu:
-        return cpuDestroyRoPEDescriptor((RoPECpuDescriptor_t)desc);
+#ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NV_GPU
    case DevNvGpu: {
@@ -180,5 +215,8 @@ infiniopDestroyRoPEDescriptor(infiniopRoPEDescriptor_t desc) {
    }
 #endif
    }
+
+#undef DELETE
+
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
 }
--- a/src/infiniop/ops/rope/rope.h
+++ b/src/infiniop/ops/rope/rope.h
+#ifndef __ROPE_H__
+#define __ROPE_H__
+
+#include "../../../utils.h"
+#include "../../operator.h"
+#include "../../tensor.h"
+
+#define DESCRIPTOR(NAMESPACE)                             \
+                                                          \
+    namespace op::rope::NAMESPACE {                       \
+    class Descriptor final : public InfiniopDescriptor {  \
+        struct Opaque;                                    \
+        Opaque *_opaque;                                  \
+        RoPEInfo _info;                                   \
+                                                          \
+        Descriptor(                                       \
+            RoPEInfo info,                                \
+            size_t workspace_size_,                       \
+            Opaque *opaque,                               \
+            infiniDevice_t device_type,                   \
+            int device_id)                                \
+            : InfiniopDescriptor{device_type, device_id}, \
+              _opaque(opaque),                            \
+              _info(info),                                \
+              workspace_size(workspace_size_) {}          \
+                                                          \
+    public:                                               \
+        size_t workspace_size;                            \
+                                                          \
+        ~Descriptor();                                    \
+                                                          \
+        static infiniStatus_t create(                     \
+            infiniopHandle_t handle,                      \
+            Descriptor **desc_ptr,                        \
+            infiniopTensorDescriptor_t y_desc,            \
+            infiniopTensorDescriptor_t x_desc,            \
+            infiniopTensorDescriptor_t pos_desc,          \
+            infiniopTensorDescriptor_t sin_desc,          \
+            infiniopTensorDescriptor_t cos_desc);         \
+                                                          \
+        infiniStatus_t calculate(                         \
+            void *workspace,                              \
+            size_t workspace_size,                        \
+            void *y,                                      \
+            const void *x,                                \
+            const void *pos_ids,                          \
+            const void *sin_table,                        \
+            const void *cos_table,                        \
+            void *stream) const;                          \
+    };                                                    \
+    }
+
+class RoPEInfo {
+private:
+    RoPEInfo() = default;
+
+public:
+    infiniDtype_t data_type, pos_type;
+    size_t seqlen, nhead, dhead, table_len, table_dim;
+    ptrdiff_t
+        y_stride_seqlen,
+        y_stride_nhead,
+        x_stride_seqlen,
+        x_stride_nhead;
+
+    static utils::Result<RoPEInfo> createRoPEInfo(
+        infiniopTensorDescriptor_t y_desc,
+        infiniopTensorDescriptor_t x_desc,
+        infiniopTensorDescriptor_t pos_desc,
+        infiniopTensorDescriptor_t sin_desc,
+        infiniopTensorDescriptor_t cos_desc) {
+        CHECK_OR_RETURN(
+            y_desc != nullptr && pos_desc != nullptr && sin_desc != nullptr && cos_desc != nullptr,
+            INFINI_STATUS_NULL_POINTER);
+
+        const infiniDtype_t data_type = y_desc->dtype();
+        const infiniDtype_t pos_type = pos_desc->dtype();
+        CHECK_OR_RETURN(data_type == x_desc->dtype() && data_type == sin_desc->dtype() && data_type == cos_desc->dtype(),
+                        INFINI_STATUS_BAD_TENSOR_DTYPE);
+        CHECK_DTYPE(data_type, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
+        CHECK_DTYPE(pos_type, INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64);
+
+        CHECK_OR_RETURN(y_desc->ndim() == 3
+                            && x_desc->ndim() == 3
+                            && pos_desc->ndim() == 1
+                            && sin_desc->ndim() == 2
+                            && cos_desc->ndim() == 2,
+                        INFINI_STATUS_BAD_TENSOR_SHAPE);
+
+        const auto seqlen = y_desc->dim(0),
+                   nhead = y_desc->dim(1),
+                   dhead = y_desc->dim(2),
+                   table_len = sin_desc->dim(0),
+                   table_dim = sin_desc->dim(1);
+
+        CHECK_OR_RETURN(seqlen == x_desc->dim(0)
+                            && seqlen == pos_desc->dim(0)
+                            && nhead == x_desc->dim(1) && dhead == x_desc->dim(2)
+                            && table_len == cos_desc->dim(0) && table_dim == cos_desc->dim(1),
+                        INFINI_STATUS_BAD_TENSOR_SHAPE);
+
+        CHECK_OR_RETURN(dhead == table_dim * 2, INFINI_STATUS_BAD_TENSOR_SHAPE);
+        // Last dimension of x and y must be contiguous
+        CHECK_OR_RETURN(y_desc->stride(2) == 1 && x_desc->stride(2) == 1, INFINI_STATUS_BAD_TENSOR_STRIDES);
+        // sin table and cos table must be totally contiguous
+        CHECK_OR_RETURN(sin_desc->stride(1) == 1
+                            && cos_desc->stride(1) == 1
+                            && sin_desc->stride(0) == table_dim
+                            && cos_desc->stride(0) == table_dim,
+                        INFINI_STATUS_BAD_TENSOR_STRIDES);
+
+        return utils::Result<RoPEInfo>(RoPEInfo{
+            data_type,
+            pos_type,
+            seqlen,
+            nhead,
+            dhead,
+            table_len,
+            table_dim,
+            y_desc->stride(0),
+            y_desc->stride(1),
+            x_desc->stride(0),
+            x_desc->stride(1),
+        });
+    }
+};
+
+#endif
--- a/src/utils/check.h
+++ b/src/utils/check.h
@@ -3,6 +3,16 @@
 #include <iostream>
 #include <tuple>

+#define CHECK_OR_RETURN(CONDITION, ERROR)                                    \
+    do {                                                                     \
+        if (!(CONDITION)) {                                                  \
+            std::cerr << "Check Failed: `(" << #CONDITION << ")` is False"   \
+                      << " from " << __func__                                \
+                      << " at " << __FILE__ << ":" << __LINE__ << std::endl; \
+            return ERROR;                                                    \
+        }                                                                    \
+    } while (0)
+
 #define CHECK_API_OR(API, EXPECT, ACTION)                                       \
    do {                                                                        \
        auto api_result_ = (API);                                               \

--- a/test/infiniop/libinfiniop/utils.py
+++ b/test/infiniop/libinfiniop/utils.py
@@ -10,7 +10,7 @@ def check_error(status):
        raise Exception("Error code " + str(status))


-def to_tensor(tensor, lib):
+def to_tensor(tensor, lib, force_unsigned=False):
    """
    Convert a PyTorch tensor to a library Tensor(descriptor, data).
    """
@@ -37,6 +37,16 @@ def to_tensor(tensor, lib):
        InfiniDtype.U64 if tensor.dtype == torch.uint64 else
        None
    )
+    
+    if force_unsigned:
+        dt = (
+            InfiniDtype.U8 if dt == InfiniDtype.I8 else
+            InfiniDtype.U16 if dt == InfiniDtype.I16 else
+            InfiniDtype.U32 if dt == InfiniDtype.I32 else
+            InfiniDtype.U64 if dt == InfiniDtype.I64 else
+            dt
+        )
+
    # fmt: on
    assert dt is not None
    # Create TensorDecriptor

--- a/test/infiniop/rope.py
+++ b/test/infiniop/rope.py
 import torch
 import ctypes
-from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float
+from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p
 from libinfiniop import (
-    InfiniDtype,
    infiniopHandle_t,
    infiniopTensorDescriptor_t,
    open_lib,
@@ -18,30 +17,49 @@ from libinfiniop import (
    profile_operation,
    synchronize_device,
 )
+from enum import Enum, auto

 # ==============================================================================
 #  Configuration (Internal Use Only)
 # ==============================================================================
 # These are not meant to be imported from other modules
-_TEST_CASES = [
-    # (t_shape, t_strides)
-    ((1, 32, 128), None),
-    ((1, 32, 64), None),
+_TEST_CASES_ = [
+    # (shape, x_strides, y_strides)
+    ((1, 32, 128), None, None),
+    ((1, 32, 64), None, None),
    # 昇腾暂不满足这个用例，最后一维度 <=32 会有问题，可能与其核心
    # 接口 GatherMask 的内部实现相关，目前 48 64 128 都可以支持
-    ((4, 1, 32), None),
-    ((1, 32, 128), None),
-    ((3, 32, 128), (8000, 200, 1)),
+    ((4, 1, 32), (64, 64, 1), None),
+    ((11, 33, 128), None, (8000, 200, 1)),
+    ((3, 32, 128), (8000, 200, 1), (7000, 128, 1)),
 ]

 # Data types used for testing
-_TENSOR_DTYPES = [torch.float16]
+_TENSOR_DTYPES = [torch.float16, torch.float32]

 # Tolerance map for different data types
 _TOLERANCE_MAP = {
    torch.float16: {"atol": 1e-4, "rtol": 1e-2},
+    torch.float32: {"atol": 1e-4, "rtol": 1e-3},
 }

+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE_X = auto()
+
+
+_INPLACE = [
+    Inplace.OUT_OF_PLACE,
+    Inplace.INPLACE_X,
+]
+
+_TEST_CASES = [
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
 DEBUG = False
 PROFILE = False
 NUM_PRERUN = 10
@@ -55,23 +73,14 @@ class RoPEDescriptor(Structure):
 infiniopRoPEDescriptor_t = POINTER(RoPEDescriptor)


-def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
-    ndim = x.ndim
-    assert 0 <= 1 < ndim
-    assert freqs_cis.shape == (x.shape[0], x.shape[-1])
-    shape = [d if i == 0 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
-    return freqs_cis.view(*shape)
-
-
-def rotary_embedding(t, pos, theta, torch_device):
+def rotary_embedding(t, sin, cos, torch_device):
    dh = t.shape[2]
+    dt = t.dtype
    assert dh % 2 == 0, "Embedding dimension must be even."
-    t_even = t[..., 0::2]  # [seq_len, n_head, dh // 2]
-    t_odd = t[..., 1::2]  # [seq_len, n_head, dh // 2]
-    freqs = (1.0 / (theta ** (torch.arange(0, dh, 2).float() / dh))).to(torch_device)
-    freqs = torch.outer(pos, freqs)  # [seq_len, dh // 2]
-    cos = torch.cos(freqs).unsqueeze(1)  # [seq_len, 1, dh // 2]
-    sin = torch.sin(freqs).unsqueeze(1)  # [seq_len, 1, dh // 2]
+    t_even = t[..., 0::2].float()  # [seq_len, n_head, dh // 2]
+    t_odd = t[..., 1::2].float()  # [seq_len, n_head, dh // 2]
+    cos = cos.unsqueeze(1).float()  # [seq_len, 1, dh // 2]
+    sin = sin.unsqueeze(1).float()  # [seq_len, 1, dh // 2]

    t_out_even = t_even * cos - t_odd * sin
    t_out_odd = t_even * sin + t_odd * cos
@@ -80,51 +89,56 @@ def rotary_embedding(t, pos, theta, torch_device):
    t_out[..., 0::2] = t_out_even
    t_out[..., 1::2] = t_out_odd

-    return t_out
+    return t_out.to(dt).to(torch_device)


-def sin_cos_table(max_seq_len, dim, torch_device, theta):
-    pos = torch.arange(
-        0, max_seq_len, dtype=torch.float32, device=torch.device(torch_device)
-    )
+def sin_cos_table(pos, dim, torch_device, theta, dtype):
+    assert dim % 2 == 0, "Embedding dimension must be even."
    freqs = (1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))).to(
        torch_device
    )
-    # (a0, a1, a2) -> (a0, a0, a1, a1, a2, a2)
-    freqs = torch.repeat_interleave(freqs, repeats=2)
    angles = torch.outer(pos, freqs)
-    return torch.sin(angles), torch.cos(angles)
-
-
-def test(lib, handle, torch_device, shape, strides=None, dtype=torch.float16):
+    return torch.sin(angles).to(dtype), torch.cos(angles).to(dtype)
+
+
+def test(
+    lib,
+    handle,
+    torch_device,
+    shape,
+    x_strides=None,
+    y_strides=None,
+    inplace=Inplace.OUT_OF_PLACE,
+    dtype=torch.float32,
+):
+    if inplace == Inplace.INPLACE_X:
+        y_strides = x_strides
    print(
-        f"Testing Rotary Positional Embedding on {torch_device} with shape:{shape} strides:{strides} and dtype:{dtype}"
+        f"Testing Rotary Positional Embedding on {torch_device} with shape:{shape} x_strides:{x_strides} y_strides:{y_strides} and dtype:{dtype} inplace:{inplace}"
    )

-    t = torch.rand(shape, dtype=dtype)
+    x = torch.rand(shape, dtype=dtype).to(torch_device)
+    x = rearrange_if_needed(x, x_strides)
+    if inplace == Inplace.INPLACE_X:
+        y = x
+    else:
+        y = torch.rand(shape, dtype=dtype).to(torch_device)
+        y = rearrange_if_needed(y, y_strides)
+    theta = 1e5
+    pos = torch.arange(0, x.shape[0], dtype=torch.int32).to(torch_device)
+    sin_table, cos_table = sin_cos_table(pos, x.shape[2], x.device, theta, dtype)

-    t = rearrange_if_needed(t, strides)
-
-    posTmp = torch.arange(0, t.shape[0]).to(torch_device)
-    pos = torch.zeros(2 * posTmp.shape[0], dtype=torch.int32)
-    for i in range(posTmp.shape[0]):
-        pos[2 * i] = posTmp[i]
-        pos[2 * i + 1] = 0
-    pos = pos.to(torch_device)
-    theta = 1e4
-
-    ans = rotary_embedding(t, posTmp, theta, torch_device)
+    ans = rotary_embedding(x, sin_table, cos_table, torch_device)

    descriptor = infiniopRoPEDescriptor_t()
-    # 2x table length for test
-    sin_table, cos_table = sin_cos_table(t.shape[0] * 2, t.shape[2], t.device, theta)
-
-    t_tensor, sin_table_tensor, cos_table_tensor = [
-        to_tensor(tensor, lib) for tensor in [t, sin_table, cos_table]
+    x_tensor, pos_tensor, sin_table_tensor, cos_table_tensor = [
+        to_tensor(tensor, lib, force_unsigned=True)
+        for tensor in [x, pos, sin_table, cos_table]
    ]
-
-    pos_tensor = to_tensor(pos[: t.shape[0]], lib)
-    pos_tensor.descriptor.contents.dtype = InfiniDtype.U64
+    if inplace == Inplace.INPLACE_X:
+        y_tensor = x_tensor
+    else:
+        y_tensor = to_tensor(y, lib)

    if torch_device == "npu":
        synchronize_device(torch_device)
@@ -133,7 +147,8 @@ def test(lib, handle, torch_device, shape, strides=None, dtype=torch.float16):
        lib.infiniopCreateRoPEDescriptor(
            handle,
            ctypes.byref(descriptor),
-            t_tensor.descriptor,
+            y_tensor.descriptor,
+            x_tensor.descriptor,
            pos_tensor.descriptor,
            sin_table_tensor.descriptor,
            cos_table_tensor.descriptor,
@@ -141,14 +156,14 @@ def test(lib, handle, torch_device, shape, strides=None, dtype=torch.float16):
    )

    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
-    for tensor in [t_tensor, pos_tensor, sin_table_tensor, cos_table_tensor]:
-        tensor.descriptor.contents.invalidate()
+    for tensor in [y_tensor, x_tensor, pos_tensor, sin_table_tensor, cos_table_tensor]:
+        tensor.destroyDesc(lib)

    workspace_size = c_uint64(0)
    check_error(
        lib.infiniopGetRoPEWorkspaceSize(descriptor, ctypes.byref(workspace_size))
    )
-    workspace = create_workspace(workspace_size.value, t.device)
+    workspace = create_workspace(workspace_size.value, x.device)

    def lib_rope():
        check_error(
@@ -156,7 +171,8 @@ def test(lib, handle, torch_device, shape, strides=None, dtype=torch.float16):
                descriptor,
                workspace.data_ptr() if workspace is not None else None,
                workspace_size.value,
-                t_tensor.data,
+                y_tensor.data,
+                x_tensor.data,
                pos_tensor.data,
                sin_table_tensor.data,
                cos_table_tensor.data,
@@ -168,13 +184,13 @@ def test(lib, handle, torch_device, shape, strides=None, dtype=torch.float16):

    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
    if DEBUG:
-        debug(t, ans, atol=atol, rtol=rtol)
-    assert torch.allclose(t, ans, atol=atol, rtol=rtol)
+        debug(y, ans, atol=atol, rtol=rtol)
+    assert torch.allclose(y, ans, atol=atol, rtol=rtol)

    if PROFILE:
        profile_operation(
            "PyTorch",
-            lambda: rotary_embedding(t, posTmp, theta, torch_device),
+            lambda: rotary_embedding(x, pos, theta, torch_device),
            torch_device,
            NUM_PRERUN,
            NUM_ITERATIONS,