Unverified Commit 0166515c authored by PanZezhong1725's avatar PanZezhong1725 Committed by GitHub
Browse files

Merge branch 'main' into issue/300

parents f0300ff3 a23c4d13
#include "infinirt_maca.h"
#include "infinirt_metax.h"
#include "../../utils.h"
#include <hcr/hc_runtime.h>
#include <hcr/hc_runtime_api.h>
#define CHECK_MACART(RT_API) CHECK_INTERNAL(RT_API, hcSuccess)
namespace infinirt::maca {
namespace infinirt::metax {
infiniStatus_t getDeviceCount(int *count) {
CHECK_MACART(hcGetDeviceCount(count));
return INFINI_STATUS_SUCCESS;
......@@ -124,4 +124,4 @@ infiniStatus_t freeAsync(void *ptr, infinirtStream_t stream) {
CHECK_MACART(hcFreeAsync(ptr, (hcStream_t)stream));
return INFINI_STATUS_SUCCESS;
}
} // namespace infinirt::maca
} // namespace infinirt::metax
......@@ -2,12 +2,12 @@
#define __INFINIRT_MACA_H__
#include "../infinirt_impl.h"
namespace infinirt::maca {
namespace infinirt::metax {
#ifdef ENABLE_METAX_API
INFINIRT_DEVICE_API_IMPL
#else
INFINIRT_DEVICE_API_NOOP
#endif
} // namespace infinirt::maca
} // namespace infinirt::metax
#endif // __INFINIRT_MACA_H__
......@@ -65,11 +65,11 @@ Name: test.0.ans, NDims: 2, Shape: [6, 4], DataType: F64, DataOffset: 320
- `Meta` 中必须包含 `test_count` ,表示测例数量。
- 每个测例的 `Meta` 和 `Tensor` 名字以 `test.[id].` 开头,后接具体信息名称。数字 `[id]` 表示测例编号,编号必须为 0 到 test_count-1。
- `Tensor` 名字接 `.strides` 表示步长,若没有则默认为连续。
- 注意:gguf 中的 shape 和 stride 的存储方向是反向的,第一个数代表最后一维。
### GGUF测例构建要求
不参与计算的 `Tensor` 不应存储数据,避免 `GGUF` 文件中出现冗余内容。
此类 `Tensor` 应使用 `np.empty(tuple(0 for _ in shape), dtype=dtype)` 构造其数据字段, 且 `GGUF` 需存储此张量的形状数据 `.shape`、步长数据 `.strides`,否则无法成功构建,可使用 `contiguous_gguf_strides(shape)` 计算步长数据。
对于 `Elementwise` 算子,需包含零步长(zero-stride)测试。对于步长为0的张量,`GGUF` 不应存储冗余广播数据,可使用 `process_zero_stride_tensor`进行冗余数据移除,同时必须在 `GGUF` 中提供此张量的实际形状数据 `.shape`,否则无法成功构建。
\ No newline at end of file
对于 `Elementwise` 算子,需包含零步长(zero-stride)测试。对于步长为0的张量,`GGUF` 不应存储冗余广播数据,可使用 `process_zero_stride_tensor`进行冗余数据移除,同时必须在 `GGUF` 中提供此张量的实际形状数据 `.shape`,否则无法成功构建。
import gguf
from typing import List
import gguf
import numpy as np
from gguf import GGMLQuantizationType
from ml_dtypes import bfloat16
def np_dtype_to_ggml(tensor_dtype: np.dtype):
if tensor_dtype == np.float16:
if tensor_dtype == bfloat16:
return GGMLQuantizationType.BF16
elif tensor_dtype == np.float16:
return GGMLQuantizationType.F16
elif tensor_dtype == np.float32:
return GGMLQuantizationType.F32
elif tensor_dtype == np.float64:
return GGMLQuantizationType.F64
elif tensor_dtype == np.bool:
return GGMLQuantizationType.Q8_K
elif tensor_dtype == np.int8:
return GGMLQuantizationType.I8
elif tensor_dtype == np.int16:
......@@ -21,7 +27,7 @@ def np_dtype_to_ggml(tensor_dtype: np.dtype):
return GGMLQuantizationType.I64
else:
raise ValueError(
"Only F16, F32, F64, I8, I16, I32, I64 tensors are supported for now"
"Only BF16, F16, F32, F64, BOOL, I8, I16, I32, I64 tensors are supported for now"
)
......@@ -37,6 +43,7 @@ def contiguous_gguf_strides(shape: tuple[int, ...]) -> list[int]:
acc *= size
return strides[::-1]
def process_zero_stride_tensor(tensor, stride=None):
if stride:
slices = tuple(slice(0, 1) if s == 0 else slice(None) for s in stride)
......@@ -44,6 +51,7 @@ def process_zero_stride_tensor(tensor, stride=None):
else:
return tensor
class InfiniopTestCase:
op_name: str
......
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
from ctypes import c_uint64
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
rearrange_if_needed,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
create_workspace,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
......@@ -58,12 +59,13 @@ _TEST_CASES = [
]
# Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.float32]
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-3, "rtol": 1e-3},
torch.float32: {"atol": 1e-7, "rtol": 1e-7},
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7},
InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3},
}
DEBUG = False
......@@ -72,52 +74,13 @@ NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class AddDescriptor(Structure):
    # Opaque ctypes mirror of the C library's Add descriptor.  Only the
    # leading device field is declared; the rest of the struct is owned and
    # managed by the library, so Python never touches it.
    _fields_ = [("device", c_int32)]


# Pointer type passed across the C ABI for all Add descriptor calls.
infiniopAddDescriptor_t = POINTER(AddDescriptor)
def add(ans, x, y):
    """Reference elementwise addition: write ``x + y`` into the preallocated ``ans``."""
    torch.add(input=x, other=y, out=ans)
def process_tensors(c, c_strides, a, a_stride, b, b_stride, inplace):
    """
    rearrange the tensors if needed and apply the inplace config.
    if inplace is true and the output (i.e., c) is placed to the broadcasted input,
    the inplace config is ignored and out-of-place is used
    """
    # Remember the unbroadcasted output strides so they can be restored below
    # after the output may have been redirected to a broadcasted input.
    original_c_strides = c_strides if c_strides else c.stride()

    def _rearrange(tensor, strides):
        # A 0 in the strides encodes broadcasting; a broadcast layout cannot be
        # produced by rearranging data, so rebind the existing storage with
        # set_ instead of copying.
        if strides and 0 in strides:
            tensor.set_(tensor.untyped_storage(), 0, tensor.shape, strides)
            return tensor
        else:
            return rearrange_if_needed(tensor, strides)

    a, b, c = [
        _rearrange(tensor, stride)
        for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_strides])
    ]
    # Redirect the output to one of the inputs when an inplace mode is requested.
    c = (
        c
        if inplace == Inplace.OUT_OF_PLACE
        else (a if inplace == Inplace.INPLACE_A else b)
    )
    # if inplace is true and c has broadcasted config, reset it to the original unbroadcasted strides
    if 0 in c.stride():
        c.set_(c.untyped_storage(), 0, c.shape, original_c_strides)
    return a, b, c
def add(c, a, b):
    """Reference elementwise addition: store ``a + b`` into the preallocated ``c``."""
    torch.add(input=a, other=b, out=c)
def test(
lib,
handle,
torch_device,
device,
shape,
a_stride=None,
b_stride=None,
......@@ -126,58 +89,64 @@ def test(
dtype=torch.float16,
sync=None,
):
a = TestTensor(shape, a_stride, dtype, device)
b = TestTensor(shape, b_stride, dtype, device)
if inplace == Inplace.INPLACE_A:
if a_stride != c_stride:
return
c = a
elif inplace == Inplace.INPLACE_B:
if c_stride != b_stride:
return
c = b
else:
c = TestTensor(shape, c_stride, dtype, device, mode="ones")
if c.is_broadcast():
return
print(
f"Testing Add on {torch_device} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
f"dtype:{dtype} inplace:{inplace}"
f"Testing Add on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
)
a = torch.rand(shape, dtype=dtype).to(torch_device)
b = torch.rand(shape, dtype=dtype).to(torch_device)
c = torch.rand(shape, dtype=dtype).to(torch_device)
ans = torch.zeros(shape, dtype=dtype).to(torch_device)
a, b, c = process_tensors(c, c_stride, a, a_stride, b, b_stride, inplace)
add(c.torch_tensor(), a.torch_tensor(), b.torch_tensor())
add(ans, a, b)
a_tensor, b_tensor = [to_tensor(tensor, lib) for tensor in [a, b]]
c_tensor = (
to_tensor(c, lib)
if inplace == Inplace.OUT_OF_PLACE
else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
)
if sync is not None:
sync()
descriptor = infiniopAddDescriptor_t()
descriptor = infiniopOperatorDescriptor_t()
check_error(
lib.infiniopCreateAddDescriptor(
LIBINFINIOP.infiniopCreateAddDescriptor(
handle,
ctypes.byref(descriptor),
c_tensor.descriptor,
a_tensor.descriptor,
b_tensor.descriptor,
c.descriptor,
a.descriptor,
b.descriptor,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [a_tensor, b_tensor, c_tensor]:
tensor.destroyDesc(lib)
for tensor in [a, b, c]:
tensor.destroy_desc()
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetAddWorkspaceSize(descriptor, ctypes.byref(workspace_size))
LIBINFINIOP.infiniopGetAddWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = create_workspace(workspace_size.value, c.device)
workspace = TestWorkspace(workspace_size.value, c.device)
def lib_add():
check_error(
lib.infiniopAdd(
LIBINFINIOP.infiniopAdd(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace_size.value,
c_tensor.data,
a_tensor.data,
b_tensor.data,
workspace.data(),
workspace.size(),
c.data(),
a.data(),
b.data(),
None,
)
)
......@@ -186,52 +155,20 @@ def test(
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(c, ans, atol=atol, rtol=rtol)
assert torch.allclose(c, ans, atol=atol, rtol=rtol)
debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: add(ans, a, b), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_add(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation("PyTorch", lambda: add(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_add(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroyAddDescriptor(descriptor))
check_error(LIBINFINIOP.infiniopDestroyAddDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
lib = open_lib()
lib.infiniopCreateAddDescriptor.restype = c_int32
lib.infiniopCreateAddDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopAddDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetAddWorkspaceSize.restype = c_int32
lib.infiniopGetAddWorkspaceSize.argtypes = [
infiniopAddDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopAdd.restype = c_int32
lib.infiniopAdd.argtypes = [
infiniopAddDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyAddDescriptor.restype = c_int32
lib.infiniopDestroyAddDescriptor.argtypes = [
infiniopAddDescriptor_t,
]
# Configure testing options
DEBUG = args.debug
......@@ -240,6 +177,6 @@ if __name__ == "__main__":
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p
from ctypes import c_uint64
import ctypes
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from libinfiniop import (
open_lib,
to_tensor,
infiniopHandle_t,
infiniopTensorDescriptor_t,
check_error,
rearrange_tensor,
create_workspace,
get_args,
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
import torch
class AttentionDescriptor(Structure):
    # Opaque ctypes mirror of the C library's Attention descriptor.  Only the
    # leading device field is declared; the remaining layout is owned by the
    # library and never accessed from Python.
    _fields_ = [("device", c_int32)]


# Pointer type passed across the C ABI for all Attention descriptor calls.
infiniopAttentionDescriptor_t = POINTER(AttentionDescriptor)
def causal_softmax(x):
type = x.dtype
mask = torch.tril(torch.ones_like(x), diagonal=-1).flip(dims=[-2, -1])
......@@ -85,9 +79,8 @@ def attention(q, k, v, k_cache, v_cache, pos):
def test(
lib,
handle,
torch_device,
device,
n_q_head,
n_kv_head,
seq_len,
......@@ -100,94 +93,79 @@ def test(
v_stride=None,
k_cache_stride=None,
v_cache_stride=None,
dtype=torch.float16,
dtype=InfiniDtype.F16,
sync=None,
):
print(
f"Testing Attention on {torch_device} with n_q_head:{n_q_head} n_kv_head:{n_kv_head} seq_len:{seq_len} head_dim:{head_dim} pos:{pos} "
f"dtype:{dtype} q_stride:{q_stride} k_stride:{k_stride} v_stride:{v_stride} k_cache_stride:{k_cache_stride} v_cache_stride:{v_cache_stride}"
f"Testing Attention on {InfiniDeviceNames[device]} with n_q_head:{n_q_head} n_kv_head:{n_kv_head} seq_len:{seq_len} head_dim:{head_dim} pos:{pos} "
f"dtype:{InfiniDtypeNames[dtype]} q_stride:{q_stride} k_stride:{k_stride} v_stride:{v_stride} k_cache_stride:{k_cache_stride} v_cache_stride:{v_cache_stride}"
)
out = torch.zeros([seq_len, n_q_head, head_dim], dtype=dtype, device=torch_device)
q = torch.rand([n_q_head, seq_len, head_dim], dtype=dtype).to(torch_device) * 0.1
k = torch.rand([n_kv_head, seq_len, head_dim], dtype=dtype).to(torch_device) * 0.1
v = torch.rand([n_kv_head, seq_len, head_dim], dtype=dtype).to(torch_device) * 0.1
k_cache = (
torch.rand([n_kv_head, k_cache_buf_len, head_dim], dtype=dtype).to(torch_device)
* 0.1
out = TestTensor([seq_len, n_q_head, head_dim], None, dtype, device, mode="zeros")
q = TestTensor([n_q_head, seq_len, head_dim], q_stride, dtype, device, scale=0.1)
k = TestTensor([n_kv_head, seq_len, head_dim], k_stride, dtype, device, scale=0.1)
v = TestTensor([n_kv_head, seq_len, head_dim], v_stride, dtype, device, scale=0.1)
k_cache = TestTensor(
[n_kv_head, k_cache_buf_len, head_dim], k_cache_stride, dtype, device, scale=0.1
)
v_cache = (
torch.rand([n_kv_head, v_cache_buf_len, head_dim], dtype=dtype).to(torch_device)
* 0.1
v_cache = TestTensor(
[n_kv_head, v_cache_buf_len, head_dim], v_cache_stride, dtype, device, scale=0.1
)
ans = attention(q, k, v, k_cache, v_cache, pos)
if q_stride is not None:
q = rearrange_tensor(q, q_stride)
if k_stride is not None:
k = rearrange_tensor(k, k_stride)
if v_stride is not None:
v = rearrange_tensor(v, v_stride)
if k_cache_stride is not None:
k_cache = rearrange_tensor(k_cache, k_cache_stride)
if v_cache_stride is not None:
v_cache = rearrange_tensor(v_cache, v_cache_stride)
out_tensor = to_tensor(out, lib)
q_tensor = to_tensor(q, lib)
k_tensor = to_tensor(k, lib)
v_tensor = to_tensor(v, lib)
k_cache_tensor = to_tensor(k_cache, lib)
v_cache_tensor = to_tensor(v_cache, lib)
def torch_attention():
return attention(
q.torch_tensor(),
k.torch_tensor(),
v.torch_tensor(),
k_cache.torch_tensor(),
v_cache.torch_tensor(),
pos,
)
ans = torch_attention()
if sync is not None:
sync()
descriptor = infiniopAttentionDescriptor_t()
descriptor = infiniopOperatorDescriptor_t()
check_error(
lib.infiniopCreateAttentionDescriptor(
LIBINFINIOP.infiniopCreateAttentionDescriptor(
handle,
ctypes.byref(descriptor),
out_tensor.descriptor,
q_tensor.descriptor,
k_tensor.descriptor,
v_tensor.descriptor,
k_cache_tensor.descriptor,
v_cache_tensor.descriptor,
out.descriptor,
q.descriptor,
k.descriptor,
v.descriptor,
k_cache.descriptor,
v_cache.descriptor,
pos,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [
out_tensor,
q_tensor,
k_tensor,
v_tensor,
k_cache_tensor,
v_cache_tensor,
]:
tensor.destroyDesc(lib)
for tensor in [out, q, k, v, k_cache, v_cache]:
tensor.destroy_desc()
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetAttentionWorkspaceSize(descriptor, ctypes.byref(workspace_size))
LIBINFINIOP.infiniopGetAttentionWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = create_workspace(workspace_size.value, out.device)
workspace = TestWorkspace(workspace_size.value, out.device)
def lib_attention():
check_error(
lib.infiniopAttention(
LIBINFINIOP.infiniopAttention(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace.data(),
workspace_size.value,
out_tensor.data,
q_tensor.data,
k_tensor.data,
v_tensor.data,
k_cache_tensor.data,
v_cache_tensor.data,
out.data(),
q.data(),
k.data(),
v.data(),
k_cache.data(),
v_cache.data(),
None,
)
)
......@@ -197,25 +175,25 @@ def test(
# Validate results
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(out, ans, atol=atol, rtol=rtol)
assert torch.allclose(out, ans, atol=atol, rtol=rtol)
debug(out.actual_tensor(), ans, atol=atol, rtol=rtol)
assert torch.allclose(out.actual_tensor(), ans, atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: attention(q, k, v, k_cache, v_cache, pos), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_attention(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation("PyTorch", lambda: torch_attention(), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_attention(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroyAttentionDescriptor(descriptor))
check_error(LIBINFINIOP.infiniopDestroyAttentionDescriptor(descriptor))
if __name__ == "__main__":
_TENSOR_DTYPES = [torch.float16, torch.float32]
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-4, "rtol": 1e-2},
torch.float32: {"atol": 1e-5, "rtol": 1e-3},
InfiniDtype.F16: {"atol": 1e-4, "rtol": 1e-2},
InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-3},
}
DEBUG = False
......@@ -284,45 +262,6 @@ if __name__ == "__main__":
),
]
args = get_args()
lib = open_lib()
lib.infiniopCreateAttentionDescriptor.restype = c_int32
lib.infiniopCreateAttentionDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopAttentionDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
c_uint64,
]
lib.infiniopGetAttentionWorkspaceSize.restype = c_int32
lib.infiniopGetAttentionWorkspaceSize.argtypes = [
infiniopAttentionDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopAttention.restype = c_int32
lib.infiniopAttention.argtypes = [
infiniopAttentionDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyAttentionDescriptor.restype = c_int32
lib.infiniopDestroyAttentionDescriptor.argtypes = [
infiniopAttentionDescriptor_t,
]
# Configure testing options
DEBUG = args.debug
......@@ -332,5 +271,5 @@ if __name__ == "__main__":
# Execute tests
for device in get_test_devices(args):
test_operator(lib, device, test, test_cases, _TENSOR_DTYPES)
test_operator(device, test, test_cases, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
)
from operatorspy.tests.test_utils import get_args
import torch
from typing import Tuple
# constant for control whether profile the pytorch and lib functions
# NOTE: need to manually add synchronization function to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class AvgPoolDescriptor(Structure):
    # Opaque ctypes mirror of the C library's AvgPool descriptor.  Only the
    # leading device field is declared; the rest of the struct is owned by
    # the library.
    _fields_ = [("device", c_int32)]


# Pointer type passed across the C ABI for all AvgPool descriptor calls.
infiniopAvgPoolDescriptor_t = POINTER(AvgPoolDescriptor)
def pool(x, k, padding, stride, dilation=1):
    """Compute the PyTorch average-pooling reference for ``x``.

    Dispatches to AvgPool1d/2d/3d based on the number of spatial dims
    (``x`` is assumed to be batch x channels x spatial...).  Returns None
    (after printing an error) for unsupported ranks.  ``dilation`` is kept
    for signature compatibility but is not used by torch's AvgPool layers.
    """
    layer_by_ndim = {
        1: torch.nn.AvgPool1d,
        2: torch.nn.AvgPool2d,
        3: torch.nn.AvgPool3d,
    }
    spatial_ndim = len(x.shape) - 2
    layer_cls = layer_by_ndim.get(spatial_ndim)
    if layer_cls is None:
        print("Error: Pytorch -> Unsupported tensor dimension")
        return None
    pool_layer = layer_cls(k, stride=stride, padding=padding)
    if spatial_ndim == 3 and x.dtype == torch.float16:
        # 3-D fp16 pooling is computed in fp32 and cast back.
        ans = pool_layer(x.to(torch.float32)).to(torch.float16)
    else:
        ans = pool_layer(x)
    if PROFILE:
        torch.cuda.synchronize()
    return ans
def inferShape(x_shape, kernel_shape, padding, strides):
    """Infer the pooled output shape from the input shape and pooling params.

    The first two dims (batch, channels) are passed through; each spatial dim
    is reduced by the standard pooling formula (dim + 2*pad - k) // stride + 1.
    """
    assert (
        len(x_shape) - 2 == len(kernel_shape) == len(padding) == len(strides)
    ), "kernel, pads, and strides should have the same length; the length of input x should be 2 more than that of kernel"
    spatial = [
        (dim + 2 * p - k) // s + 1
        for dim, k, p, s in zip(x_shape[2:], kernel_shape, padding, strides)
    ]
    return x_shape[:2] + tuple(spatial)
# convert a python tuple to a ctype void pointer
def tuple_to_void_p(py_tuple: Tuple):
    """Convert a python tuple of ints to a ctypes void pointer (c_int64 array).

    The backing array is attached to the returned pointer object so it stays
    alive as long as the pointer does.  In the original version the array was
    a local that could be garbage-collected as soon as the function returned,
    leaving the C side with a dangling pointer.
    """
    data_array = (ctypes.c_int64 * len(py_tuple))(*py_tuple)
    ptr = ctypes.cast(data_array, ctypes.c_void_p)
    # Keep a reference so the buffer outlives this call.
    ptr._keepalive = data_array
    return ptr
def test(
    lib,
    handle,
    torch_device,
    x_shape,
    k_shape,
    padding,
    strides,
    tensor_dtype=torch.float16,
    sync=None
):
    """Run one AvgPool case and compare the library kernel against PyTorch.

    Computes a PyTorch reference with `pool`, creates the library descriptor,
    queries and allocates the workspace, launches the kernel, and asserts
    the two outputs match within tolerance.  When PROFILE is set, both the
    PyTorch and library paths are additionally timed.
    """
    print(
        f"Testing AvgPool on {torch_device} with x_shape:{x_shape} kernel_shape:{k_shape} padding:{padding} strides:{strides} dtype:{tensor_dtype}"
    )

    x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
    # y receives the library output; its shape follows from the pooling params.
    y = torch.rand(
        inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype
    ).to(torch_device)

    # PyTorch reference (warm-up iterations only matter when profiling).
    for i in range(NUM_PRERUN if PROFILE else 1):
        ans = pool(x, k_shape, padding, strides)
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            _ = pool(x, k_shape, padding, strides)
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f"pytorch time: {elapsed :6f}")

    x_tensor = to_tensor(x, lib)
    y_tensor = to_tensor(y, lib)
    if sync is not None:
        sync()

    descriptor = infiniopAvgPoolDescriptor_t()
    check_error(
        lib.infiniopCreateAvgPoolDescriptor(
            handle,
            ctypes.byref(descriptor),
            y_tensor.descriptor,
            x_tensor.descriptor,
            tuple_to_void_p(k_shape),
            tuple_to_void_p(padding),
            tuple_to_void_p(strides),
            len(k_shape),
        )
    )

    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    x_tensor.descriptor.contents.invalidate()
    y_tensor.descriptor.contents.invalidate()

    # Query the kernel's scratch requirement and back it with a uint8 tensor
    # on the same device.
    workspaceSize = ctypes.c_uint64(0)
    check_error(
        lib.infiniopGetAvgPoolWorkspaceSize(descriptor, ctypes.byref(workspaceSize))
    )
    workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(
        torch_device
    )
    workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))

    # Library kernel (again, extra iterations only when profiling).
    for i in range(NUM_PRERUN if PROFILE else 1):
        check_error(
            lib.infiniopAvgPool(
                descriptor,
                workspace_ptr,
                workspaceSize,
                y_tensor.data,
                x_tensor.data,
                None,
            )
        )
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            check_error(
                lib.infiniopAvgPool(
                    descriptor,
                    workspace_ptr,
                    workspaceSize,
                    y_tensor.data,
                    x_tensor.data,
                    None,
                )
            )
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f" lib time: {elapsed :6f}")

    # Relative-tolerance-only comparison against the PyTorch reference.
    assert torch.allclose(y, ans, atol=0, rtol=1e-3)
    check_error(lib.infiniopDestroyAvgPoolDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every AvgPool test case on the CPU device in fp16 and fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for x_shape, kernel_shape, padding, strides in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=dt)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every AvgPool test case on the CUDA device in fp16 and fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for x_shape, kernel_shape, padding, strides in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=dt)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every AvgPool test case on the BANG (MLU) device in fp16 and fp32."""
    import torch_mlu

    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for x_shape, kernel_shape, padding, strides in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=dt)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    # Test configurations covering 1-D, 2-D and 3-D average pooling.
    test_cases = [
        # fmt: off
        # x_shape, kernel_shape, padding, strides
        ((1, 1, 10), (3,), (1,), (1,)),
        ((32, 3, 224, 224), (3, 3), (1, 1), (2, 2)),
        ((1, 1, 16, 16, 16), (5, 5, 5), (2, 2, 2), (2, 2, 2)),
        # fmt: on
    ]
    args = get_args()
    lib = open_lib()

    # Declare ctypes prototypes for the AvgPool C API so arguments are
    # marshalled (and lightly checked) on every call.
    lib.infiniopCreateAvgPoolDescriptor.restype = c_int32
    lib.infiniopCreateAvgPoolDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopAvgPoolDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        c_void_p,
        c_void_p,
        c_void_p,
        c_uint64,
    ]
    lib.infiniopGetAvgPoolWorkspaceSize.restype = c_int32
    lib.infiniopGetAvgPoolWorkspaceSize.argtypes = [
        infiniopAvgPoolDescriptor_t,
        POINTER(c_uint64),
    ]
    lib.infiniopAvgPool.restype = c_int32
    lib.infiniopAvgPool.argtypes = [
        infiniopAvgPoolDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyAvgPoolDescriptor.restype = c_int32
    lib.infiniopDestroyAvgPoolDescriptor.argtypes = [
        infiniopAvgPoolDescriptor_t,
    ]

    # Run on each device selected by the CLI flags; default to CPU when no
    # device flag is given.
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    if not (args.cpu or args.cuda or args.bang):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float
from ctypes import c_uint64
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
rearrange_if_needed,
create_workspace,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
......@@ -34,13 +35,13 @@ _TEST_CASES_ = [
]
# Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.bfloat16, torch.float32]
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-3, "rtol": 1e-2},
torch.bfloat16: {"atol": 5e-3, "rtol": 5e-2},
torch.float32: {"atol": 1e-5, "rtol": 1e-5},
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2},
InfiniDtype.BF16: {"atol": 5e-3, "rtol": 5e-2},
InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5},
}
......@@ -66,13 +67,6 @@ NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class CausalSoftmaxDescriptor(Structure):
    # Opaque ctypes mirror of the C library's CausalSoftmax descriptor.
    # Only the leading device field is declared; the rest of the struct is
    # owned by the library.
    _fields_ = [("device", c_int32)]


# Pointer type passed across the C ABI for all CausalSoftmax descriptor calls.
infiniopCausalSoftmaxDescriptor_t = POINTER(CausalSoftmaxDescriptor)
def causal_softmax(x):
type = x.dtype
mask = torch.tril(torch.ones_like(x), diagonal=-1).flip(dims=[-2, -1])
......@@ -81,66 +75,57 @@ def causal_softmax(x):
def test(
lib,
handle,
torch_device,
device,
shape,
x_stride=None,
y_stride=None,
inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float16,
dtype=InfiniDtype.F16,
sync=None,
):
print(
f"Testing CausalSoftmax on {torch_device} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{dtype} inplace:{inplace}"
f"Testing CausalSoftmax on {InfiniDeviceNames[device]} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
)
x = torch.rand(shape, dtype=dtype).to(torch_device)
mask = torch.tril(torch.ones_like(x), diagonal=-1).flip(dims=[-2, -1])
x = torch.where(mask == 1, torch.full_like(x, torch.finfo(x.dtype).max), x)
ans = causal_softmax(x)
x = rearrange_if_needed(x, x_stride)
x_tensor = to_tensor(x, lib)
x = TestTensor(shape, x_stride, dtype, device)
ans = causal_softmax(x.torch_tensor())
if inplace == Inplace.INPLACE_X:
y = x
y_tensor = x_tensor
else:
y = torch.zeros(shape, dtype=dtype).to(torch_device)
y = rearrange_if_needed(y, y_stride)
y_tensor = to_tensor(y, lib)
y = TestTensor(shape, x_stride, dtype, device)
if sync is not None:
sync()
descriptor = infiniopCausalSoftmaxDescriptor_t()
descriptor = infiniopOperatorDescriptor_t()
check_error(
lib.infiniopCreateCausalSoftmaxDescriptor(
handle, ctypes.byref(descriptor), y_tensor.descriptor, x_tensor.descriptor
LIBINFINIOP.infiniopCreateCausalSoftmaxDescriptor(
handle, ctypes.byref(descriptor), y.descriptor, x.descriptor
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
x_tensor.destroyDesc(lib)
x.destroy_desc()
y.destroy_desc()
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetCausalSoftmaxWorkspaceSize(
LIBINFINIOP.infiniopGetCausalSoftmaxWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = create_workspace(workspace_size.value, x.device)
workspace = TestWorkspace(workspace_size.value, x.device)
def lib_causal_softmax():
check_error(
lib.infiniopCausalSoftmax(
LIBINFINIOP.infiniopCausalSoftmax(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace.data(),
workspace_size.value,
y_tensor.data,
x_tensor.data,
y.data(),
x.data(),
None,
)
)
......@@ -152,49 +137,21 @@ def test(
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(y, ans, atol=atol, rtol=rtol)
assert torch.allclose(y, ans, atol=atol, rtol=rtol)
debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: causal_softmax(x), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_causal_softmax(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation("PyTorch", lambda: causal_softmax(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_causal_softmax(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroyCausalSoftmaxDescriptor(descriptor))
check_error(LIBINFINIOP.infiniopDestroyCausalSoftmaxDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
lib = open_lib()
lib.infiniopCreateCausalSoftmaxDescriptor.restype = c_int32
lib.infiniopCreateCausalSoftmaxDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopCausalSoftmaxDescriptor_t),
infiniopTensorDescriptor_t,
]
lib.infiniopGetCausalSoftmaxWorkspaceSize.restype = c_int32
lib.infiniopGetCausalSoftmaxWorkspaceSize.argtypes = [
infiniopCausalSoftmaxDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopCausalSoftmax.restype = c_int32
lib.infiniopCausalSoftmax.argtypes = [
infiniopCausalSoftmaxDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
]
lib.infiniopDestroyCausalSoftmaxDescriptor.restype = c_int32
lib.infiniopDestroyCausalSoftmaxDescriptor.argtypes = [
infiniopCausalSoftmaxDescriptor_t,
]
# Configure testing options
DEBUG = args.debug
......@@ -203,6 +160,6 @@ if __name__ == "__main__":
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
......@@ -2,21 +2,22 @@
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float
from ctypes import c_uint64
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
rearrange_if_needed,
create_workspace,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
......@@ -38,29 +39,26 @@ _TEST_CASES_ = [
((5, 10), None, None, -2.0, 0.0),
((2, 3, 4), None, None, -2.0, 0.0),
# 奇怪形状测试
((7, 13), None, None, -1.0, 1.0), # 质数维度
((3, 5, 7), None, None, -1.0, 1.0), # 三维质数
((7, 13), None, None, -1.0, 1.0), # 质数维度
((3, 5, 7), None, None, -1.0, 1.0), # 三维质数
# 非标准形状测试
((1, 1), None, None, -1.0, 1.0), # 最小形状
((100, 100), None, None, -1.0, 1.0), # 大形状
((16, 16, 16), None, None, -1.0, 1.0), # 大三维
((1, 1), None, None, -1.0, 1.0), # 最小形状
((100, 100), None, None, -1.0, 1.0), # 大形状
((16, 16, 16), None, None, -1.0, 1.0), # 大三维
# 极端值测试
((10,), None, None, -1000.0, 1000.0), # 大范围
((10,), None, None, -0.001, 0.001), # 小范围
((10,), None, None, 0.0, 0.0), # min=max
# 特殊形状测试
((0,), None, None, -1.0, 1.0), # 空张量
((1, 0), None, None, -1.0, 1.0), # 空维度
((10,), None, None, -0.001, 0.001), # 小范围
((10,), None, None, 0.0, 0.0), # min=max
]
_TENSOR_DTYPES = [torch.float16, torch.float32]
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-3, "rtol": 1e-3},
torch.float32: {"atol": 1e-7, "rtol": 1e-6},
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-6},
InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3},
}
......@@ -86,154 +84,108 @@ NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class ClipDescriptor(Structure):
    # Opaque ctypes mirror of the C library's Clip descriptor.  Unlike the
    # other descriptors in this suite it declares both a device type and a
    # device id field; the rest of the struct is owned by the library.
    _fields_ = [("device_type", c_int32), ("device_id", c_int32)]


# Pointer type passed across the C ABI for all Clip descriptor calls.
infiniopClipDescriptor_t = POINTER(ClipDescriptor)
def clip(x, min_val, max_val):
    """Return ``x`` with every element clamped into ``[min_val, max_val]``."""
    return torch.clamp(input=x, min=min_val, max=max_val)
def create_tensor_with_stride(shape, stride, dtype, device):
    """Create a random tensor with values in [-2, 2] and a best-effort stride.

    Only 2-D layouts are honored explicitly:
      * row-major ``(shape[1], 1)`` — a plain contiguous tensor;
      * column-major ``(1, shape[0])`` — storage transposed, then viewed back.
    Any other requested stride (or non-2-D shape) falls back to a contiguous
    tensor, matching the original behavior.
    """
    x = torch.rand(shape, dtype=dtype, device=device) * 4.0 - 2.0  # range [-2, 2]
    if stride is None:
        return x
    if len(shape) == 2 and len(stride) == 2:
        if stride == (shape[1], 1):
            # Row-major request: torch.rand already yields this layout.
            return x.contiguous()
        elif stride == (1, shape[0]):
            # Column-major request: materialize transposed storage, view back.
            return x.transpose(0, 1).contiguous().transpose(0, 1)
        else:
            # Arbitrary strides are unsupported. The original copied every
            # element in a Python double loop (O(n^2)) and then called
            # .contiguous() — equivalent to a contiguous clone, done here
            # in a single C-level copy.
            return x.clone().contiguous()
    return x
def clip(y, x, min_val, max_val):
    """Clamp ``x`` element-wise into [min_val, max_val], storing the result in ``y``.

    Returns None; ``y`` is mutated in place as the reference output.
    """
    y.copy_(x.clamp(min_val, max_val))
def test(
lib,
handle,
torch_device,
device,
shape,
x_stride=None,
y_stride=None,
min_val=-1.0,
max_val=1.0,
inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float32,
dtype=InfiniDtype.F32,
sync=None,
):
print(
f"Testing Clip on {torch_device} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} "
f"min_val:{min_val} max_val:{max_val} dtype:{dtype} inplace:{inplace}"
x = TestTensor(shape, x_stride, dtype, device)
min_ = TestTensor(
shape, [0 for _ in shape], dtype, device, mode="zeros", bias=min_val
)
max_ = TestTensor(
shape, [0 for _ in shape], dtype, device, mode="zeros", bias=max_val
)
x = create_tensor_with_stride(shape, x_stride, dtype, torch_device)
ans = clip(x, min_val, max_val)
x = rearrange_if_needed(x, x_stride)
x_tensor = to_tensor(x, lib)
if inplace == Inplace.INPLACE_X:
if x_stride != y_stride:
return
y = x
y_tensor = x_tensor
else:
y = torch.zeros(shape, dtype=dtype).to(torch_device)
y = rearrange_if_needed(y, y_stride)
y_tensor = to_tensor(y, lib)
descriptor = infiniopClipDescriptor_t()
y = TestTensor(shape, y_stride, dtype, device)
if y.is_broadcast():
return
print(
f"Testing Clip on {InfiniDeviceNames[device]} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} "
f"min_val:{min_val} max_val:{max_val} dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
)
clip(y.torch_tensor(), x.torch_tensor(), min_val, max_val)
if sync is not None:
sync()
descriptor = infiniopOperatorDescriptor_t()
check_error(
lib.infiniopCreateClipDescriptor(
handle, ctypes.byref(descriptor), y_tensor.descriptor, x_tensor.descriptor
LIBINFINIOP.infiniopCreateClipDescriptor(
handle,
ctypes.byref(descriptor),
y.descriptor,
x.descriptor,
min_.descriptor,
max_.descriptor,
)
)
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetClipWorkspaceSize(
LIBINFINIOP.infiniopGetClipWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = create_workspace(workspace_size.value, x.device)
workspace = TestWorkspace(workspace_size.value, x.device)
def lib_clip():
check_error(
lib.infiniopClip(
LIBINFINIOP.infiniopClip(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace.data() if workspace is not None else None,
workspace_size.value,
y_tensor.data,
x_tensor.data,
c_float(min_val),
c_float(max_val),
y.data(),
x.data(),
min_.data(),
max_.data(),
None,
)
)
lib_clip()
# Now we can destroy the tensor descriptors
x_tensor.destroyDesc(lib)
if inplace != Inplace.INPLACE_X:
y_tensor.destroyDesc(lib)
# Destroy the tensor descriptors
for tensor in [x, y, min_, max_]:
tensor.destroy_desc()
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG or not torch.allclose(y, ans, atol=atol, rtol=rtol):
print("\nExpected:")
print(ans)
print("\nActual:")
print(y)
print("\nDifference:")
print(torch.abs(y - ans))
print("\nMax difference:", torch.max(torch.abs(y - ans)).item())
debug(y, ans, atol=atol, rtol=rtol)
assert torch.allclose(y, ans, atol=atol, rtol=rtol)
if DEBUG:
debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: clip(x, min_val, max_val), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_clip(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation("PyTorch", lambda: clip(y.torch_tensor(), x.torch_tensor(), min_val, max_val), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_clip(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroyClipDescriptor(descriptor))
check_error(LIBINFINIOP.infiniopDestroyClipDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
lib = open_lib()
lib.infiniopCreateClipDescriptor.restype = c_int32
lib.infiniopCreateClipDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopClipDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetClipWorkspaceSize.restype = c_int32
lib.infiniopGetClipWorkspaceSize.argtypes = [
infiniopClipDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopClip.restype = c_int32
lib.infiniopClip.argtypes = [
infiniopClipDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_float,
c_float,
c_void_p,
]
lib.infiniopDestroyClipDescriptor.restype = c_int32
lib.infiniopDestroyClipDescriptor.argtypes = [
infiniopClipDescriptor_t,
]
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
......@@ -241,6 +193,6 @@ if __name__ == "__main__":
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p
import torch
import ctypes
import sys
import os
import time
from ctypes import c_uint64
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from operatorspy.tests.test_utils import get_args
import torch
from enum import Enum, auto
from typing import List, Tuple
import math
import ctypes
from torch.nn import functional as F
from typing import List, Tuple
# constant for control whether profile the pytorch and lib functions
# NOTE: need to manually add synchronization function to the lib function,
......@@ -29,36 +29,104 @@ from typing import List, Tuple
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
_TEST_CASES = [
# x_shape, x_stride, w_shape, w_stride, pads, strides, dilations, x_strides
(
(32, 3, 4),
(12, 4, 1),
(32, 3, 5),
(15, 5, 1),
(1,),
(1,),
(1,),
),
(
(1, 3, 4, 4),
(48, 16, 4, 1),
(2, 3, 3, 3),
(27, 9, 3, 1),
(1, 1),
(1, 2),
(2, 1),
),
(
(32, 3, 32, 32),
(32 * 32 * 3, 32 * 32, 32, 1),
(64, 3, 5, 5),
(75, 25, 5, 1),
(2, 2),
(2, 2),
(1, 1),
),
(
(1, 1, 4, 4, 4),
(64, 64, 16, 4, 1),
(1, 1, 5, 5, 5),
(125, 125, 25, 5, 1),
(1, 1, 1),
(1, 1, 1),
(1, 1, 1),
),
(
(32, 3, 32, 32, 32),
(32 * 32 * 32 * 3, 32 * 32 * 32, 32 * 32, 32, 1),
(64, 3, 5, 5, 5),
(375, 125, 25, 5, 1),
(3, 2, 2),
(4, 3, 3),
(2, 2, 1),
),
]
# Data types used for testing
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]
# Tolerance map for different data types
_TOLERANCE_MAP = {
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5},
InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-2},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
# ctypes stand-in for the C library's conv operator descriptor; treated as an
# opaque handle on the Python side (created via infiniopCreateConvDescriptor).
class ConvDescriptor(Structure):
    # NOTE(review): single field named "device" — presumably the device
    # ordinal/type the descriptor was built for; confirm against the C header.
    _fields_ = [("device", c_int32)]
# Handle type passed across the C ABI: a pointer to ConvDescriptor.
infiniopConvDescriptor_t = POINTER(ConvDescriptor)
def conv(x, w, stride, padding, dilation):
def conv(x, w, stride, padding, dilation, y_tensor, bias=None):
match len(x.shape) - 2:
case 1:
return F.conv1d(x, w, stride=stride, padding=padding, dilation=dilation)
y_tensor.copy_(
F.conv1d(
x, w, bias=bias, stride=stride, padding=padding, dilation=dilation
)
)
case 2:
return F.conv2d(x, w, stride=stride, padding=padding, dilation=dilation)
y_tensor.copy_(
F.conv2d(
x, w, bias=bias, stride=stride, padding=padding, dilation=dilation
)
)
case 3:
return F.conv3d(x, w, stride=stride, padding=padding, dilation=dilation)
y_tensor.copy_(
F.conv3d(
x, w, bias=bias, stride=stride, padding=padding, dilation=dilation
)
)
case _:
print("Error: Pytorch -> Unsupported tensor dimension")
return None
# infer the shape of the output given the inputs for a N-ary convolution
def inferShape(
def inferShapeStride(
x_shape: List[int],
w_shape: List[int],
pads: List[int],
strides: List[int],
dilations: List[int],
) -> Tuple[int, ...]:
) -> Tuple[Tuple[int, ...], Tuple[int, ...]]:
assert (
len(x_shape)
== len(w_shape)
......@@ -74,7 +142,12 @@ def inferShape(
)
for i in range(len(pads))
]
return (x_shape[0], w_shape[0]) + tuple(output_dims)
output_shape = (x_shape[0], w_shape[0]) + tuple(output_dims)
output_strides = [1]
for s in reversed(output_shape[1:]):
output_strides.insert(0, output_strides[0] * s)
output_strides = tuple(output_strides)
return output_shape, output_strides
# convert a python tuple to a ctype void pointer
......@@ -85,52 +158,54 @@ def tuple_to_void_p(py_tuple: Tuple):
def test(
lib,
handle,
torch_device,
device,
x_shape,
x_stride,
w_shape,
w_stride,
pads,
strides,
dilations,
tensor_stride=None,
tensor_dtype=torch.float16,
sync=None
tensor_dtype=InfiniDtype.F16,
sync=None,
):
assert len(pads) == len(strides) == len(dilations)
x = TestTensor(x_shape, x_stride, dt=tensor_dtype, device=device, scale=0.01)
w = TestTensor(w_shape, w_stride, dt=tensor_dtype, device=device, scale=0.01)
y_shape, y_stride = inferShapeStride(x_shape, w_shape, pads, strides, dilations)
y = TestTensor(y_shape, y_stride, dt=tensor_dtype, device=device)
b = (
TestTensor((w.shape[0],), (1,), dt=tensor_dtype, device=device, scale=0.01)
if w.shape[0] > 1
else None
)
print(
f"Testing Conv on {torch_device} with x_shape: {x_shape}, w_shape: {w_shape}, b_shape: {w_shape[0]}, pads: {pads}, strides: {strides}, dilations: {dilations}, x_stride: {tensor_stride} dtype:{tensor_dtype}"
f"Testing Conv on {InfiniDeviceNames[device]} with x_shape: {x_shape}, w_shape: {w_shape}, b_shape: {w_shape[0]}, pads: {pads}, strides: {strides}, dilations: {dilations}, x_stride: {x_stride} dtype:{InfiniDtypeNames[tensor_dtype]}"
)
conv(
x.torch_tensor(),
w.torch_tensor(),
strides,
pads,
dilations,
y.torch_tensor(),
b.torch_tensor() if b is not None else None,
)
x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
w = torch.rand(w_shape, dtype=tensor_dtype).to(torch_device)
y = torch.zeros(
inferShape(x.shape, w.shape, pads, strides, dilations), dtype=tensor_dtype
).to(torch_device)
for i in range(NUM_PRERUN if PROFILE else 1):
ans = conv(x, w, strides, pads, dilations)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
_ = conv(x, w, strides, pads, dilations)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f"pytorch time: {elapsed :6f}")
x_tensor = to_tensor(x, lib)
w_tensor = to_tensor(w, lib)
y_tensor = to_tensor(y, lib)
if sync is not None:
sync()
descriptor = infiniopConvDescriptor_t()
descriptor = infiniopOperatorDescriptor_t()
check_error(
lib.infiniopCreateConvDescriptor(
LIBINFINIOP.infiniopCreateConvDescriptor(
handle,
ctypes.byref(descriptor),
y_tensor.descriptor,
x_tensor.descriptor,
w_tensor.descriptor,
y.descriptor,
x.descriptor,
w.descriptor,
b.descriptor if b is not None else None,
tuple_to_void_p(pads),
tuple_to_void_p(strides),
tuple_to_void_p(dilations),
......@@ -139,169 +214,56 @@ def test(
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
x_tensor.descriptor.contents.invalidate()
w_tensor.descriptor.contents.invalidate()
y_tensor.descriptor.contents.invalidate()
for tensor in [x, y, w, b]:
if tensor is not None:
tensor.destroy_desc()
workspaceSize = ctypes.c_uint64(0)
workspace_size = ctypes.c_uint64(0)
check_error(
lib.infiniopGetConvWorkspaceSize(descriptor, ctypes.byref(workspaceSize))
)
workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(
torch_device
LIBINFINIOP.infiniopGetConvWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
workspace = TestWorkspace(workspace_size.value, y.device)
for i in range(NUM_PRERUN if PROFILE else 1):
def lib_conv():
check_error(
lib.infiniopConv(
LIBINFINIOP.infiniopConv(
descriptor,
workspace_ptr,
workspaceSize,
y_tensor.data,
x_tensor.data,
w_tensor.data,
workspace.data(),
workspace_size.value,
y.data(),
x.data(),
w.data(),
b.data() if b is not None else None,
None,
)
)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
check_error(
lib.infiniopConv(
descriptor,
workspace_ptr,
workspaceSize,
y_tensor.data,
x_tensor.data,
w_tensor.data,
None,
)
)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f" lib time: {elapsed :6f}")
if tensor_dtype == torch.float16:
assert torch.allclose(y, ans, atol=0, rtol=1e-2)
else:
assert torch.allclose(y, ans, atol=0, rtol=1e-3)
check_error(lib.infiniopDestroyConvDescriptor(descriptor))
def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
# fmt: off
test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16)
test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle)
lib_conv()
atol, rtol = get_tolerance(_TOLERANCE_MAP, tensor_dtype)
if DEBUG:
debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
# fmt: off
test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16)
test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle)
def test_bang(lib, test_cases):
import torch_mlu
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
# Profiling workflow
if PROFILE:
# fmt: off
test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16)
test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32)
profile_operation("PyTorch", lambda: conv(x.torch_tensor(), w.torch_tensor(), strides, pads, dilations, b.torch_tensor() if b is not None else None), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_conv(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
destroy_handle(lib, handle)
check_error(LIBINFINIOP.infiniopDestroyConvDescriptor(descriptor))
if __name__ == "__main__":
test_cases = [
# x_shape, w_shape, pads, strides, dilations, x_strides
(
(32, 3, 4),
(32, 3, 5),
(1,),
(1,),
(1,),
None,
),
(
(1, 3, 4, 4),
(2, 3, 3, 3),
(1, 1),
(1, 2),
(2, 1),
None,
),
(
(32, 3, 128, 128),
(64, 3, 5, 5),
(2, 2),
(2, 2),
(1, 1),
None,
),
(
(1, 1, 4, 4, 4),
(1, 1, 5, 5, 5),
(1, 1, 1),
(1, 1, 1),
(1, 1, 1),
None,
),
(
(32, 3, 32, 32, 32),
(64, 3, 5, 5, 5),
(3, 2, 2),
(4, 3, 3),
(2, 2, 1),
None,
),
]
args = get_args()
lib = open_lib()
lib.infiniopCreateConvDescriptor.restype = c_int32
lib.infiniopCreateConvDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopConvDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
c_void_p,
c_void_p,
c_void_p,
c_uint64,
]
lib.infiniopConv.restype = c_int32
lib.infiniopConv.argtypes = [
infiniopConvDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyConvDescriptor.restype = c_int32
lib.infiniopDestroyConvDescriptor.argtypes = [
infiniopConvDescriptor_t,
]
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang):
test_cpu(lib, test_cases)
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_void_p
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
)
from operatorspy.tests.test_utils import get_args
import torch
# constant for control whether profile the pytorch and lib functions
# NOTE: need to manually add synchronization function to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
# ctypes stand-in for the C library's expand operator descriptor; an opaque
# handle created via infiniopCreateExpandDescriptor.
class ExpandDescriptor(Structure):
    # NOTE(review): "device" presumably identifies the target device — field
    # name only; confirm against the C header.
    _fields_ = [("device", c_int32)]
# Handle type passed across the C ABI: a pointer to ExpandDescriptor.
infiniopExpandDescriptor_t = POINTER(ExpandDescriptor)
def expand(x, y):
    """Reference implementation: broadcast ``x`` to ``y``'s shape.

    When profiling, the expanded view is cloned (forcing materialization)
    and the CUDA device is synchronized so wall-clock timing is meaningful.

    Fix: the original called torch.cuda.synchronize() unconditionally under
    PROFILE, which raises on CPU-only builds/machines even though this
    harness also runs on the "cpu" device — guard on CUDA availability.
    """
    if PROFILE:
        ans = x.expand_as(y).clone()
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        return ans
    return x.expand_as(y)
def test(
lib,
handle,
torch_device,
y_shape,
x_shape,
y_stride=None,
x_stride=None,
tensor_dtype=torch.float16,
sync=None
):
print(
f"Testing Expand on {torch_device} with x_shape:{x_shape} y_shape:{y_shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{tensor_dtype}"
)
x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
y = torch.rand(y_shape, dtype=tensor_dtype).to(torch_device)
if x_stride is not None:
x = rearrange_tensor(x, x_stride)
if y_stride is not None:
y = rearrange_tensor(y, y_stride)
for i in range(NUM_PRERUN if PROFILE else 1):
ans = expand(x, y)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
_ = expand(x, y)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f"pytorch time: {elapsed :6f}")
x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib)
if sync is not None:
sync()
descriptor = infiniopExpandDescriptor_t()
check_error(
lib.infiniopCreateExpandDescriptor(
handle,
ctypes.byref(descriptor),
y_tensor.descriptor,
x_tensor.descriptor,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
x_tensor.descriptor.contents.invalidate()
y_tensor.descriptor.contents.invalidate()
for i in range(NUM_PRERUN if PROFILE else 1):
check_error(lib.infiniopExpand(descriptor, y_tensor.data, x_tensor.data, None))
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
check_error(
lib.infiniopExpand(descriptor, y_tensor.data, x_tensor.data, None)
)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f" lib time: {elapsed :6f}")
assert torch.allclose(y, ans, atol=0, rtol=1e-3)
check_error(lib.infiniopDestroyExpandDescriptor(descriptor))
def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for y_shape, x_shape, y_stride, x_stride in test_cases:
# fmt: off
test(lib, handle, "cpu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16)
test(lib, handle, "cpu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for y_shape, x_shape, y_stride, x_stride in test_cases:
# fmt: off
test(lib, handle, "cuda", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16)
test(lib, handle, "cuda", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle)
def test_bang(lib, test_cases):
import torch_mlu
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for y_shape, x_shape, y_stride, x_stride in test_cases:
# fmt: off
test(lib, handle, "mlu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16)
test(lib, handle, "mlu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle)
if __name__ == "__main__":
test_cases = [
# fmt: off
# y_shape, x_shape, y_stride, x_stride
((), (), None, None),
((3, 3), (1,), None, None),
((5, 4, 3), (4, 3,), None, (6, 1)),
((99, 111), (111,), None, None),
((2, 4, 3), (1, 3), None, None),
((2, 20, 3), (2, 1, 3), None, None),
((2, 3, 4, 5), (5,), None, None),
((3, 2, 4, 5), (3, 2, 1, 1), None, None),
((32, 256, 112, 112), (32, 256, 112, 1), None, None),
# fmt: on
]
args = get_args()
lib = open_lib()
lib.infiniopCreateExpandDescriptor.restype = c_int32
lib.infiniopCreateExpandDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopExpandDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopExpand.restype = c_int32
lib.infiniopExpand.argtypes = [
infiniopExpandDescriptor_t,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyExpandDescriptor.restype = c_int32
lib.infiniopDestroyExpandDescriptor.argtypes = [
infiniopExpandDescriptor_t,
]
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang):
test_cpu(lib, test_cases)
print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float
from ctypes import c_uint64
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
rearrange_if_needed,
create_workspace,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
# ==============================================================================
......@@ -31,13 +32,13 @@ _TEST_CASES = [
]
# Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.float32, torch.bfloat16]
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 0, "rtol": 1e-2},
torch.float32: {"atol": 0, "rtol": 1e-3},
torch.bfloat16: {"atol": 0, "rtol": 5e-2},
InfiniDtype.F16: {"atol": 0, "rtol": 1e-2},
InfiniDtype.F32: {"atol": 0, "rtol": 1e-3},
InfiniDtype.BF16: {"atol": 0, "rtol": 5e-2},
}
DEBUG = False
......@@ -46,16 +47,6 @@ NUM_PRERUN = 10
NUM_ITERATIONS = 1000
# ==============================================================================
# Definitions
# ==============================================================================
# ctypes stand-in for the C library's GEMM operator descriptor; an opaque
# handle created via infiniopCreateGemmDescriptor.
class GemmDescriptor(Structure):
    # NOTE(review): "device" presumably identifies the target device — field
    # name only; confirm against the C header.
    _fields_ = [("device", c_int32)]
# Handle type passed across the C ABI: a pointer to GemmDescriptor.
infiniopGemmDescriptor_t = POINTER(GemmDescriptor)
# PyTorch implementation for matrix multiplication
def gemm(d, _c, beta, _a, _b, alpha):
try:
......@@ -73,9 +64,8 @@ def gemm(d, _c, beta, _a, _b, alpha):
# The argument list should be (lib, handle, torch_device, <param list>, dtype)
# The <param list> should keep the same order as the one specified in _TEST_CASES
def test(
lib,
handle,
torch_device,
device,
alpha,
beta,
a_shape,
......@@ -84,65 +74,71 @@ def test(
a_stride=None,
b_stride=None,
c_stride=None,
dtype=torch.float16,
dtype=InfiniDtype.F16,
sync=None,
):
print(
f"Testing Gemm on {torch_device} with alpha:{alpha}, beta:{beta},"
f"Testing Gemm on {InfiniDeviceNames[device]} with alpha:{alpha}, beta:{beta},"
f" a_shape:{a_shape}, b_shape:{b_shape}, c_shape:{c_shape},"
f" a_stride:{a_stride}, b_stride:{b_stride}, c_stride:{c_stride}, dtype:{dtype}"
f" a_stride:{a_stride}, b_stride:{b_stride}, c_stride:{c_stride}, dtype:{InfiniDtypeNames[dtype]}"
)
# Initialize tensors
a = torch.rand(a_shape, dtype=dtype).to(torch_device)
b = torch.rand(b_shape, dtype=dtype).to(torch_device)
c = torch.ones(c_shape, dtype=dtype).to(torch_device)
ans = torch.zeros(c_shape, dtype=dtype).to(torch_device)
a = TestTensor(a_shape, a_stride, dtype, device)
b = TestTensor(b_shape, b_stride, dtype, device)
c = TestTensor(c_shape, c_stride, dtype, device, mode="ones")
ans = TestTensor(c_shape, c_stride, dtype, device, mode="zeros")
# Compute the PyTorch reference result
gemm(ans, c, beta, a, b, alpha)
def torch_gemm():
gemm(
ans.torch_tensor(),
c.torch_tensor(),
beta,
a.torch_tensor(),
b.torch_tensor(),
alpha,
)
a, b, c = [
rearrange_if_needed(tensor, stride)
for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_stride])
]
a_tensor, b_tensor, c_tensor = [to_tensor(tensor, lib) for tensor in [a, b, c]]
torch_gemm()
if sync is not None:
sync()
descriptor = infiniopGemmDescriptor_t()
descriptor = infiniopOperatorDescriptor_t()
check_error(
lib.infiniopCreateGemmDescriptor(
LIBINFINIOP.infiniopCreateGemmDescriptor(
handle,
ctypes.byref(descriptor),
c_tensor.descriptor,
a_tensor.descriptor,
b_tensor.descriptor,
c.descriptor,
a.descriptor,
b.descriptor,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [a_tensor, b_tensor, c_tensor]:
tensor.destroyDesc(lib)
for tensor in [a, b, c]:
tensor.destroy_desc()
# Get workspace size and create workspace
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetGemmWorkspaceSize(descriptor, ctypes.byref(workspace_size))
LIBINFINIOP.infiniopGetGemmWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = create_workspace(workspace_size.value, a.device)
workspace = TestWorkspace(workspace_size.value, device)
# Execute infiniop gemm operator
def lib_gemm():
check_error(
lib.infiniopGemm(
LIBINFINIOP.infiniopGemm(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace.data(),
workspace_size.value,
c_tensor.data,
a_tensor.data,
b_tensor.data,
c.data(),
a.data(),
b.data(),
alpha,
beta,
None,
......@@ -155,17 +151,17 @@ def test(
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(c, ans, atol=atol, rtol=rtol)
debug(c.actual_tensor(), ans.torch_tensor(), atol=atol, rtol=rtol)
assert torch.allclose(c, ans, atol=atol, rtol=rtol)
assert torch.allclose(c.actual_tensor(), ans.torch_tensor(), atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: gemm(ans, c, beta, a, b, alpha), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_gemm(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation("PyTorch", lambda: torch_gemm(), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_gemm(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroyGemmDescriptor(descriptor))
check_error(LIBINFINIOP.infiniopDestroyGemmDescriptor(descriptor))
# ==============================================================================
......@@ -173,40 +169,6 @@ def test(
# ==============================================================================
if __name__ == "__main__":
args = get_args()
lib = open_lib()
lib.infiniopCreateGemmDescriptor.restype = c_int32
lib.infiniopCreateGemmDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopGemmDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetGemmWorkspaceSize.restype = c_int32
lib.infiniopGetGemmWorkspaceSize.argtypes = [
infiniopGemmDescriptor_t,
POINTER(c_size_t),
]
lib.infiniopGemm.restype = c_int32
lib.infiniopGemm.argtypes = [
infiniopGemmDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_float,
c_float,
c_void_p,
]
lib.infiniopDestroyGemmDescriptor.restype = c_int32
lib.infiniopDestroyGemmDescriptor.argtypes = [
infiniopGemmDescriptor_t,
]
# Configure testing options
DEBUG = args.debug
......@@ -216,6 +178,6 @@ if __name__ == "__main__":
# Execute tests
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
)
from operatorspy.tests.test_utils import get_args
import torch, time
# constant for control whether profile the pytorch and lib functions
# NOTE: need to manually add synchronization function to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
# ctypes stand-in for the C library's global-average-pool descriptor; an
# opaque handle created via infiniopCreateGlobalAvgPoolDescriptor.
class GlobalAvgPoolDescriptor(Structure):
    # NOTE(review): "device" presumably identifies the target device — field
    # name only; confirm against the C header.
    _fields_ = [("device", c_int32)]
# Handle type passed across the C ABI: a pointer to GlobalAvgPoolDescriptor.
infiniopGlobalAvgPoolDescriptor_t = POINTER(GlobalAvgPoolDescriptor)
def inferShape(x):
    """Output shape of a global pooling over ``x``: keep (N, C), collapse
    every spatial dimension to 1."""
    n_spatial = x.dim() - 2
    return x.shape[:2] + (1,) * n_spatial
def globalAvgPool(x):
    """Reference global average pooling: mean over all spatial dims of ``x``,
    reshaped to the keepdim output shape given by inferShape.

    Fix: the original called torch.cuda.synchronize() unconditionally under
    PROFILE, which raises on CPU-only builds/machines even though this
    harness also runs on the "cpu" device — guard on CUDA availability.
    """
    y = torch.mean(x, dim=tuple(range(2, x.dim())), keepdim=True)
    if PROFILE and torch.cuda.is_available():
        torch.cuda.synchronize()
    return y.view(*inferShape(x))
def test(
lib,
handle,
torch_device,
x_shape,
tensor_dtype=torch.float16,
sync=None
):
print(
f"Testing GlobalAvgPool on {torch_device} with input tensor_shape: {x_shape} dtype: {tensor_dtype}"
)
x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
y = torch.zeros(inferShape(x), dtype=tensor_dtype).to(torch_device)
for i in range(NUM_PRERUN if PROFILE else 1):
ans = globalAvgPool(x)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
_ = globalAvgPool(x)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f"pytorch time: {elapsed :6f}")
x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib)
if sync is not None:
sync()
descriptor = infiniopGlobalAvgPoolDescriptor_t()
check_error(
lib.infiniopCreateGlobalAvgPoolDescriptor(
handle,
ctypes.byref(descriptor),
y_tensor.descriptor,
x_tensor.descriptor,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
x_tensor.descriptor.contents.invalidate()
y_tensor.descriptor.contents.invalidate()
workspaceSize = ctypes.c_uint64(0)
check_error(
lib.infiniopGetGlobalAvgPoolWorkspaceSize(
descriptor, ctypes.byref(workspaceSize)
)
)
workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(
torch_device
)
workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
for i in range(NUM_PRERUN if PROFILE else 1):
check_error(
lib.infiniopGlobalAvgPool(
descriptor,
workspace_ptr,
workspaceSize,
y_tensor.data,
x_tensor.data,
None,
)
)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
check_error(
lib.infiniopGlobalAvgPool(
descriptor,
workspace_ptr,
workspaceSize,
y_tensor.data,
x_tensor.data,
None,
)
)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f" lib time: {elapsed :6f}")
assert torch.allclose(y, ans, atol=0, rtol=1e-3)
check_error(lib.infiniopDestroyGlobalAvgPoolDescriptor(descriptor))
def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for x_shape in test_cases:
test(lib, handle, "cpu", x_shape, tensor_dtype=torch.float16)
test(lib, handle, "cpu", x_shape, tensor_dtype=torch.float32)
destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for x_shape in test_cases:
test(lib, handle, "cuda", x_shape, tensor_dtype=torch.float16)
test(lib, handle, "cuda", x_shape, tensor_dtype=torch.float32)
destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every test case on the Cambricon MLU backend in float16 and float32."""
    import torch_mlu  # side-effect import: registers the "mlu" torch device
    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for shape in test_cases:
        for dtype in (torch.float16, torch.float32):
            test(lib, handle, "mlu", shape, tensor_dtype=dtype)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    # Shapes to exercise: small tensors, higher-rank tensors, non-power-of-two
    # trailing dimensions, and a few large cases.
    test_cases = [
        # x_shape
        ((1, 3, 3)),
        ((1, 3, 1, 1, 3)),
        ((1, 3, 1, 1, 257)),
        ((1, 2, 1, 1, 514)),
        ((1, 3, 1, 1, 1025)),
        ((32, 256, 1, 112, 112)),
        ((2, 3, 2048000)),
        ((2, 1, 10243)),
        ((2, 20, 100)),
        ((3, 33, 333)),
        ((32, 20, 512)),
        ((3, 3, 11, 11, 11, 3, 2)),
        # NOTE(review): duplicate of the earlier (32, 256, 1, 112, 112) case —
        # confirm whether the repetition is intentional.
        ((32, 256, 1, 112, 112)),
        ((32, 256, 112, 112)),
    ]
    args = get_args()
    lib = open_lib()
    # Declare the ctypes signatures for the GlobalAvgPool C API before use;
    # every entry point returns a 32-bit status code.
    lib.infiniopCreateGlobalAvgPoolDescriptor.restype = c_int32
    lib.infiniopCreateGlobalAvgPoolDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopGlobalAvgPoolDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopGetGlobalAvgPoolWorkspaceSize.restype = c_int32
    lib.infiniopGetGlobalAvgPoolWorkspaceSize.argtypes = [
        infiniopGlobalAvgPoolDescriptor_t,
        POINTER(c_uint64),
    ]
    lib.infiniopGlobalAvgPool.restype = c_int32
    lib.infiniopGlobalAvgPool.argtypes = [
        infiniopGlobalAvgPoolDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyGlobalAvgPoolDescriptor.restype = c_int32
    lib.infiniopDestroyGlobalAvgPoolDescriptor.argtypes = [
        infiniopGlobalAvgPoolDescriptor_t,
    ]
    # Run on each backend selected via command-line flags; default to CPU
    # when no backend flag is given.
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    if not (args.cpu or args.cuda or args.bang):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
......@@ -4,10 +4,11 @@ import sys
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".")))
from .liboperators import (
open_lib,
CTensor,
infiniopHandle_t,
infiniopTensorDescriptor_t,
LIBINFINIOP,
)
from .devices import *
from .utils import *
from .datatypes import *
from .structs import *
......@@ -19,3 +19,27 @@ class InfiniDtype:
C32 = 17
C64 = 18
BF16 = 19
# Human-readable name for each InfiniDtype code, keyed by the enum value.
# The attribute name and its display name are identical, so build the map
# from a single list of names.
InfiniDtypeNames = {
    getattr(InfiniDtype, _dtype_name): _dtype_name
    for _dtype_name in (
        "INVALID",
        "BYTE",
        "BOOL",
        "I8",
        "I16",
        "I32",
        "I64",
        "U8",
        "U16",
        "U32",
        "U64",
        "F8",
        "F16",
        "F32",
        "F64",
        "C8",
        "C16",
        "C32",
        "C64",
        "BF16",
    )
}
......@@ -10,8 +10,20 @@ class InfiniDeviceEnum:
SUGON = 8
# Display name for each InfiniDeviceEnum value (note: display names are not
# always the uppercase attribute name, e.g. CAMBRICON -> "Cambricon").
InfiniDeviceNames = dict(
    (
        (InfiniDeviceEnum.CPU, "CPU"),
        (InfiniDeviceEnum.NVIDIA, "NVIDIA"),
        (InfiniDeviceEnum.CAMBRICON, "Cambricon"),
        (InfiniDeviceEnum.ASCEND, "Ascend"),
        (InfiniDeviceEnum.METAX, "Metax"),
        (InfiniDeviceEnum.MOORE, "Moore"),
        (InfiniDeviceEnum.ILUVATAR, "Iluvatar"),
        (InfiniDeviceEnum.KUNLUN, "Kunlun"),
        (InfiniDeviceEnum.SUGON, "Sugon"),
    )
)
# Mapping that maps InfiniDeviceEnum to torch device string
infiniDeviceEnum_str_map = {
torch_device_map = {
InfiniDeviceEnum.CPU: "cpu",
InfiniDeviceEnum.NVIDIA: "cuda",
InfiniDeviceEnum.CAMBRICON: "mlu",
......
import os
import platform
import ctypes
from ctypes import c_int, c_int64, c_uint64, Structure, POINTER
from ctypes import c_int, c_int64, c_uint64, POINTER
from .datatypes import *
from .devices import *
from .op_register import OpRegister
from pathlib import Path
Device = c_int
Optype = c_int
from .structs import *
INFINI_ROOT = os.getenv("INFINI_ROOT") or str(Path.home() / ".infini")
class TensorDescriptor(Structure):
    """Opaque ctypes stand-in for the C-side tensor descriptor struct."""
    # No fields declared: the descriptor is only ever handled through a pointer.
    _fields_ = []
infiniopTensorDescriptor_t = ctypes.POINTER(TensorDescriptor)
class CTensor:
    """Pairs an infiniop tensor descriptor with the torch tensor backing it."""
    def __init__(self, desc, torch_tensor):
        self.descriptor = desc
        # Keep a reference to the torch tensor so its storage outlives this wrapper.
        self.torch_tensor_ = torch_tensor
        self.data = torch_tensor.data_ptr()
    def destroyDesc(self, lib_):
        """Destroy the descriptor through the given library and clear the handle."""
        lib_.infiniopDestroyTensorDescriptor(self.descriptor)
        self.descriptor = None
class Handle(Structure):
    """ctypes mirror of the C infiniop handle header (device kind + device id)."""
    _fields_ = [("device", c_int), ("device_id", c_int)]
infiniopHandle_t = POINTER(Handle)
class InfiniLib:
def __init__(self, librt, libop):
self.librt = librt
......@@ -98,4 +72,9 @@ def open_lib():
lib.infinirtSetDevice.argtypes = [c_int, c_int]
lib.infinirtSetDevice.restype = c_int
OpRegister.register_lib(lib)
return lib
LIBINFINIOP = open_lib()
from .structs import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
infiniopOperatorDescriptor_t,
)
from ctypes import c_int32, c_void_p, c_size_t, POINTER, c_float
class OpRegister:
    """Collects operator ctypes-binding functions and applies them to a library.

    Binding functions are gathered at import time via the ``operator``
    decorator; ``register_lib`` later runs each of them against a loaded
    ctypes library handle.
    """

    registry = []

    @classmethod
    def operator(cls, op):
        """Decorator: remember *op* for later registration and return it unchanged."""
        cls.registry.append(op)
        return op

    @classmethod
    def register_lib(cls, lib):
        """Run every collected binding function against *lib*."""
        for register in cls.registry:
            register(lib)
@OpRegister.operator
def add_(lib):
    """Declare ctypes signatures for the Add operator's C entry points."""
    tdesc = infiniopTensorDescriptor_t
    odesc = infiniopOperatorDescriptor_t
    # (symbol name, argument types); every entry point returns a 32-bit status.
    bindings = (
        (
            "infiniopCreateAddDescriptor",
            [infiniopHandle_t, POINTER(odesc), tdesc, tdesc, tdesc],
        ),
        ("infiniopGetAddWorkspaceSize", [odesc, POINTER(c_size_t)]),
        (
            "infiniopAdd",
            [odesc, c_void_p, c_size_t, c_void_p, c_void_p, c_void_p, c_void_p],
        ),
        ("infiniopDestroyAddDescriptor", [odesc]),
    )
    for symbol, argtypes in bindings:
        fn = getattr(lib, symbol)
        fn.restype = c_int32
        fn.argtypes = argtypes
@OpRegister.operator
def attention_(lib):
    """Declare ctypes signatures for the Attention operator's C entry points."""
    lib.infiniopCreateAttentionDescriptor.restype = c_int32
    lib.infiniopCreateAttentionDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        c_size_t,
    ]
    lib.infiniopGetAttentionWorkspaceSize.restype = c_int32
    lib.infiniopGetAttentionWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopAttention.restype = c_int32
    lib.infiniopAttention.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,
        c_size_t,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyAttentionDescriptor.restype = c_int32
    lib.infiniopDestroyAttentionDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
@OpRegister.operator
def causal_softmax_(lib):
    """Declare ctypes signatures for the CausalSoftmax operator's C entry points."""
    lib.infiniopCreateCausalSoftmaxDescriptor.restype = c_int32
    lib.infiniopCreateCausalSoftmaxDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopGetCausalSoftmaxWorkspaceSize.restype = c_int32
    lib.infiniopGetCausalSoftmaxWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopCausalSoftmax.restype = c_int32
    lib.infiniopCausalSoftmax.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,
        c_size_t,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyCausalSoftmaxDescriptor.restype = c_int32
    lib.infiniopDestroyCausalSoftmaxDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
@OpRegister.operator
def clip_(lib):
    """Declare ctypes signatures for the Clip operator's C entry points."""
    lib.infiniopCreateClipDescriptor.restype = c_int32
    lib.infiniopCreateClipDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopGetClipWorkspaceSize.restype = c_int32
    lib.infiniopGetClipWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopClip.restype = c_int32
    lib.infiniopClip.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,
        c_size_t,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyClipDescriptor.restype = c_int32
    lib.infiniopDestroyClipDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
@OpRegister.operator
def conv_(lib):
    # NOTE(review): dead stub — a second, full `conv_` is defined later in this
    # module; it shadows this name and is also registered, so this placeholder
    # only adds a no-op call during register_lib(). Consider removing it.
    pass
@OpRegister.operator
def gemm_(lib):
    """Declare ctypes signatures for the Gemm operator's C entry points."""
    lib.infiniopCreateGemmDescriptor.restype = c_int32
    lib.infiniopCreateGemmDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopGetGemmWorkspaceSize.restype = c_int32
    lib.infiniopGetGemmWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopGemm.restype = c_int32
    lib.infiniopGemm.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,
        c_size_t,
        c_void_p,
        c_void_p,
        c_void_p,
        c_float,  # presumably alpha — confirm against the C header
        c_float,  # presumably beta — confirm against the C header
        c_void_p,
    ]
    lib.infiniopDestroyGemmDescriptor.restype = c_int32
    lib.infiniopDestroyGemmDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
@OpRegister.operator
def mul_(lib):
    """Declare ctypes signatures for the Mul operator's C entry points."""
    tdesc = infiniopTensorDescriptor_t
    odesc = infiniopOperatorDescriptor_t
    # (symbol name, argument types); every entry point returns a 32-bit status.
    bindings = (
        (
            "infiniopCreateMulDescriptor",
            [infiniopHandle_t, POINTER(odesc), tdesc, tdesc, tdesc],
        ),
        ("infiniopGetMulWorkspaceSize", [odesc, POINTER(c_size_t)]),
        (
            "infiniopMul",
            [odesc, c_void_p, c_size_t, c_void_p, c_void_p, c_void_p, c_void_p],
        ),
        ("infiniopDestroyMulDescriptor", [odesc]),
    )
    for symbol, argtypes in bindings:
        fn = getattr(lib, symbol)
        fn.restype = c_int32
        fn.argtypes = argtypes
@OpRegister.operator
def random_sample_(lib):
    """Declare ctypes signatures for the RandomSample operator's C entry points."""
    lib.infiniopCreateRandomSampleDescriptor.restype = c_int32
    lib.infiniopCreateRandomSampleDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopGetRandomSampleWorkspaceSize.restype = c_int32
    lib.infiniopGetRandomSampleWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopRandomSample.restype = c_int32
    lib.infiniopRandomSample.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,
        c_size_t,
        c_size_t,
        c_void_p,
        c_float,  # sampling parameters — names not visible here; confirm
        c_float,  # against the C header (likely temperature/top-p style args)
        c_int32,
        c_float,
        c_void_p,
    ]
    lib.infiniopDestroyRandomSampleDescriptor.restype = c_int32
    lib.infiniopDestroyRandomSampleDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
@OpRegister.operator
def rearrange_(lib):
    """Declare ctypes signatures for the Rearrange operator's C entry points.

    Note: unlike most operators here, Rearrange exposes no workspace-size query.
    """
    lib.infiniopCreateRearrangeDescriptor.restype = c_int32
    lib.infiniopCreateRearrangeDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopRearrange.restype = c_int32
    lib.infiniopRearrange.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyRearrangeDescriptor.restype = c_int32
    lib.infiniopDestroyRearrangeDescriptor.argtypes = [infiniopOperatorDescriptor_t]
@OpRegister.operator
def relu_(lib):
    """Declare ctypes signatures for the Relu operator's C entry points."""
    lib.infiniopCreateReluDescriptor.restype = c_int32
    lib.infiniopCreateReluDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopRelu.restype = c_int32
    lib.infiniopRelu.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,
        c_size_t,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyReluDescriptor.restype = c_int32
    lib.infiniopDestroyReluDescriptor.argtypes = [infiniopOperatorDescriptor_t]
@OpRegister.operator
def rms_norm_(lib):
    """Declare ctypes signatures for the RMSNorm operator's C entry points."""
    lib.infiniopCreateRMSNormDescriptor.restype = c_int32
    lib.infiniopCreateRMSNormDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        c_float,  # presumably epsilon — confirm against the C header
    ]
    lib.infiniopGetRMSNormWorkspaceSize.restype = c_int32
    lib.infiniopGetRMSNormWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopRMSNorm.restype = c_int32
    lib.infiniopRMSNorm.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,
        c_size_t,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyRMSNormDescriptor.restype = c_int32
    lib.infiniopDestroyRMSNormDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
@OpRegister.operator
def rope_(lib):
    """Declare ctypes signatures for the RoPE operator's C entry points."""
    lib.infiniopCreateRoPEDescriptor.restype = c_int32
    lib.infiniopCreateRoPEDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopGetRoPEWorkspaceSize.restype = c_int32
    lib.infiniopGetRoPEWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopRoPE.restype = c_int32
    lib.infiniopRoPE.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,
        c_size_t,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyRoPEDescriptor.restype = c_int32
    lib.infiniopDestroyRoPEDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
@OpRegister.operator
def sub_(lib):
    """Declare ctypes signatures for the Sub operator's C entry points."""
    tdesc = infiniopTensorDescriptor_t
    odesc = infiniopOperatorDescriptor_t
    # (symbol name, argument types); every entry point returns a 32-bit status.
    bindings = (
        (
            "infiniopCreateSubDescriptor",
            [infiniopHandle_t, POINTER(odesc), tdesc, tdesc, tdesc],
        ),
        ("infiniopGetSubWorkspaceSize", [odesc, POINTER(c_size_t)]),
        (
            "infiniopSub",
            [odesc, c_void_p, c_size_t, c_void_p, c_void_p, c_void_p, c_void_p],
        ),
        ("infiniopDestroySubDescriptor", [odesc]),
    )
    for symbol, argtypes in bindings:
        fn = getattr(lib, symbol)
        fn.restype = c_int32
        fn.argtypes = argtypes
@OpRegister.operator
def swiglu_(lib):
    """Declare ctypes signatures for the SwiGLU operator's C entry points."""
    lib.infiniopCreateSwiGLUDescriptor.restype = c_int32
    lib.infiniopCreateSwiGLUDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopGetSwiGLUWorkspaceSize.restype = c_int32
    lib.infiniopGetSwiGLUWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopSwiGLU.restype = c_int32
    lib.infiniopSwiGLU.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,
        c_size_t,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroySwiGLUDescriptor.restype = c_int32
    lib.infiniopDestroySwiGLUDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
@OpRegister.operator
def conv_(lib):
    """Declare ctypes signatures for the Conv operator's C entry points.

    NOTE(review): this is the second `conv_` in the module; an earlier no-op
    stub with the same name is also registered. This full version wins the
    name binding, but both run during register_lib().
    """
    lib.infiniopCreateConvDescriptor.restype = c_int32
    lib.infiniopCreateConvDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        c_void_p,  # presumably pads/strides/dilations arrays — confirm against the C header
        c_void_p,
        c_void_p,
        c_size_t,
    ]
    lib.infiniopGetConvWorkspaceSize.restype = c_int32
    lib.infiniopGetConvWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopConv.restype = c_int32
    lib.infiniopConv.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,
        c_size_t,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyConvDescriptor.restype = c_int32
    lib.infiniopDestroyConvDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
from ctypes import c_int, Structure, POINTER
class TensorDescriptor(Structure):
    """Opaque ctypes stand-in for the C tensor descriptor; used only via pointer."""
    _fields_ = []
infiniopTensorDescriptor_t = POINTER(TensorDescriptor)
class Handle(Structure):
    """ctypes mirror of the C infiniop handle header (device kind + device id)."""
    _fields_ = [("device", c_int), ("device_id", c_int)]
infiniopHandle_t = POINTER(Handle)
class OpDescriptor(Structure):
    """ctypes mirror of the generic operator descriptor header (device kind + device id)."""
    _fields_ = [("device", c_int), ("device_id", c_int)]
infiniopOperatorDescriptor_t = POINTER(OpDescriptor)
from typing import Sequence
import torch
import ctypes
from .datatypes import *
from .devices import *
from typing import Sequence
from .liboperators import infiniopTensorDescriptor_t, CTensor, infiniopHandle_t
from .liboperators import infiniopTensorDescriptor_t, LIBINFINIOP, infiniopHandle_t
def check_error(status):
......@@ -11,71 +11,173 @@ def check_error(status):
raise Exception("Error code " + str(status))
def to_tensor(tensor, lib, force_unsigned=False):
"""
Convert a PyTorch tensor to a library Tensor(descriptor, data).
"""
import torch
class CTensor:
def __init__(self, dt: InfiniDtype, shape, strides):
self.descriptor = infiniopTensorDescriptor_t()
self.dt = dt
self.ndim = len(shape)
if strides is None:
strides = [1 for _ in shape]
for i in range(self.ndim - 2, -1, -1):
strides[i] = strides[i + 1] * shape[i + 1]
assert self.ndim == len(strides)
self.c_shape = (ctypes.c_size_t * self.ndim)(*shape)
self.c_strides = (ctypes.c_ssize_t * self.ndim)(*strides)
LIBINFINIOP.infiniopCreateTensorDescriptor(
ctypes.byref(self.descriptor),
self.ndim,
self.c_shape,
self.c_strides,
self.dt,
)
ndim = tensor.ndimension()
shape = (ctypes.c_size_t * ndim)(*tensor.shape)
strides = (ctypes.c_int64 * ndim)(*(tensor.stride()))
# fmt: off
dt = (
InfiniDtype.I8 if tensor.dtype == torch.int8 else
InfiniDtype.I16 if tensor.dtype == torch.int16 else
InfiniDtype.I32 if tensor.dtype == torch.int32 else
InfiniDtype.I64 if tensor.dtype == torch.int64 else
InfiniDtype.U8 if tensor.dtype == torch.uint8 else
InfiniDtype.F16 if tensor.dtype == torch.float16 else
InfiniDtype.BF16 if tensor.dtype == torch.bfloat16 else
InfiniDtype.F32 if tensor.dtype == torch.float32 else
InfiniDtype.F64 if tensor.dtype == torch.float64 else
# TODO: These following types may not be supported by older
# versions of PyTorch.
InfiniDtype.U16 if tensor.dtype == torch.uint16 else
InfiniDtype.U32 if tensor.dtype == torch.uint32 else
InfiniDtype.U64 if tensor.dtype == torch.uint64 else
None
)
if force_unsigned:
dt = (
InfiniDtype.U8 if dt == InfiniDtype.I8 else
InfiniDtype.U16 if dt == InfiniDtype.I16 else
InfiniDtype.U32 if dt == InfiniDtype.I32 else
InfiniDtype.U64 if dt == InfiniDtype.I64 else
dt
def destroy_desc(self):
if self.descriptor is not None:
LIBINFINIOP.infiniopDestroyTensorDescriptor(self.descriptor)
self.descriptor = None
class TestTensor(CTensor):
    """Test tensor holding a torch reference tensor, a (possibly re-strided)
    data tensor, and the matching infiniop descriptor built by CTensor.__init__.
    """
    def __init__(
        self,
        shape,
        strides,
        dt: InfiniDtype,
        device: InfiniDeviceEnum,
        mode="random",
        scale=None,
        bias=None,
        set_tensor=None,
    ):
        """Create backing torch data and the infiniop descriptor.

        - shape/strides: logical shape and optional element strides; a stride
          of 0 marks a broadcast dimension.
        - mode: "random" | "zeros" | "ones" | "manual" ("manual" requires
          set_tensor with matching shape and strides).
        - scale/bias: optional affine transform applied to the generated data.
        NOTE(review): "random" uses torch.rand, which supports only floating
        dtypes — integer dts presumably need another mode; confirm.
        """
        self.dt = dt
        self.device = device
        self.shape = shape
        self.strides = strides
        # Collapse broadcast (stride-0) dims to extent 1 so torch stores no
        # redundant data; other dims keep their shape/stride as given.
        torch_shape = []
        torch_strides = [] if strides is not None else None
        for i in range(len(shape)):
            if strides is not None and strides[i] == 0:
                torch_shape.append(1)
                torch_strides.append(1)
            elif strides is not None and strides[i] != 0:
                torch_shape.append(shape[i])
                torch_strides.append(strides[i])
            else:
                torch_shape.append(shape[i])
        if mode == "random":
            self._torch_tensor = torch.rand(
                torch_shape, dtype=to_torch_dtype(dt), device=torch_device_map[device]
            )
        elif mode == "zeros":
            self._torch_tensor = torch.zeros(
                torch_shape, dtype=to_torch_dtype(dt), device=torch_device_map[device]
            )
        elif mode == "ones":
            self._torch_tensor = torch.ones(
                torch_shape, dtype=to_torch_dtype(dt), device=torch_device_map[device]
            )
        elif mode == "manual":
            # Caller-supplied tensor must already match the collapsed
            # shape/strides computed above.
            assert set_tensor is not None
            assert torch_shape == list(set_tensor.shape)
            assert torch_strides == list(set_tensor.stride())
            self._torch_tensor = set_tensor.to(to_torch_dtype(dt)).to(
                torch_device_map[device]
            )
        else:
            raise ValueError("Unsupported mode")
        if scale is not None:
            self._torch_tensor *= scale
        if bias is not None:
            self._torch_tensor += bias
        # The "actual" tensor is what the operator sees: re-laid-out to the
        # requested strides, or a contiguous clone when strides is None.
        if strides is not None:
            self._data_tensor = rearrange_tensor(self._torch_tensor, torch_strides)
        else:
            self._data_tensor = self._torch_tensor.clone()
        # Builds the infiniop tensor descriptor from the logical shape/strides.
        super().__init__(self.dt, shape, strides)
    def torch_tensor(self):
        """The reference torch tensor (pre-rearrangement)."""
        return self._torch_tensor
    def actual_tensor(self):
        """The torch tensor laid out as the operator under test sees it."""
        return self._data_tensor
    def data(self):
        """Raw device pointer to the actual tensor's storage."""
        return self._data_tensor.data_ptr()
    def is_broadcast(self):
        """True when any dimension uses a zero stride (broadcast)."""
        return self.strides is not None and 0 in self.strides
    @staticmethod
    def from_torch(torch_tensor, dt: InfiniDtype, device: InfiniDeviceEnum):
        """Wrap an existing torch tensor, preserving its shape and strides."""
        shape_ = list(torch_tensor.shape)
        strides_ = list(torch_tensor.stride())
        return TestTensor(
            shape_, strides_, dt, device, mode="manual", set_tensor=torch_tensor
        )
# fmt: on
assert dt is not None
# Create TensorDecriptor
tensor_desc = infiniopTensorDescriptor_t()
lib.infiniopCreateTensorDescriptor(
ctypes.byref(tensor_desc), ndim, shape, strides, dt
)
# Create Tensor
return CTensor(tensor_desc, tensor)
def to_torch_dtype(dt: InfiniDtype, compatability_mode=False):
    """Map an InfiniDtype code to the corresponding torch dtype.

    U16/U32/U64 may be missing on older PyTorch builds; with
    compatability_mode=True they fall back to the signed dtype of the
    same width. Raises ValueError for codes with no torch equivalent.
    """
    direct = {
        InfiniDtype.I8: torch.int8,
        InfiniDtype.I16: torch.int16,
        InfiniDtype.I32: torch.int32,
        InfiniDtype.I64: torch.int64,
        InfiniDtype.U8: torch.uint8,
        InfiniDtype.F16: torch.float16,
        InfiniDtype.BF16: torch.bfloat16,
        InfiniDtype.F32: torch.float32,
        InfiniDtype.F64: torch.float64,
    }
    if dt in direct:
        return direct[dt]
    # Keep the unsigned lookups lazy: torch.uint16/32/64 may not exist on
    # older PyTorch, and must only be touched when actually requested.
    if dt == InfiniDtype.U16:
        return torch.int16 if compatability_mode else torch.uint16
    if dt == InfiniDtype.U32:
        return torch.int32 if compatability_mode else torch.uint32
    if dt == InfiniDtype.U64:
        return torch.int64 if compatability_mode else torch.uint64
    raise ValueError("Unsupported data type")
def create_workspace(size, torch_device):
    """Allocate a zero-filled uint8 workspace of *size* bytes on *torch_device*.

    Returns None when no workspace is needed (size == 0).
    """
    print(f" - Workspace Size : {size}")
    workspace = None
    if size != 0:
        import torch

        workspace = torch.zeros(size=(size,), dtype=torch.uint8, device=torch_device)
    return workspace
class TestWorkspace:
    """Operator workspace buffer for tests; empty when the requested size is 0."""

    def __init__(self, size, device):
        self._size = size
        # A zero-byte workspace needs no backing tensor at all.
        self.tensor = (
            TestTensor((size,), None, InfiniDtype.U8, device, mode="ones")
            if size != 0
            else None
        )

    def data(self):
        """Device pointer to the buffer, or None when the workspace is empty."""
        return None if self.tensor is None else self.tensor.data()

    def size(self):
        """Workspace byte count wrapped as ctypes.c_uint64 for the C API."""
        return ctypes.c_uint64(self._size)
def create_handle():
    """Create an infiniop handle through the globally loaded library.

    Returns:
        infiniopHandle_t: the freshly created handle.
    Raises (via check_error) when the library reports a non-zero status.
    """
    # Fix: an unresolved merge left both the old `def create_handle(lib):`
    # header and the new zero-argument header stacked here, which is invalid
    # Python; keep only the new LIBINFINIOP-based version.
    handle = infiniopHandle_t()
    check_error(LIBINFINIOP.infiniopCreateHandle(ctypes.byref(handle)))
    return handle
def destroy_handle(handle):
    """Destroy a handle previously returned by create_handle().

    Fix: an unresolved merge left both the old `(lib, handle)` header and the
    new single-argument header stacked here; keep only the new
    LIBINFINIOP-based version.
    """
    check_error(LIBINFINIOP.infiniopDestroyHandle(handle))
def rearrange_tensor(tensor, new_strides):
......@@ -124,13 +226,6 @@ def rearrange_tensor(tensor, new_strides):
return new_tensor
def rearrange_if_needed(tensor, stride):
    """Return *tensor* re-laid-out to *stride*, or unchanged when stride is None."""
    if stride is None:
        return tensor
    return rearrange_tensor(tensor, stride)
def get_args():
import argparse
......@@ -167,6 +262,11 @@ def get_args():
action="store_true",
help="Run NVIDIA GPU test",
)
parser.add_argument(
"--iluvatar",
action="store_true",
help="Run Iluvatar GPU test",
)
parser.add_argument(
"--cambricon",
action="store_true",
......@@ -224,6 +324,7 @@ def debug(actual, desired, atol=0, rtol=1e-2, equal_nan=False, verbose=True):
If True, the function will print detailed information about any discrepancies between the tensors.
"""
import numpy as np
# 如果是BF16,全部转成FP32再比对
if actual.dtype == torch.bfloat16 or desired.dtype == torch.bfloat16:
actual = actual.to(torch.float32)
......@@ -308,7 +409,9 @@ def debug_all(
assert passed, "\033[31mThe condition has not been satisfied\033[0m"
def print_discrepancy(actual, expected, atol=0, rtol=1e-3, equal_nan=True, verbose=True):
def print_discrepancy(
actual, expected, atol=0, rtol=1e-3, equal_nan=True, verbose=True
):
if actual.shape != expected.shape:
raise ValueError("Tensors must have the same shape to compare.")
......@@ -321,8 +424,12 @@ def print_discrepancy(actual, expected, atol=0, rtol=1e-3, equal_nan=True, verbo
expected_isnan = torch.isnan(expected)
# Calculate the difference mask based on atol and rtol
nan_mismatch = actual_isnan ^ expected_isnan if equal_nan else actual_isnan | expected_isnan
diff_mask = nan_mismatch | (torch.abs(actual - expected) > (atol + rtol * torch.abs(expected)))
nan_mismatch = (
actual_isnan ^ expected_isnan if equal_nan else actual_isnan | expected_isnan
)
diff_mask = nan_mismatch | (
torch.abs(actual - expected) > (atol + rtol * torch.abs(expected))
)
diff_indices = torch.nonzero(diff_mask, as_tuple=False)
delta = actual - expected
......@@ -419,35 +526,33 @@ def profile_operation(desc, func, torch_device, NUM_PRERUN, NUM_ITERATIONS):
print(f" {desc} time: {elapsed * 1000 :6f} ms")
def test_operator(device, test_func, test_cases, tensor_dtypes):
    """
    Testing a specified operator on the given device with the given test function, test cases, and tensor data types.

    Fix: an unresolved merge left the old (lib-taking) and new (LIBINFINIOP)
    versions interleaved — duplicate def headers, duplicate setDevice /
    create_handle / destroy_handle lines, and a stale `lib` docstring entry.
    This is the reconstructed new version.

    Arguments:
    ----------
    - device (InfiniDeviceEnum): The device on which the operator should be tested. See device.py.
    - test_func (function): The test function to be executed for each test case.
    - test_cases (list of tuples): A list of test cases, where each test case is a tuple of parameters
      to be passed to `test_func`.
    - tensor_dtypes (list): A list of tensor data types (e.g., `torch.float32`) to test.
    """
    LIBINFINIOP.infinirtSetDevice(device, ctypes.c_int(0))
    handle = create_handle()
    # Drop dtypes the target device cannot run.
    tensor_dtypes = filter_tensor_dtypes_by_device(device, tensor_dtypes)
    try:
        for test_case in test_cases:
            for tensor_dtype in tensor_dtypes:
                test_func(
                    handle,
                    device,
                    *test_case,
                    tensor_dtype,
                    get_sync_func(device),
                )
    finally:
        # Always release the handle, even when a test raises.
        destroy_handle(handle)
def get_test_devices(args):
......@@ -466,6 +571,8 @@ def get_test_devices(args):
devices_to_test.append(InfiniDeviceEnum.CPU)
if args.nvidia:
devices_to_test.append(InfiniDeviceEnum.NVIDIA)
if args.iluvatar:
devices_to_test.append(InfiniDeviceEnum.ILUVATAR)
if args.cambricon:
import torch_mlu
......@@ -498,7 +605,7 @@ def get_test_devices(args):
def get_sync_func(device):
import torch
device_str = infiniDeviceEnum_str_map[device]
device_str = torch_device_map[device]
if device == InfiniDeviceEnum.CPU:
sync = None
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment